From 38fce83f2d5f7211c4460b96e870561cab4f69ed Mon Sep 17 00:00:00 2001
From: 李巨丰 <920347627@qq.com>
Date: Thu, 5 Nov 2020 17:56:13 +0800
Subject: [PATCH 6/8] Change the locks in the hadoop 2.7 code to synchronized
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../scheduler/capacity/CapacityScheduler.java | 5593 ++++++++---------
 .../scheduler/capacity/LeafQueue.java         |  224 +-
 2 files changed, 2749 insertions(+), 3068 deletions(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
index 54b301a..8aeb203 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -142,7 +142,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event - .QueueManagementChangeEvent; + .QueueManagementChangeEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ReleaseContainerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType; @@ -169,2930 +169,2765 @@ @Evolving @SuppressWarnings("unchecked") public class CapacityScheduler extends - AbstractYarnScheduler implements - PreemptableResourceScheduler, CapacitySchedulerContext, Configurable, - ResourceAllocationCommitter, MutableConfScheduler { - - private static final Log LOG = LogFactory.getLog(CapacityScheduler.class); - - private CapacitySchedulerQueueManager queueManager; - - // timeout to join when we stop this service - protected final long THREAD_JOIN_TIMEOUT_MS = 1000; - - private PreemptionManager preemptionManager = new PreemptionManager(); - - private volatile boolean isLazyPreemptionEnabled = false; - - private int offswitchPerHeartbeatLimit; - - private boolean assignMultipleEnabled; - - private int maxAssignPerHeartbeat; - - private CSConfigurationProvider csConfProvider; - - @Override - public void setConf(Configuration conf) { - yarnConf = conf; - } - - private void validateConf(Configuration conf) { - // validate scheduler memory allocation setting - int minMem = conf.getInt( - YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, - YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); - int maxMem = conf.getInt( - YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB, - YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB); - - if (minMem <= 0 || minMem > maxMem) { - throw new YarnRuntimeException("Invalid resource scheduler memory" - + " allocation configuration" - + ", " + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB - + "=" + minMem - + ", " + YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB - + "=" + maxMem + ", min and max should be greater than 0" - + ", max should be no smaller than min."); - } - - // validate scheduler vcores allocation setting - int minVcores = conf.getInt( - YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, - YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES); - int maxVcores = conf.getInt( - YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES, - YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES); - - if (minVcores <= 0 || minVcores > maxVcores) { - throw new YarnRuntimeException("Invalid resource scheduler vcores" - + " allocation configuration" - + ", " + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES - + "=" + minVcores - + ", " + YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES - + "=" + maxVcores + ", min and max should be greater than 0" - + ", max should be no smaller than min."); - } - } - - @Override - public Configuration getConf() { - return yarnConf; - } - - private CapacitySchedulerConfiguration conf; - private Configuration yarnConf; - - private ResourceCalculator calculator; - private boolean usePortForNodeName; - - private boolean scheduleAsynchronously; - private List asyncSchedulerThreads; - private 
ResourceCommitterService resourceCommitterService; - private RMNodeLabelsManager labelManager; - private AppPriorityACLsManager appPriorityACLManager; - - private static boolean printedVerboseLoggingForAsyncScheduling = false; - - /** - * EXPERT - */ - private long asyncScheduleInterval; - private static final String ASYNC_SCHEDULER_INTERVAL = - CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_PREFIX - + ".scheduling-interval-ms"; - private static final long DEFAULT_ASYNC_SCHEDULER_INTERVAL = 5; - private long asyncMaxPendingBacklogs; - - public CapacityScheduler() { - super(CapacityScheduler.class.getName()); - } - - @Override - public QueueMetrics getRootQueueMetrics() { - return getRootQueue().getMetrics(); - } - - public CSQueue getRootQueue() { - return queueManager.getRootQueue(); - } - - @Override - public CapacitySchedulerConfiguration getConfiguration() { - return conf; - } - - @Override - public RMContainerTokenSecretManager getContainerTokenSecretManager() { - return this.rmContext.getContainerTokenSecretManager(); - } - - @Override - public ResourceCalculator getResourceCalculator() { - return calculator; - } - - @VisibleForTesting - public void setResourceCalculator(ResourceCalculator rc) { - this.calculator = rc; - } - - @Override - public int getNumClusterNodes() { - return nodeTracker.nodeCount(); - } - - @Override - public RMContext getRMContext() { - return this.rmContext; - } - - @Override - public void setRMContext(RMContext rmContext) { - this.rmContext = rmContext; - } - - @VisibleForTesting - void initScheduler(Configuration configuration) throws - IOException { - try { - writeLock.lock(); - String confProviderStr = configuration.get( - YarnConfiguration.SCHEDULER_CONFIGURATION_STORE_CLASS, - YarnConfiguration.DEFAULT_CONFIGURATION_STORE); - switch (confProviderStr) { - case YarnConfiguration.FILE_CONFIGURATION_STORE: - this.csConfProvider = - new FileBasedCSConfigurationProvider(rmContext); - break; - case YarnConfiguration.MEMORY_CONFIGURATION_STORE: - case YarnConfiguration.LEVELDB_CONFIGURATION_STORE: - case YarnConfiguration.ZK_CONFIGURATION_STORE: - this.csConfProvider = new MutableCSConfigurationProvider(rmContext); - break; - default: - throw new IOException("Invalid configuration store class: " + - confProviderStr); - } - this.csConfProvider.init(configuration); - this.conf = this.csConfProvider.loadConfiguration(configuration); - validateConf(this.conf); - this.minimumAllocation = super.getMinimumAllocation(); - initMaximumResourceCapability(super.getMaximumAllocation()); - this.calculator = this.conf.getResourceCalculator(); - if (this.calculator instanceof DefaultResourceCalculator - && ResourceUtils.getNumberOfKnownResourceTypes() > 2) { - throw new YarnRuntimeException("RM uses DefaultResourceCalculator which" - + " used only memory as resource-type but invalid resource-types" - + " specified " + ResourceUtils.getResourceTypes() + ". 
Use" - + " DomainantResourceCalculator instead to make effective use of" - + " these resource-types"); - } - this.usePortForNodeName = this.conf.getUsePortForNodeName(); - this.applications = new ConcurrentHashMap<>(); - this.labelManager = rmContext.getNodeLabelManager(); - this.appPriorityACLManager = new AppPriorityACLsManager(conf); - this.queueManager = new CapacitySchedulerQueueManager(yarnConf, - this.labelManager, this.appPriorityACLManager); - this.queueManager.setCapacitySchedulerContext(this); - - this.activitiesManager = new ActivitiesManager(rmContext); - activitiesManager.init(conf); - initializeQueues(this.conf); - this.isLazyPreemptionEnabled = conf.getLazyPreemptionEnabled(); - - scheduleAsynchronously = this.conf.getScheduleAynschronously(); - asyncScheduleInterval = this.conf.getLong(ASYNC_SCHEDULER_INTERVAL, - DEFAULT_ASYNC_SCHEDULER_INTERVAL); - - this.assignMultipleEnabled = this.conf.getAssignMultipleEnabled(); - this.maxAssignPerHeartbeat = this.conf.getMaxAssignPerHeartbeat(); - - // number of threads for async scheduling - int maxAsyncSchedulingThreads = this.conf.getInt( - CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_THREAD, - 1); - maxAsyncSchedulingThreads = Math.max(maxAsyncSchedulingThreads, 1); - - if (scheduleAsynchronously) { - asyncSchedulerThreads = new ArrayList<>(); - for (int i = 0; i < maxAsyncSchedulingThreads; i++) { - asyncSchedulerThreads.add(new AsyncScheduleThread(this)); - } - resourceCommitterService = new ResourceCommitterService(this); - asyncMaxPendingBacklogs = this.conf.getInt( - CapacitySchedulerConfiguration. - SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_PENDING_BACKLOGS, - CapacitySchedulerConfiguration. - DEFAULT_SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_PENDING_BACKLOGS); - } - - // Setup how many containers we can allocate for each round - offswitchPerHeartbeatLimit = this.conf.getOffSwitchPerHeartbeatLimit(); - - LOG.info("Initialized CapacityScheduler with " + "calculator=" - + getResourceCalculator().getClass() + ", " + "minimumAllocation=<" - + getMinimumResourceCapability() + ">, " + "maximumAllocation=<" - + getMaximumResourceCapability() + ">, " + "asynchronousScheduling=" - + scheduleAsynchronously + ", " + "asyncScheduleInterval=" - + asyncScheduleInterval + "ms"); - } finally { - writeLock.unlock(); - } - } - - private void startSchedulerThreads() { - try { - writeLock.lock(); - activitiesManager.start(); - if (scheduleAsynchronously) { - Preconditions.checkNotNull(asyncSchedulerThreads, - "asyncSchedulerThreads is null"); - for (Thread t : asyncSchedulerThreads) { - t.start(); - } - - resourceCommitterService.start(); - } - } finally { - writeLock.unlock(); - } - } - - @Override - public void serviceInit(Configuration conf) throws Exception { - Configuration configuration = new Configuration(conf); - super.serviceInit(conf); - initScheduler(configuration); - // Initialize SchedulingMonitorManager - schedulingMonitorManager.initialize(rmContext, conf); - } - - @Override - public void serviceStart() throws Exception { - startSchedulerThreads(); - super.serviceStart(); - } - - @Override - public void serviceStop() throws Exception { - try { - writeLock.lock(); - this.activitiesManager.stop(); - if (scheduleAsynchronously && asyncSchedulerThreads != null) { - for (Thread t : asyncSchedulerThreads) { - t.interrupt(); - t.join(THREAD_JOIN_TIMEOUT_MS); - } - resourceCommitterService.interrupt(); - resourceCommitterService.join(THREAD_JOIN_TIMEOUT_MS); - } - } finally { - writeLock.unlock(); - } - - if (isConfigurationMutable()) { - 
((MutableConfigurationProvider) csConfProvider).close(); - } - super.serviceStop(); - } - - @Override - public void reinitialize(Configuration newConf, RMContext rmContext) - throws IOException { - writeLock.lock(); - try { - Configuration configuration = new Configuration(newConf); - CapacitySchedulerConfiguration oldConf = this.conf; - this.conf = csConfProvider.loadConfiguration(configuration); - validateConf(this.conf); - try { - LOG.info("Re-initializing queues..."); - refreshMaximumAllocation( - ResourceUtils.fetchMaximumAllocationFromConfig(this.conf)); - reinitializeQueues(this.conf); - } catch (Throwable t) { - this.conf = oldConf; - refreshMaximumAllocation( - ResourceUtils.fetchMaximumAllocationFromConfig(this.conf)); - throw new IOException("Failed to re-init queues : " + t.getMessage(), - t); - } - - // update lazy preemption - this.isLazyPreemptionEnabled = this.conf.getLazyPreemptionEnabled(); - - // Setup how many containers we can allocate for each round - offswitchPerHeartbeatLimit = this.conf.getOffSwitchPerHeartbeatLimit(); - - super.reinitialize(newConf, rmContext); - } finally { - writeLock.unlock(); - } - } - - long getAsyncScheduleInterval() { - return asyncScheduleInterval; - } - - private final static Random random = new Random(System.currentTimeMillis()); - - private static boolean shouldSkipNodeSchedule(FiCaSchedulerNode node, - CapacityScheduler cs, boolean printVerboseLog) { - // Skip node which missed 2 heartbeats since the node might be dead and - // we should not continue allocate containers on that. - long timeElapsedFromLastHeartbeat = - Time.monotonicNow() - node.getLastHeartbeatMonotonicTime(); - if (timeElapsedFromLastHeartbeat > cs.nmHeartbeatInterval * 2) { - if (printVerboseLog && LOG.isDebugEnabled()) { - LOG.debug("Skip scheduling on node because it haven't heartbeated for " - + timeElapsedFromLastHeartbeat / 1000.0f + " secs"); - } - return true; - } - return false; - } - - /** - * Schedule on all nodes by starting at a random point. - * @param cs - */ - static void schedule(CapacityScheduler cs) throws InterruptedException{ - // First randomize the start point - int current = 0; - Collection nodes = cs.nodeTracker.getAllNodes(); - - // If nodes size is 0 (when there are no node managers registered, - // we can return from here itself. - int nodeSize = nodes.size(); - if(nodeSize == 0) { - return; - } - int start = random.nextInt(nodeSize); - - // To avoid too verbose DEBUG logging, only print debug log once for - // every 10 secs. 
- boolean printSkipedNodeLogging = false; - if (Time.monotonicNow() / 1000 % 10 == 0) { - printSkipedNodeLogging = (!printedVerboseLoggingForAsyncScheduling); - } else { - printedVerboseLoggingForAsyncScheduling = false; - } - - // Allocate containers of node [start, end) - for (FiCaSchedulerNode node : nodes) { - if (current++ >= start) { - if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) { - continue; - } - cs.allocateContainersToNode(node.getNodeID(), false); - } - } - - current = 0; - - // Allocate containers of node [0, start) - for (FiCaSchedulerNode node : nodes) { - if (current++ > start) { - break; - } - if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) { - continue; - } - cs.allocateContainersToNode(node.getNodeID(), false); - } - - if (printSkipedNodeLogging) { - printedVerboseLoggingForAsyncScheduling = true; - } - - Thread.sleep(cs.getAsyncScheduleInterval()); - } + AbstractYarnScheduler implements + PreemptableResourceScheduler, CapacitySchedulerContext, Configurable, + ResourceAllocationCommitter, MutableConfScheduler { - static class AsyncScheduleThread extends Thread { + private static final Log LOG = LogFactory.getLog(CapacityScheduler.class); - private final CapacityScheduler cs; - private AtomicBoolean runSchedules = new AtomicBoolean(false); + private CapacitySchedulerQueueManager queueManager; - public AsyncScheduleThread(CapacityScheduler cs) { - this.cs = cs; - setDaemon(true); + // timeout to join when we stop this service + protected final long THREAD_JOIN_TIMEOUT_MS = 1000; + + private PreemptionManager preemptionManager = new PreemptionManager(); + + private volatile boolean isLazyPreemptionEnabled = false; + + private int offswitchPerHeartbeatLimit; + + private boolean assignMultipleEnabled; + + private int maxAssignPerHeartbeat; + + private CSConfigurationProvider csConfProvider; + + @Override + public void setConf(Configuration conf) { + yarnConf = conf; + } + + private void validateConf(Configuration conf) { + // validate scheduler memory allocation setting + int minMem = conf.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); + int maxMem = conf.getInt( + YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB); + + if (minMem <= 0 || minMem > maxMem) { + throw new YarnRuntimeException("Invalid resource scheduler memory" + + " allocation configuration" + + ", " + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB + + "=" + minMem + + ", " + YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB + + "=" + maxMem + ", min and max should be greater than 0" + + ", max should be no smaller than min."); + } + + // validate scheduler vcores allocation setting + int minVcores = conf.getInt( + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES); + int maxVcores = conf.getInt( + YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES, + YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES); + + if (minVcores <= 0 || minVcores > maxVcores) { + throw new YarnRuntimeException("Invalid resource scheduler vcores" + + " allocation configuration" + + ", " + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES + + "=" + minVcores + + ", " + YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES + + "=" + maxVcores + ", min and max should be greater than 0" + + ", max should be no smaller than min."); + } + } + + @Override 
+ public Configuration getConf() { + return yarnConf; + } + + private CapacitySchedulerConfiguration conf; + private Configuration yarnConf; + + private ResourceCalculator calculator; + private boolean usePortForNodeName; + + private boolean scheduleAsynchronously; + private List asyncSchedulerThreads; + private ResourceCommitterService resourceCommitterService; + private RMNodeLabelsManager labelManager; + private AppPriorityACLsManager appPriorityACLManager; + + private static boolean printedVerboseLoggingForAsyncScheduling = false; + + /** + * EXPERT + */ + private long asyncScheduleInterval; + private static final String ASYNC_SCHEDULER_INTERVAL = + CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_PREFIX + + ".scheduling-interval-ms"; + private static final long DEFAULT_ASYNC_SCHEDULER_INTERVAL = 5; + private long asyncMaxPendingBacklogs; + + public CapacityScheduler() { + super(CapacityScheduler.class.getName()); + } + + @Override + public QueueMetrics getRootQueueMetrics() { + return getRootQueue().getMetrics(); + } + + public CSQueue getRootQueue() { + return queueManager.getRootQueue(); + } + + @Override + public CapacitySchedulerConfiguration getConfiguration() { + return conf; + } + + @Override + public RMContainerTokenSecretManager getContainerTokenSecretManager() { + return this.rmContext.getContainerTokenSecretManager(); + } + + @Override + public ResourceCalculator getResourceCalculator() { + return calculator; + } + + @VisibleForTesting + public void setResourceCalculator(ResourceCalculator rc) { + this.calculator = rc; + } + + @Override + public int getNumClusterNodes() { + return nodeTracker.nodeCount(); + } + + @Override + public RMContext getRMContext() { + return this.rmContext; + } + + @Override + public void setRMContext(RMContext rmContext) { + this.rmContext = rmContext; + } + + @VisibleForTesting + synchronized void initScheduler(Configuration configuration) throws + IOException { + String confProviderStr = configuration.get( + YarnConfiguration.SCHEDULER_CONFIGURATION_STORE_CLASS, + YarnConfiguration.DEFAULT_CONFIGURATION_STORE); + switch (confProviderStr) { + case YarnConfiguration.FILE_CONFIGURATION_STORE: + this.csConfProvider = + new FileBasedCSConfigurationProvider(rmContext); + break; + case YarnConfiguration.MEMORY_CONFIGURATION_STORE: + case YarnConfiguration.LEVELDB_CONFIGURATION_STORE: + case YarnConfiguration.ZK_CONFIGURATION_STORE: + this.csConfProvider = new MutableCSConfigurationProvider(rmContext); + break; + default: + throw new IOException("Invalid configuration store class: " + + confProviderStr); + } + this.csConfProvider.init(configuration); + this.conf = this.csConfProvider.loadConfiguration(configuration); + validateConf(this.conf); + this.minimumAllocation = super.getMinimumAllocation(); + initMaximumResourceCapability(super.getMaximumAllocation()); + this.calculator = this.conf.getResourceCalculator(); + if (this.calculator instanceof DefaultResourceCalculator + && ResourceUtils.getNumberOfKnownResourceTypes() > 2) { + throw new YarnRuntimeException("RM uses DefaultResourceCalculator which" + + " used only memory as resource-type but invalid resource-types" + + " specified " + ResourceUtils.getResourceTypes() + ". 
Use" + + " DomainantResourceCalculator instead to make effective use of" + + " these resource-types"); + } + this.usePortForNodeName = this.conf.getUsePortForNodeName(); + this.applications = new ConcurrentHashMap<>(); + this.labelManager = rmContext.getNodeLabelManager(); + this.appPriorityACLManager = new AppPriorityACLsManager(conf); + this.queueManager = new CapacitySchedulerQueueManager(yarnConf, + this.labelManager, this.appPriorityACLManager); + this.queueManager.setCapacitySchedulerContext(this); + + this.activitiesManager = new ActivitiesManager(rmContext); + activitiesManager.init(conf); + initializeQueues(this.conf); + this.isLazyPreemptionEnabled = conf.getLazyPreemptionEnabled(); + + scheduleAsynchronously = this.conf.getScheduleAynschronously(); + asyncScheduleInterval = this.conf.getLong(ASYNC_SCHEDULER_INTERVAL, + DEFAULT_ASYNC_SCHEDULER_INTERVAL); + + this.assignMultipleEnabled = this.conf.getAssignMultipleEnabled(); + this.maxAssignPerHeartbeat = this.conf.getMaxAssignPerHeartbeat(); + + // number of threads for async scheduling + int maxAsyncSchedulingThreads = this.conf.getInt( + CapacitySchedulerConfiguration.SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_THREAD, + 1); + maxAsyncSchedulingThreads = Math.max(maxAsyncSchedulingThreads, 1); + + if (scheduleAsynchronously) { + asyncSchedulerThreads = new ArrayList<>(); + for (int i = 0; i < maxAsyncSchedulingThreads; i++) { + asyncSchedulerThreads.add(new AsyncScheduleThread(this)); + } + resourceCommitterService = new ResourceCommitterService(this); + asyncMaxPendingBacklogs = this.conf.getInt( + CapacitySchedulerConfiguration. + SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_PENDING_BACKLOGS, + CapacitySchedulerConfiguration. + DEFAULT_SCHEDULE_ASYNCHRONOUSLY_MAXIMUM_PENDING_BACKLOGS); + } + + // Setup how many containers we can allocate for each round + offswitchPerHeartbeatLimit = this.conf.getOffSwitchPerHeartbeatLimit(); + + LOG.info("Initialized CapacityScheduler with " + "calculator=" + + getResourceCalculator().getClass() + ", " + "minimumAllocation=<" + + getMinimumResourceCapability() + ">, " + "maximumAllocation=<" + + getMaximumResourceCapability() + ">, " + "asynchronousScheduling=" + + scheduleAsynchronously + ", " + "asyncScheduleInterval=" + + asyncScheduleInterval + "ms"); + } + + private synchronized void startSchedulerThreads() { + activitiesManager.start(); + if (scheduleAsynchronously) { + Preconditions.checkNotNull(asyncSchedulerThreads, + "asyncSchedulerThreads is null"); + for (Thread t : asyncSchedulerThreads) { + t.start(); + } + resourceCommitterService.start(); + } + } + + @Override + public void serviceInit(Configuration conf) throws Exception { + Configuration configuration = new Configuration(conf); + super.serviceInit(conf); + initScheduler(configuration); + // Initialize SchedulingMonitorManager + schedulingMonitorManager.initialize(rmContext, conf); + } + + @Override + public void serviceStart() throws Exception { + startSchedulerThreads(); + super.serviceStart(); + } + + @Override + public void serviceStop() throws Exception { + synchronized (this) { + this.activitiesManager.stop(); + if (scheduleAsynchronously && asyncSchedulerThreads != null) { + for (Thread t : asyncSchedulerThreads) { + t.interrupt(); + t.join(THREAD_JOIN_TIMEOUT_MS); + } + resourceCommitterService.interrupt(); + resourceCommitterService.join(THREAD_JOIN_TIMEOUT_MS); + } + } + + if (isConfigurationMutable()) { + ((MutableConfigurationProvider) csConfProvider).close(); + } + super.serviceStop(); + } + + @Override + public synchronized void 
reinitialize(Configuration newConf, RMContext rmContext) + throws IOException { + Configuration configuration = new Configuration(newConf); + CapacitySchedulerConfiguration oldConf = this.conf; + this.conf = csConfProvider.loadConfiguration(configuration); + validateConf(this.conf); + try { + LOG.info("Re-initializing queues..."); + refreshMaximumAllocation( + ResourceUtils.fetchMaximumAllocationFromConfig(this.conf)); + reinitializeQueues(this.conf); + } catch (Throwable t) { + this.conf = oldConf; + refreshMaximumAllocation( + ResourceUtils.fetchMaximumAllocationFromConfig(this.conf)); + throw new IOException("Failed to re-init queues : " + t.getMessage(), + t); + } + + // update lazy preemption + this.isLazyPreemptionEnabled = this.conf.getLazyPreemptionEnabled(); + + // Setup how many containers we can allocate for each round + offswitchPerHeartbeatLimit = this.conf.getOffSwitchPerHeartbeatLimit(); + + super.reinitialize(newConf, rmContext); + } + + long getAsyncScheduleInterval() { + return asyncScheduleInterval; + } + + private final static Random random = new Random(System.currentTimeMillis()); + + private static boolean shouldSkipNodeSchedule(FiCaSchedulerNode node, + CapacityScheduler cs, boolean printVerboseLog) { + // Skip node which missed 2 heartbeats since the node might be dead and + // we should not continue allocate containers on that. + long timeElapsedFromLastHeartbeat = + Time.monotonicNow() - node.getLastHeartbeatMonotonicTime(); + if (timeElapsedFromLastHeartbeat > cs.nmHeartbeatInterval * 2) { + if (printVerboseLog && LOG.isDebugEnabled()) { + LOG.debug("Skip scheduling on node because it haven't heartbeated for " + + timeElapsedFromLastHeartbeat / 1000.0f + " secs"); + } + return true; + } + return false; + } + + /** + * Schedule on all nodes by starting at a random point. + * @param cs + */ + static void schedule(CapacityScheduler cs) throws InterruptedException { + // First randomize the start point + int current = 0; + Collection nodes = cs.nodeTracker.getAllNodes(); + + // If nodes size is 0 (when there are no node managers registered, + // we can return from here itself. + int nodeSize = nodes.size(); + if (nodeSize == 0) { + return; + } + int start = random.nextInt(nodeSize); + + // To avoid too verbose DEBUG logging, only print debug log once for + // every 10 secs. 
+ boolean printSkipedNodeLogging = false; + if (Time.monotonicNow() / 1000 % 10 == 0) { + printSkipedNodeLogging = (!printedVerboseLoggingForAsyncScheduling); + } else { + printedVerboseLoggingForAsyncScheduling = false; + } + + // Allocate containers of node [start, end) + for (FiCaSchedulerNode node : nodes) { + if (current++ >= start) { + if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) { + continue; + } + cs.allocateContainersToNode(node.getNodeID(), false); + } + } + + current = 0; + + // Allocate containers of node [0, start) + for (FiCaSchedulerNode node : nodes) { + if (current++ > start) { + break; + } + if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) { + continue; + } + cs.allocateContainersToNode(node.getNodeID(), false); + } + + if (printSkipedNodeLogging) { + printedVerboseLoggingForAsyncScheduling = true; + } + + Thread.sleep(cs.getAsyncScheduleInterval()); + } + + static class AsyncScheduleThread extends Thread { + + private final CapacityScheduler cs; + private AtomicBoolean runSchedules = new AtomicBoolean(false); + + public AsyncScheduleThread(CapacityScheduler cs) { + this.cs = cs; + setDaemon(true); + } + + @Override + public void run() { + int debuggingLogCounter = 0; + while (!Thread.currentThread().isInterrupted()) { + try { + if (!runSchedules.get()) { + Thread.sleep(100); + } else { + // Don't run schedule if we have some pending backlogs already + if (cs.getAsyncSchedulingPendingBacklogs() + > cs.asyncMaxPendingBacklogs) { + Thread.sleep(1); + } else { + schedule(cs); + if (LOG.isDebugEnabled()) { + // Adding a debug log here to ensure that the thread is alive + // and running fine. + if (debuggingLogCounter++ > 10000) { + debuggingLogCounter = 0; + LOG.debug("AsyncScheduleThread[" + getName() + "] is running!"); + } + } + } + } + } catch (InterruptedException ie) { + // keep interrupt signal + Thread.currentThread().interrupt(); + } + } + LOG.info("AsyncScheduleThread[" + getName() + "] exited!"); + } + + public void beginSchedule() { + runSchedules.set(true); + } + + public void suspendSchedule() { + runSchedules.set(false); + } + + } + + static class ResourceCommitterService extends Thread { + private final CapacityScheduler cs; + private BlockingQueue> + backlogs = new LinkedBlockingQueue<>(); + + public ResourceCommitterService(CapacityScheduler cs) { + this.cs = cs; + setDaemon(true); + } + + @Override + public void run() { + while (!Thread.currentThread().isInterrupted()) { + try { + ResourceCommitRequest request = + backlogs.take(); + + synchronized (backlogs) { + cs.tryCommit(cs.getClusterResource(), request, true); + } + + } catch (InterruptedException e) { + LOG.error(e); + Thread.currentThread().interrupt(); + } + } + LOG.info("ResourceCommitterService exited!"); + } + + public void addNewCommitRequest( + ResourceCommitRequest proposal) { + backlogs.add(proposal); + } + + public int getPendingBacklogs() { + return backlogs.size(); + } + } + + @VisibleForTesting + public PlacementRule getUserGroupMappingPlacementRule() throws IOException { + UserGroupMappingPlacementRule ugRule = new UserGroupMappingPlacementRule(); + ugRule.initialize(this); + return ugRule; + } + + public PlacementRule getAppNameMappingPlacementRule() throws IOException { + AppNameMappingPlacementRule anRule = new AppNameMappingPlacementRule(); + anRule.initialize(this); + return anRule; + } + + @VisibleForTesting + public void updatePlacementRules() throws IOException { + // Initialize placement rules + Collection placementRuleStrs = 
conf.getStringCollection( + YarnConfiguration.QUEUE_PLACEMENT_RULES); + List placementRules = new ArrayList<>(); + Set distingushRuleSet = new HashSet<>(); + // fail the case if we get duplicate placementRule add in + for (String pls : placementRuleStrs) { + if (!distingushRuleSet.add(pls)) { + throw new IOException("Invalid PlacementRule inputs which " + + "contains duplicate rule strings"); + } + } + + // add UserGroupMappingPlacementRule if absent + distingushRuleSet.add(YarnConfiguration.USER_GROUP_PLACEMENT_RULE); + + placementRuleStrs = new ArrayList<>(distingushRuleSet); + + for (String placementRuleStr : placementRuleStrs) { + switch (placementRuleStr) { + case YarnConfiguration.USER_GROUP_PLACEMENT_RULE: + PlacementRule ugRule = getUserGroupMappingPlacementRule(); + if (null != ugRule) { + placementRules.add(ugRule); + } + break; + case YarnConfiguration.APP_NAME_PLACEMENT_RULE: + PlacementRule anRule = getAppNameMappingPlacementRule(); + if (null != anRule) { + placementRules.add(anRule); + } + break; + default: + boolean isMappingNotEmpty; + try { + PlacementRule rule = PlacementFactory.getPlacementRule( + placementRuleStr, conf); + if (null != rule) { + try { + isMappingNotEmpty = rule.initialize(this); + } catch (IOException ie) { + throw new IOException(ie); + } + if (isMappingNotEmpty) { + placementRules.add(rule); + } + } + } catch (ClassNotFoundException cnfe) { + throw new IOException(cnfe); + } + } + } + + rmContext.getQueuePlacementManager().updateRules(placementRules); + } + + @Lock(CapacityScheduler.class) + private void initializeQueues(CapacitySchedulerConfiguration conf) + throws IOException { + + this.queueManager.initializeQueues(conf); + + updatePlacementRules(); + + // Notify Preemption Manager + preemptionManager.refreshQueues(null, this.getRootQueue()); + } + + @Lock(CapacityScheduler.class) + private void reinitializeQueues(CapacitySchedulerConfiguration newConf) + throws IOException { + this.queueManager.reinitializeQueues(newConf); + updatePlacementRules(); + + // Notify Preemption Manager + preemptionManager.refreshQueues(null, this.getRootQueue()); + } + + @Override + public CSQueue getQueue(String queueName) { + if (queueName == null) { + return null; + } + return this.queueManager.getQueue(queueName); + } + + private synchronized void addApplicationOnRecovery(ApplicationId applicationId, + String queueName, String user, + Priority priority, ApplicationPlacementContext placementContext) { + //check if the queue needs to be auto-created during recovery + CSQueue queue = getOrCreateQueueFromPlacementContext(applicationId, user, + queueName, placementContext, true); + + if (queue == null) { + //During a restart, this indicates a queue was removed, which is + //not presently supported + if (!getConfiguration().shouldAppFailFast(getConfig())) { + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.KILL, + "Application killed on recovery as it" + + " was submitted to queue " + queueName + + " which no longer exists after restart.")); + return; + } else { + String queueErrorMsg = "Queue named " + queueName + " missing " + + "during application recovery." 
+ + " Queue removal during recovery is not presently " + + "supported by the capacity scheduler, please " + + "restart with all queues configured" + + " which were present before shutdown/restart."; + LOG.fatal(queueErrorMsg); + throw new QueueInvalidException(queueErrorMsg); + } + } + if (!(queue instanceof LeafQueue)) { + // During RM restart, this means leaf queue was converted to a parent + // queue, which is not supported for running apps. + if (!getConfiguration().shouldAppFailFast(getConfig())) { + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.KILL, + "Application killed on recovery as it was " + + "submitted to queue " + queueName + + " which is no longer a leaf queue after restart.")); + return; + } else { + String queueErrorMsg = "Queue named " + queueName + + " is no longer a leaf queue during application recovery." + + " Changing a leaf queue to a parent queue during recovery is" + + " not presently supported by the capacity scheduler. Please" + + " restart with leaf queues before shutdown/restart continuing" + + " as leaf queues."; + LOG.fatal(queueErrorMsg); + throw new QueueInvalidException(queueErrorMsg); + } + } + // When recovering apps in this queue but queue is in STOPPED state, + // that means its previous state was DRAINING. So we auto transit + // the state to DRAINING for recovery. + if (queue.getState() == QueueState.STOPPED) { + ((LeafQueue) queue).recoverDrainingState(); + } + // Submit to the queue + try { + queue.submitApplication(applicationId, user, queueName); + } catch (AccessControlException ace) { + // Ignore the exception for recovered app as the app was previously + // accepted. + } + queue.getMetrics().submitApp(user); + SchedulerApplication application = + new SchedulerApplication(queue, user, priority); + applications.put(applicationId, application); + LOG.info("Accepted application " + applicationId + " from user: " + user + + ", in queue: " + queueName); + if (LOG.isDebugEnabled()) { + LOG.debug( + applicationId + " is recovering. 
Skip notifying APP_ACCEPTED"); + } + } + + private CSQueue getOrCreateQueueFromPlacementContext(ApplicationId + applicationId, String user, String queueName, + ApplicationPlacementContext placementContext, + boolean isRecovery) { + + CSQueue queue = getQueue(queueName); + + if (queue == null) { + if (placementContext != null && placementContext.hasParentQueue()) { + try { + return autoCreateLeafQueue(placementContext); + } catch (YarnException | IOException e) { + if (isRecovery) { + if (!getConfiguration().shouldAppFailFast(getConfig())) { + LOG.error("Could not auto-create leaf queue " + queueName + + " due to : ", e); + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.KILL, + "Application killed on recovery" + + " as it was submitted to queue " + queueName + + " which could not be auto-created")); + } else { + String queueErrorMsg = + "Queue named " + queueName + " could not be " + + "auto-created during application recovery."; + LOG.fatal(queueErrorMsg, e); + throw new QueueInvalidException(queueErrorMsg); + } + } else { + LOG.error("Could not auto-create leaf queue due to : ", e); + final String message = + "Application " + applicationId + " submission by user : " + + user + + " to queue : " + queueName + " failed : " + e + .getMessage(); + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, + message)); + } + } + } + } + return queue; + } + + private synchronized void addApplication(ApplicationId applicationId, String queueName, + String user, Priority priority, + ApplicationPlacementContext placementContext) { + if (isSystemAppsLimitReached()) { + String message = "Maximum system application limit reached," + + "cannot accept submission of application: " + applicationId; + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, + message)); + return; + } + + //Could be a potential auto-created leaf queue + CSQueue queue = getOrCreateQueueFromPlacementContext(applicationId, user, + queueName, placementContext, false); + + if (queue == null) { + final String message = + "Application " + applicationId + " submitted by user " + user + + " to unknown queue: " + queueName; + + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, + message)); + return; + } + + if (!(queue instanceof LeafQueue)) { + String message = + "Application " + applicationId + " submitted by user : " + user + + " to non-leaf queue : " + queueName; + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, + message)); + return; + } else if (queue instanceof AutoCreatedLeafQueue && queue + .getParent() instanceof ManagedParentQueue) { + + //If queue already exists and auto-queue creation was not required, + //placement context should not be null + if (placementContext == null) { + String message = + "Application " + applicationId + " submission by user : " + user + + " to specified queue : " + queueName + " is prohibited. 
" + + "Verify automatic queue mapping for user exists in " + + QUEUE_MAPPING; + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, + message)); + return; + // For a queue which exists already and + // not auto-created above, then its parent queue should match + // the parent queue specified in queue mapping + } else if (!queue.getParent().getQueueName().equals( + placementContext.getParentQueue())) { + String message = + "Auto created Leaf queue " + placementContext.getQueue() + " " + + "already exists under queue : " + queue + .getParent().getQueuePath() + + ".But Queue mapping configuration " + + CapacitySchedulerConfiguration.QUEUE_MAPPING + " has been " + + "updated to a different parent queue : " + + placementContext.getParentQueue() + + " for the specified user : " + user; + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, + message)); + return; + } + } + + // Submit to the queue + try { + queue.submitApplication(applicationId, user, queueName); + } catch (AccessControlException ace) { + LOG.info("Failed to submit application " + applicationId + " to queue " + + queueName + " from user " + user, ace); + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, + ace.toString())); + return; + } + // update the metrics + queue.getMetrics().submitApp(user); + SchedulerApplication application = + new SchedulerApplication(queue, user, priority); + applications.put(applicationId, application); + LOG.info("Accepted application " + applicationId + " from user: " + user + + ", in queue: " + queueName); + rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_ACCEPTED)); + } + + private synchronized void addApplicationAttempt( + ApplicationAttemptId applicationAttemptId, + boolean transferStateFromPreviousAttempt, + boolean isAttemptRecovering) { + SchedulerApplication application = applications.get( + applicationAttemptId.getApplicationId()); + if (application == null) { + LOG.warn("Application " + applicationAttemptId.getApplicationId() + + " cannot be found in scheduler."); + return; + } + CSQueue queue = (CSQueue) application.getQueue(); + + FiCaSchedulerApp attempt = new FiCaSchedulerApp(applicationAttemptId, + application.getUser(), queue, queue.getAbstractUsersManager(), + rmContext, application.getPriority(), isAttemptRecovering, + activitiesManager); + if (transferStateFromPreviousAttempt) { + attempt.transferStateFromPreviousAttempt( + application.getCurrentAppAttempt()); + } + application.setCurrentAppAttempt(attempt); + + // Update attempt priority to the latest to avoid race condition i.e + // SchedulerApplicationAttempt is created with old priority but it is not + // set to SchedulerApplication#setCurrentAppAttempt. + // Scenario would occur is + // 1. SchdulerApplicationAttempt is created with old priority. + // 2. updateApplicationPriority() updates SchedulerApplication. Since + // currentAttempt is null, it just return. + // 3. ScheduelerApplcationAttempt is set in + // SchedulerApplication#setCurrentAppAttempt. 
+ attempt.setPriority(application.getPriority()); + + queue.submitApplicationAttempt(attempt, application.getUser()); + LOG.info("Added Application Attempt " + applicationAttemptId + + " to scheduler from user " + application.getUser() + " in queue " + + queue.getQueueName()); + if (isAttemptRecovering) { + if (LOG.isDebugEnabled()) { + LOG.debug(applicationAttemptId + + " is recovering. Skipping notifying ATTEMPT_ADDED"); + } + } else { + rmContext.getDispatcher().getEventHandler().handle( + new RMAppAttemptEvent(applicationAttemptId, + RMAppAttemptEventType.ATTEMPT_ADDED)); + } + } + + private synchronized void doneApplication(ApplicationId applicationId, + RMAppState finalState) { + SchedulerApplication application = applications.get( + applicationId); + if (application == null) { + // The AppRemovedSchedulerEvent maybe sent on recovery for completed + // apps, ignore it. + LOG.warn("Couldn't find application " + applicationId); + return; + } + CSQueue queue = (CSQueue) application.getQueue(); + if (!(queue instanceof LeafQueue)) { + LOG.error("Cannot finish application " + "from non-leaf queue: " + queue + .getQueueName()); + } else { + queue.finishApplication(applicationId, application.getUser()); + } + application.stop(finalState); + applications.remove(applicationId); + } + + private synchronized void doneApplicationAttempt( + ApplicationAttemptId applicationAttemptId, + RMAppAttemptState rmAppAttemptFinalState, boolean keepContainers) { + LOG.info("Application Attempt " + applicationAttemptId + " is done." + + " finalState=" + rmAppAttemptFinalState); + + FiCaSchedulerApp attempt = getApplicationAttempt(applicationAttemptId); + SchedulerApplication application = applications.get( + applicationAttemptId.getApplicationId()); + + if (application == null || attempt == null) { + LOG.info( + "Unknown application " + applicationAttemptId + " has completed!"); + return; + } + + // Release all the allocated, acquired, running containers + for (RMContainer rmContainer : attempt.getLiveContainers()) { + if (keepContainers && rmContainer.getState().equals( + RMContainerState.RUNNING)) { + // do not kill the running container in the case of work-preserving AM + // restart. + LOG.info("Skip killing " + rmContainer.getContainerId()); + continue; + } + super.completedContainer(rmContainer, SchedulerUtils + .createAbnormalContainerStatus(rmContainer.getContainerId(), + SchedulerUtils.COMPLETED_APPLICATION), + RMContainerEventType.KILL); + } + + // Release all reserved containers + for (RMContainer rmContainer : attempt.getReservedContainers()) { + super.completedContainer(rmContainer, SchedulerUtils + .createAbnormalContainerStatus(rmContainer.getContainerId(), + "Application Complete"), RMContainerEventType.KILL); + } + + // Clean up pending requests, metrics etc. + attempt.stop(rmAppAttemptFinalState); + + // Inform the queue + String queueName = attempt.getQueue().getQueueName(); + CSQueue queue = this.getQueue(queueName); + if (!(queue instanceof LeafQueue)) { + LOG.error( + "Cannot finish application " + "from non-leaf queue: " + queueName); + } else { + queue.finishApplicationAttempt(attempt, queue.getQueueName()); + } + } + + /** + * Normalize a list of SchedulingRequest. 
+ * + * @param asks scheduling request + */ + private void normalizeSchedulingRequests(List asks) { + if (asks == null) { + return; + } + for (SchedulingRequest ask : asks) { + ResourceSizing sizing = ask.getResourceSizing(); + if (sizing != null && sizing.getResources() != null) { + sizing.setResources(getNormalizedResource(sizing.getResources())); + } + } + } + + @Override + @Lock(Lock.NoLock.class) + public Allocation allocate(ApplicationAttemptId applicationAttemptId, + List ask, List schedulingRequests, + List release, List blacklistAdditions, + List blacklistRemovals, ContainerUpdates updateRequests) { + FiCaSchedulerApp application = getApplicationAttempt(applicationAttemptId); + if (application == null) { + LOG.error("Calling allocate on removed or non existent application " + + applicationAttemptId.getApplicationId()); + return EMPTY_ALLOCATION; + } + + // The allocate may be the leftover from previous attempt, and it will + // impact current attempt, such as confuse the request and allocation for + // current attempt's AM container. + // Note outside precondition check for the attempt id may be + // outdated here, so double check it here is necessary. + if (!application.getApplicationAttemptId().equals(applicationAttemptId)) { + LOG.error("Calling allocate on previous or removed " + + "or non existent application attempt " + applicationAttemptId); + return EMPTY_ALLOCATION; + } + + // Handle all container updates + handleContainerUpdates(application, updateRequests); + + // Release containers + releaseContainers(release, application); + + LeafQueue updateDemandForQueue = null; + + // Sanity check for new allocation requests + normalizeResourceRequests(ask); + + // Normalize scheduling requests + normalizeSchedulingRequests(schedulingRequests); + + Allocation allocation; + + // make sure we aren't stopping/removing the application + // when the allocate comes in + synchronized (application) { + if (application.isStopped()) { + return EMPTY_ALLOCATION; + } + + // Process resource requests + if (!ask.isEmpty() || (schedulingRequests != null && !schedulingRequests + .isEmpty())) { + if (LOG.isDebugEnabled()) { + LOG.debug( + "allocate: pre-update " + applicationAttemptId + " ask size =" + + ask.size()); + application.showRequests(); + } + + // Update application requests + if (application.updateResourceRequests(ask) || application + .updateSchedulingRequests(schedulingRequests)) { + updateDemandForQueue = (LeafQueue) application.getQueue(); + } + + if (LOG.isDebugEnabled()) { + LOG.debug("allocate: post-update"); + application.showRequests(); + } + } + + application.updateBlacklist(blacklistAdditions, blacklistRemovals); + + allocation = application.getAllocation(getResourceCalculator(), + getClusterResource(), getMinimumResourceCapability()); + } + + if (updateDemandForQueue != null && !application + .isWaitingForAMContainer()) { + updateDemandForQueue.getOrderingPolicy().demandUpdated(application); + } + + if (LOG.isDebugEnabled()) { + LOG.info("Allocation for application " + applicationAttemptId + " : " + + allocation + " with cluster resource : " + getClusterResource()); + } + return allocation; + } + + @Override + @Lock(Lock.NoLock.class) + public QueueInfo getQueueInfo(String queueName, + boolean includeChildQueues, boolean recursive) + throws IOException { + CSQueue queue = null; + queue = this.getQueue(queueName); + if (queue == null) { + throw new IOException("Unknown queue: " + queueName); + } + return queue.getQueueInfo(includeChildQueues, recursive); + } + + @Override + 
@Lock(Lock.NoLock.class) + public List getQueueUserAclInfo() { + UserGroupInformation user = null; + try { + user = UserGroupInformation.getCurrentUser(); + } catch (IOException ioe) { + // should never happen + return new ArrayList(); + } + + return getRootQueue().getQueueUserAclInfo(user); + } + + @Override + protected synchronized void nodeUpdate(RMNode rmNode) { + long begin = System.nanoTime(); + setLastNodeUpdateTime(Time.now()); + super.nodeUpdate(rmNode); + + // Try to do scheduling + if (!scheduleAsynchronously) { + ActivitiesLogger.NODE.startNodeUpdateRecording(activitiesManager, + rmNode.getNodeID()); + + // reset allocation and reservation stats before we start doing any + // work + updateSchedulerHealth(lastNodeUpdateTime, rmNode.getNodeID(), + CSAssignment.NULL_ASSIGNMENT); + + allocateContainersToNode(rmNode.getNodeID(), true); + ActivitiesLogger.NODE.finishNodeUpdateRecording(activitiesManager, + rmNode.getNodeID()); + } + + long latency = System.nanoTime() - begin; + CapacitySchedulerMetrics.getMetrics().addNodeUpdate(latency); + } + + /** + * Process resource update on a node. + */ + private synchronized void updateNodeAndQueueResource(RMNode nm, + ResourceOption resourceOption) { + updateNodeResource(nm, resourceOption); + Resource clusterResource = getClusterResource(); + getRootQueue().updateClusterResource(clusterResource, + new ResourceLimits(clusterResource)); + } + + /** + * Process node labels update on a node. + */ + private void updateLabelsOnNode(NodeId nodeId, + Set newLabels) { + FiCaSchedulerNode node = nodeTracker.getNode(nodeId); + if (null == node) { + return; + } + + // Get new partition, we have only one partition per node + String newPartition; + if (newLabels.isEmpty()) { + newPartition = RMNodeLabelsManager.NO_LABEL; + } else { + newPartition = newLabels.iterator().next(); + } + + // old partition as well + String oldPartition = node.getPartition(); + + // Update resources of these containers + for (RMContainer rmContainer : node.getCopiedListOfRunningContainers()) { + FiCaSchedulerApp application = getApplicationAttempt( + rmContainer.getApplicationAttemptId()); + if (null != application) { + application.nodePartitionUpdated(rmContainer, oldPartition, + newPartition); + } else { + LOG.warn("There's something wrong, some RMContainers running on" + + " a node, but we cannot find SchedulerApplicationAttempt " + + "for it. 
Node=" + node.getNodeID() + " applicationAttemptId=" + + rmContainer.getApplicationAttemptId()); + continue; + } + } + + // Unreserve container on this node + RMContainer reservedContainer = node.getReservedContainer(); + if (null != reservedContainer) { + killReservedContainer(reservedContainer); + } + + // Update node labels after we've done this + node.updateLabels(newLabels); + } + + private void updateSchedulerHealth(long now, NodeId nodeId, + CSAssignment assignment) { + List allocations = + assignment.getAssignmentInformation().getAllocationDetails(); + List reservations = + assignment.getAssignmentInformation().getReservationDetails(); + if (!allocations.isEmpty()) { + ContainerId allocatedContainerId = + allocations.get(allocations.size() - 1).containerId; + String allocatedQueue = allocations.get(allocations.size() - 1).queue; + schedulerHealth.updateAllocation(now, nodeId, allocatedContainerId, + allocatedQueue); + } + if (!reservations.isEmpty()) { + ContainerId reservedContainerId = + reservations.get(reservations.size() - 1).containerId; + String reservedQueue = reservations.get(reservations.size() - 1).queue; + schedulerHealth.updateReservation(now, nodeId, reservedContainerId, + reservedQueue); + } + schedulerHealth.updateSchedulerReservationCounts(assignment + .getAssignmentInformation().getNumReservations()); + schedulerHealth.updateSchedulerAllocationCounts(assignment + .getAssignmentInformation().getNumAllocations()); + schedulerHealth.updateSchedulerRunDetails(now, assignment + .getAssignmentInformation().getAllocated(), assignment + .getAssignmentInformation().getReserved()); + } + + private boolean canAllocateMore(CSAssignment assignment, int offswitchCount, + int assignedContainers) { + // Current assignment shouldn't be empty + if (assignment == null + || Resources.equals(assignment.getResource(), Resources.none())) { + return false; + } + + // offswitch assignment should be under threshold + if (offswitchCount >= offswitchPerHeartbeatLimit) { + return false; + } + + // And it should not be a reserved container + if (assignment.getAssignmentInformation().getNumReservations() > 0) { + return false; + } + + // assignMultipleEnabled should be ON, + // and assignedContainers should be under threshold + return assignMultipleEnabled + && (maxAssignPerHeartbeat == -1 + || assignedContainers < maxAssignPerHeartbeat); + } + + /** + * We need to make sure when doing allocation, Node should be existed + * And we will construct a {@link CandidateNodeSet} before proceeding + */ + private void allocateContainersToNode(NodeId nodeId, + boolean withNodeHeartbeat) { + FiCaSchedulerNode node = getNode(nodeId); + if (null != node) { + int offswitchCount = 0; + int assignedContainers = 0; + + CandidateNodeSet candidates = + new SimpleCandidateNodeSet<>(node); + CSAssignment assignment = allocateContainersToNode(candidates, + withNodeHeartbeat); + // Only check if we can allocate more container on the same node when + // scheduling is triggered by node heartbeat + if (null != assignment && withNodeHeartbeat) { + if (assignment.getType() == NodeType.OFF_SWITCH) { + offswitchCount++; + } + + if (Resources.greaterThan(calculator, getClusterResource(), + assignment.getResource(), Resources.none())) { + assignedContainers++; + } + + while (canAllocateMore(assignment, offswitchCount, + assignedContainers)) { + // Try to see if it is possible to allocate multiple container for + // the same node heartbeat + assignment = allocateContainersToNode(candidates, true); + + if (null != assignment + 
&& assignment.getType() == NodeType.OFF_SWITCH) { + offswitchCount++; + } + + if (null != assignment + && Resources.greaterThan(calculator, getClusterResource(), + assignment.getResource(), Resources.none())) { + assignedContainers++; + } + } + + if (offswitchCount >= offswitchPerHeartbeatLimit) { + if (LOG.isDebugEnabled()) { + LOG.debug("Assigned maximum number of off-switch containers: " + + offswitchCount + ", assignments so far: " + assignment); + } + } + } + } + } + + /* + * Logics of allocate container on a single node (Old behavior) + */ + private CSAssignment allocateContainerOnSingleNode( + CandidateNodeSet candidates, FiCaSchedulerNode node, + boolean withNodeHeartbeat) { + if (LOG.isDebugEnabled()) { + LOG.debug( + "Trying to schedule on node: " + node.getNodeName() + ", available: " + + node.getUnallocatedResource()); + } + + // Backward compatible way to make sure previous behavior which allocation + // driven by node heartbeat works. + if (getNode(node.getNodeID()) != node) { + LOG.error("Trying to schedule on a removed node, please double check, " + + "nodeId=" + node.getNodeID()); + return null; + } + + CSAssignment assignment; + + // Assign new containers... + // 1. Check for reserved applications + // 2. Schedule if there are no reservations + RMContainer reservedContainer = node.getReservedContainer(); + if (reservedContainer != null) { + FiCaSchedulerApp reservedApplication = getCurrentAttemptForContainer( + reservedContainer.getContainerId()); + if (reservedApplication == null) { + LOG.error( + "Trying to schedule for a finished app, please double check. nodeId=" + + node.getNodeID() + " container=" + reservedContainer + .getContainerId()); + return null; + } + + // Try to fulfill the reservation + if (LOG.isDebugEnabled()) { + LOG.debug("Trying to fulfill reservation for application " + + reservedApplication.getApplicationId() + " on node: " + node + .getNodeID()); + } + + LeafQueue queue = ((LeafQueue) reservedApplication.getQueue()); + assignment = queue.assignContainers(getClusterResource(), candidates, + // TODO, now we only consider limits for parent for non-labeled + // resources, should consider labeled resources as well. 
+ new ResourceLimits(labelManager + .getResourceByLabel(RMNodeLabelsManager.NO_LABEL, + getClusterResource())), + SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + + if (assignment.isFulfilledReservation()) { + if (withNodeHeartbeat) { + // Only update SchedulerHealth in sync scheduling; the existing + // SchedulerHealth data structures would need to be updated for + // async mode + updateSchedulerHealth(lastNodeUpdateTime, node.getNodeID(), + assignment); + } + + schedulerHealth.updateSchedulerFulfilledReservationCounts(1); + + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + queue.getParent().getQueueName(), queue.getQueueName(), + ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY); + ActivitiesLogger.NODE.finishAllocatedNodeAllocation(activitiesManager, + node, reservedContainer.getContainerId(), + AllocationState.ALLOCATED_FROM_RESERVED); + } else { + ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, + queue.getParent().getQueueName(), queue.getQueueName(), + ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY); + ActivitiesLogger.NODE.finishAllocatedNodeAllocation(activitiesManager, + node, reservedContainer.getContainerId(), AllocationState.SKIPPED); + } + + assignment.setSchedulingMode( + SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + submitResourceCommitRequest(getClusterResource(), assignment); + } + + // Do not schedule if there are any reservations to fulfill on the node + if (node.getReservedContainer() != null) { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping scheduling since node " + node.getNodeID() + + " is reserved by application " + node.getReservedContainer() + .getContainerId().getApplicationAttemptId()); + } + return null; + } + + // First check if we can schedule + // When looking at one node only, try to schedule if the node + // has any available or killable resources + if (calculator.computeAvailableContainers(Resources + .add(node.getUnallocatedResource(), node.getTotalKillableResources()), + minimumAllocation) <= 0) { + if (LOG.isDebugEnabled()) { + LOG.debug("This node or this node partition doesn't have available or " + + "killable resources"); + } + return null; + } + + return allocateOrReserveNewContainers(candidates, withNodeHeartbeat); + } +
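// allocateContainerOnSingleNode() above only tries new containers when the
// node's unallocated plus preemption-killable resources fit at least one
// minimum allocation. A simplified, memory-only sketch of that headroom
// check (illustrative; the real code goes through ResourceCalculator):
static boolean hasHeadroomForOneContainer(long unallocatedMb,
    long killableMb, long minAllocationMb) {
  // mirrors: computeAvailableContainers(unallocated + killable, min) > 0
  return (unallocatedMb + killableMb) / minAllocationMb >= 1;
}
// e.g. 512 MB unallocated + 1024 MB killable with a 1024 MB minimum
// allocation still leaves headroom for one container.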
+ private CSAssignment allocateOrReserveNewContainers( + CandidateNodeSet candidates, + boolean withNodeHeartbeat) { + CSAssignment assignment = getRootQueue().assignContainers( + getClusterResource(), candidates, new ResourceLimits(labelManager + .getResourceByLabel(candidates.getPartition(), + getClusterResource())), + SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + + assignment.setSchedulingMode(SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); + submitResourceCommitRequest(getClusterResource(), assignment); + + if (Resources.greaterThan(calculator, getClusterResource(), + assignment.getResource(), Resources.none())) { + if (withNodeHeartbeat) { + updateSchedulerHealth(lastNodeUpdateTime, + CandidateNodeSetUtils.getSingleNode(candidates).getNodeID(), + assignment); + } + return assignment; + } + + // Only do non-exclusive allocation when node has node-labels. + if (StringUtils.equals(candidates.getPartition(), + RMNodeLabelsManager.NO_LABEL)) { + return null; + } + + // Only do non-exclusive allocation when the node-label supports that + try { + if (rmContext.getNodeLabelManager().isExclusiveNodeLabel( + candidates.getPartition())) { + return null; + } + } catch (IOException e) { + LOG.warn( + "Exception when trying to get exclusivity of node label=" + candidates + .getPartition(), e); + return null; + } + + // Try to use NON_EXCLUSIVE + assignment = getRootQueue().assignContainers(getClusterResource(), + candidates, + // TODO, now we only consider limits for parent for non-labeled + // resources, should consider labeled resources as well. + new ResourceLimits(labelManager + .getResourceByLabel(RMNodeLabelsManager.NO_LABEL, + getClusterResource())), + SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY); + assignment.setSchedulingMode(SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY); + submitResourceCommitRequest(getClusterResource(), assignment); + + return assignment; + } + + /* + * New behavior: allocate containers considering multiple nodes + */ + private CSAssignment allocateContainersOnMultiNodes( + CandidateNodeSet candidates) { + // When looking at multiple nodes, try to schedule if the + // partition has any available or killable resources + if (getRootQueue().getQueueCapacities().getUsedCapacity( + candidates.getPartition()) >= 1.0f + && preemptionManager.getKillableResource( + CapacitySchedulerConfiguration.ROOT, candidates.getPartition()) + == Resources.none()) { + if (LOG.isDebugEnabled()) { + LOG.debug("This node or this node partition doesn't have available or " + + "killable resources"); + } + return null; + } + + return allocateOrReserveNewContainers(candidates, false); + } + + @VisibleForTesting + CSAssignment allocateContainersToNode( + CandidateNodeSet candidates, + boolean withNodeHeartbeat) { + if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext + .isSchedulerReadyForAllocatingContainers()) { + return null; + } + + long startTime = System.nanoTime(); + + // Backward-compatible path to make sure the previous behavior, where + // allocation is driven by node heartbeat, keeps working. + FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates); + + // We have two different code paths to handle allocation on a single node + // and on multiple nodes.
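/*
 * Dispatch sketch for the two paths below (illustrative summary, not code
 * from this patch):
 *
 *   node != null  -> allocateContainerOnSingleNode(...)   // heartbeat-driven
 *   node == null  -> allocateContainersOnMultiNodes(...)  // partition-wide
 *
 * The multi-node path above bails out early when the partition's used
 * capacity has reached 1.0 and the preemption manager reports no killable
 * resources, i.e. a fully-used partition is only revisited once preemption
 * frees something.
 */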
+ CSAssignment assignment; + if (null != node) { + assignment = allocateContainerOnSingleNode(candidates, + node, withNodeHeartbeat); + } else { + assignment = allocateContainersOnMultiNodes(candidates); + } + + if (assignment != null && assignment.getAssignmentInformation() != null + && assignment.getAssignmentInformation().getNumAllocations() > 0) { + long allocateTime = System.nanoTime() - startTime; + CapacitySchedulerMetrics.getMetrics().addAllocate(allocateTime); + } + return assignment; + } + + @Override + public void handle(SchedulerEvent event) { + switch (event.getType()) { + case NODE_ADDED: { + NodeAddedSchedulerEvent nodeAddedEvent = (NodeAddedSchedulerEvent) event; + addNode(nodeAddedEvent.getAddedRMNode()); + recoverContainersOnNode(nodeAddedEvent.getContainerReports(), + nodeAddedEvent.getAddedRMNode()); + } + break; + case NODE_REMOVED: { + NodeRemovedSchedulerEvent nodeRemovedEvent = (NodeRemovedSchedulerEvent) event; + removeNode(nodeRemovedEvent.getRemovedRMNode()); + } + break; + case NODE_RESOURCE_UPDATE: { + NodeResourceUpdateSchedulerEvent nodeResourceUpdatedEvent = + (NodeResourceUpdateSchedulerEvent) event; + updateNodeAndQueueResource(nodeResourceUpdatedEvent.getRMNode(), + nodeResourceUpdatedEvent.getResourceOption()); + } + break; + case NODE_LABELS_UPDATE: { + NodeLabelsUpdateSchedulerEvent labelUpdateEvent = + (NodeLabelsUpdateSchedulerEvent) event; + + updateNodeLabelsAndQueueResource(labelUpdateEvent); + } + break; + case NODE_UPDATE: { + NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent) event; + nodeUpdate(nodeUpdatedEvent.getRMNode()); + } + break; + case APP_ADDED: { + AppAddedSchedulerEvent appAddedEvent = (AppAddedSchedulerEvent) event; + String queueName = resolveReservationQueueName(appAddedEvent.getQueue(), + appAddedEvent.getApplicationId(), appAddedEvent.getReservationID(), + appAddedEvent.getIsAppRecovering()); + if (queueName != null) { + if (!appAddedEvent.getIsAppRecovering()) { + addApplication(appAddedEvent.getApplicationId(), queueName, + appAddedEvent.getUser(), appAddedEvent.getApplicatonPriority(), + appAddedEvent.getPlacementContext()); + } else { + addApplicationOnRecovery(appAddedEvent.getApplicationId(), queueName, + appAddedEvent.getUser(), appAddedEvent.getApplicatonPriority(), + appAddedEvent.getPlacementContext()); + } + } + } + break; + case APP_REMOVED: { + AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent) event; + doneApplication(appRemovedEvent.getApplicationID(), + appRemovedEvent.getFinalState()); + } + break; + case APP_ATTEMPT_ADDED: { + AppAttemptAddedSchedulerEvent appAttemptAddedEvent = + (AppAttemptAddedSchedulerEvent) event; + addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId(), + appAttemptAddedEvent.getTransferStateFromPreviousAttempt(), + appAttemptAddedEvent.getIsAttemptRecovering()); + } + break; + case APP_ATTEMPT_REMOVED: { + AppAttemptRemovedSchedulerEvent appAttemptRemovedEvent = + (AppAttemptRemovedSchedulerEvent) event; + doneApplicationAttempt(appAttemptRemovedEvent.getApplicationAttemptID(), + appAttemptRemovedEvent.getFinalAttemptState(), + appAttemptRemovedEvent.getKeepContainersAcrossAppAttempts()); + } + break; + case CONTAINER_EXPIRED: { + ContainerExpiredSchedulerEvent containerExpiredEvent = + (ContainerExpiredSchedulerEvent) event; + ContainerId containerId = containerExpiredEvent.getContainerId(); + if (containerExpiredEvent.isIncrease()) { + rollbackContainerUpdate(containerId); + } else { + 
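// A non-increase expiry: complete the container with an abnormal
// EXPIRED_CONTAINER status so its resources are released back to the queue.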
completedContainer(getRMContainer(containerId), + SchedulerUtils.createAbnormalContainerStatus( + containerId, + SchedulerUtils.EXPIRED_CONTAINER), + RMContainerEventType.EXPIRE); + } + } + break; + case RELEASE_CONTAINER: { + RMContainer container = ((ReleaseContainerEvent) event).getContainer(); + completedContainer(container, + SchedulerUtils.createAbnormalContainerStatus( + container.getContainerId(), + SchedulerUtils.RELEASED_CONTAINER), + RMContainerEventType.RELEASED); + } + break; + case KILL_RESERVED_CONTAINER: { + ContainerPreemptEvent killReservedContainerEvent = + (ContainerPreemptEvent) event; + RMContainer container = killReservedContainerEvent.getContainer(); + killReservedContainer(container); + } + break; + case MARK_CONTAINER_FOR_PREEMPTION: { + ContainerPreemptEvent preemptContainerEvent = + (ContainerPreemptEvent) event; + ApplicationAttemptId aid = preemptContainerEvent.getAppId(); + RMContainer containerToBePreempted = preemptContainerEvent.getContainer(); + markContainerForPreemption(aid, containerToBePreempted); + } + break; + case MARK_CONTAINER_FOR_KILLABLE: { + ContainerPreemptEvent containerKillableEvent = (ContainerPreemptEvent) event; + RMContainer killableContainer = containerKillableEvent.getContainer(); + markContainerForKillable(killableContainer); + } + break; + case MARK_CONTAINER_FOR_NONKILLABLE: { + if (isLazyPreemptionEnabled) { + ContainerPreemptEvent cancelKillContainerEvent = + (ContainerPreemptEvent) event; + markContainerForNonKillable(cancelKillContainerEvent.getContainer()); + } + } + break; + case MANAGE_QUEUE: { + QueueManagementChangeEvent queueManagementChangeEvent = + (QueueManagementChangeEvent) event; + ParentQueue parentQueue = queueManagementChangeEvent.getParentQueue(); + try { + final List queueManagementChanges = + queueManagementChangeEvent.getQueueManagementChanges(); + ((ManagedParentQueue) parentQueue) + .validateAndApplyQueueManagementChanges(queueManagementChanges); + } catch (SchedulerDynamicEditException sde) { + LOG.error("Queue Management Change event cannot be applied for " + + "parent queue : " + parentQueue.getQueueName(), sde); + } catch (IOException ioe) { + LOG.error("Queue Management Change event cannot be applied for " + + "parent queue : " + parentQueue.getQueueName(), ioe); + } + } + break; + default: + LOG.error("Invalid event type " + event.getType() + ". Ignoring!"); + } + } + + /** + * Process node labels update.
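* Each updated node is moved to its new partition, and the root queue's
* view of the cluster resource is refreshed afterwards so per-partition
* capacities stay consistent.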
+ */ + private synchronized void updateNodeLabelsAndQueueResource( + NodeLabelsUpdateSchedulerEvent labelUpdateEvent) { + for (Entry> entry : labelUpdateEvent + .getUpdatedNodeToLabels().entrySet()) { + NodeId id = entry.getKey(); + Set labels = entry.getValue(); + updateLabelsOnNode(id, labels); + } + Resource clusterResource = getClusterResource(); + getRootQueue().updateClusterResource(clusterResource, + new ResourceLimits(clusterResource)); + } + + private synchronized void addNode(RMNode nodeManager) { + FiCaSchedulerNode schedulerNode = new FiCaSchedulerNode(nodeManager, + usePortForNodeName, nodeManager.getNodeLabels()); + nodeTracker.addNode(schedulerNode); + + // update this node to node label manager + if (labelManager != null) { + labelManager.activateNode(nodeManager.getNodeID(), + schedulerNode.getTotalResource()); + } + + Resource clusterResource = getClusterResource(); + getRootQueue().updateClusterResource(clusterResource, + new ResourceLimits(clusterResource)); + + LOG.info( + "Added node " + nodeManager.getNodeAddress() + " clusterResource: " + + clusterResource); + + if (scheduleAsynchronously && getNumClusterNodes() == 1) { + for (AsyncScheduleThread t : asyncSchedulerThreads) { + t.beginSchedule(); + } + } + } + + private synchronized void removeNode(RMNode nodeInfo) { + // update this node to node label manager + if (labelManager != null) { + labelManager.deactivateNode(nodeInfo.getNodeID()); + } + + NodeId nodeId = nodeInfo.getNodeID(); + FiCaSchedulerNode node = nodeTracker.getNode(nodeId); + if (node == null) { + LOG.error("Attempting to remove non-existent node " + nodeId); + return; + } + + // Remove running containers + List runningContainers = + node.getCopiedListOfRunningContainers(); + for (RMContainer container : runningContainers) { + super.completedContainer(container, SchedulerUtils + .createAbnormalContainerStatus(container.getContainerId(), + SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL); + } + + // Remove reservations, if any + RMContainer reservedContainer = node.getReservedContainer(); + if (reservedContainer != null) { + super.completedContainer(reservedContainer, SchedulerUtils + .createAbnormalContainerStatus(reservedContainer.getContainerId(), + SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL); + } + + nodeTracker.removeNode(nodeId); + Resource clusterResource = getClusterResource(); + getRootQueue().updateClusterResource(clusterResource, + new ResourceLimits(clusterResource)); + int numNodes = nodeTracker.nodeCount(); + + if (scheduleAsynchronously && numNodes == 0) { + for (AsyncScheduleThread t : asyncSchedulerThreads) { + t.suspendSchedule(); + } + } + + LOG.info( + "Removed node " + nodeInfo.getNodeAddress() + " clusterResource: " + + getClusterResource()); + } + + @Override + protected void completedContainerInternal( + RMContainer rmContainer, ContainerStatus containerStatus, + RMContainerEventType event) { + Container container = rmContainer.getContainer(); + ContainerId containerId = container.getId(); + + // Get the application for the finished container + FiCaSchedulerApp application = getCurrentAttemptForContainer( + container.getId()); + ApplicationId appId = + containerId.getApplicationAttemptId().getApplicationId(); + if (application == null) { + LOG.info( + "Container " + container + " of" + " finished application " + appId + + " completed with event " + event); + return; + } + + // Get the node on which the container was allocated + FiCaSchedulerNode node = getNode(container.getNodeId()); + if (null == 
node) { + LOG.info("Container " + container + " of" + " removed node " + container + .getNodeId() + " completed with event " + event); + return; + } + + // Inform the queue + LeafQueue queue = (LeafQueue) application.getQueue(); + queue.completedContainer(getClusterResource(), application, node, + rmContainer, containerStatus, event, null, true); + if (ContainerExitStatus.PREEMPTED == containerStatus.getExitStatus()) { + updateQueuePreemptionMetrics(queue, rmContainer); + } + } + + private void updateQueuePreemptionMetrics( + CSQueue queue, RMContainer rmc) { + QueueMetrics qMetrics = queue.getMetrics(); + long usedMillis = rmc.getFinishTime() - rmc.getCreationTime(); + Resource containerResource = rmc.getAllocatedResource(); + qMetrics.preemptContainer(); + long mbSeconds = (containerResource.getMemorySize() * usedMillis) + / DateUtils.MILLIS_PER_SECOND; + long vcSeconds = (containerResource.getVirtualCores() * usedMillis) + / DateUtils.MILLIS_PER_SECOND; + qMetrics.updatePreemptedMemoryMBSeconds(mbSeconds); + qMetrics.updatePreemptedVcoreSeconds(vcSeconds); + } + + @Lock(Lock.NoLock.class) + @VisibleForTesting + @Override + public FiCaSchedulerApp getApplicationAttempt( + ApplicationAttemptId applicationAttemptId) { + return super.getApplicationAttempt(applicationAttemptId); + } + + @Lock(Lock.NoLock.class) + public FiCaSchedulerNode getNode(NodeId nodeId) { + return nodeTracker.getNode(nodeId); + } + + @Lock(Lock.NoLock.class) + public List getAllNodes() { + return nodeTracker.getAllNodes(); + } + + @Override + @Lock(Lock.NoLock.class) + public void recover(RMState state) throws Exception { + // NOT IMPLEMENTED + } + + @Override + public void killReservedContainer(RMContainer container) { + if (LOG.isDebugEnabled()) { + LOG.debug(SchedulerEventType.KILL_RESERVED_CONTAINER + ":" + + container.toString()); + } + // To think: What happens if this is no longer a reserved container, for + // e.g if the reservation became an allocation. + super.completedContainer(container, + SchedulerUtils.createAbnormalContainerStatus( + container.getContainerId(), + SchedulerUtils.UNRESERVED_CONTAINER), + RMContainerEventType.KILL); + } + + @Override + public void markContainerForPreemption(ApplicationAttemptId aid, + RMContainer cont) { + if (LOG.isDebugEnabled()) { + LOG.debug(SchedulerEventType.MARK_CONTAINER_FOR_PREEMPTION + + ": appAttempt:" + aid.toString() + " container: " + + cont.toString()); + } + FiCaSchedulerApp app = getApplicationAttempt(aid); + if (app != null) { + app.markContainerForPreemption(cont.getContainerId()); + } } + @VisibleForTesting @Override - public void run() { - int debuggingLogCounter = 0; - while (!Thread.currentThread().isInterrupted()) { - try { - if (!runSchedules.get()) { - Thread.sleep(100); - } else { - // Don't run schedule if we have some pending backlogs already - if (cs.getAsyncSchedulingPendingBacklogs() - > cs.asyncMaxPendingBacklogs) { - Thread.sleep(1); - } else{ - schedule(cs); - if(LOG.isDebugEnabled()) { - // Adding a debug log here to ensure that the thread is alive - // and running fine. 
- if (debuggingLogCounter++ > 10000) { - debuggingLogCounter = 0; - LOG.debug("AsyncScheduleThread[" + getName() + "] is running!"); + public void killContainer(RMContainer container) { + markContainerForKillable(container); + } + + public synchronized void markContainerForKillable( + RMContainer killableContainer) { + if (LOG.isDebugEnabled()) { + LOG.debug(SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE + ": container" + + killableContainer.toString()); + } + + if (!isLazyPreemptionEnabled) { + super.completedContainer(killableContainer, SchedulerUtils + .createPreemptedContainerStatus(killableContainer.getContainerId(), + SchedulerUtils.PREEMPTED_CONTAINER), RMContainerEventType.KILL); + } else { + FiCaSchedulerNode node = (FiCaSchedulerNode) getSchedulerNode( + killableContainer.getAllocatedNode()); + + FiCaSchedulerApp application = getCurrentAttemptForContainer( + killableContainer.getContainerId()); + + node.markContainerToKillable(killableContainer.getContainerId()); + + // notify PreemptionManager + // Get the application for the finished container + if (null != application) { + String leafQueueName = application.getCSLeafQueue().getQueueName(); + getPreemptionManager().addKillableContainer( + new KillableContainer(killableContainer, node.getPartition(), + leafQueueName)); } - } } - } - } catch (InterruptedException ie) { - // keep interrupt signal - Thread.currentThread().interrupt(); + } + + private synchronized void markContainerForNonKillable( + RMContainer nonKillableContainer) { + if (LOG.isDebugEnabled()) { + LOG.debug( + SchedulerEventType.MARK_CONTAINER_FOR_NONKILLABLE + ": container" + + nonKillableContainer.toString()); + } + + FiCaSchedulerNode node = (FiCaSchedulerNode) getSchedulerNode( + nonKillableContainer.getAllocatedNode()); + + FiCaSchedulerApp application = getCurrentAttemptForContainer( + nonKillableContainer.getContainerId()); + + node.markContainerToNonKillable(nonKillableContainer.getContainerId()); + + // notify PreemptionManager + // Get the application for the finished container + if (null != application) { + String leafQueueName = application.getCSLeafQueue().getQueueName(); + getPreemptionManager().removeKillableContainer( + new KillableContainer(nonKillableContainer, node.getPartition(), + leafQueueName)); + } + } + + @Override + public boolean checkAccess(UserGroupInformation callerUGI, + QueueACL acl, String queueName) { + CSQueue queue = getQueue(queueName); + if (queue == null) { + if (LOG.isDebugEnabled()) { + LOG.debug("ACL not found for queue access-type " + acl + " for queue " + + queueName); + } + return false; + } + return queue.hasAccess(acl, callerUGI); + } + + @Override + public List getAppsInQueue(String queueName) { + CSQueue queue = getQueue(queueName); + if (queue == null) { + return null; + } + List apps = new ArrayList(); + queue.collectSchedulerApplications(apps); + return apps; + } + + public boolean isSystemAppsLimitReached() { + if (getRootQueue().getNumApplications() < conf + .getMaximumSystemApplications()) { + return false; } - } - LOG.info("AsyncScheduleThread[" + getName() + "] exited!"); + return true; + } + + private String getDefaultReservationQueueName(String planQueueName) { + return planQueueName + ReservationConstants.DEFAULT_QUEUE_SUFFIX; } - public void beginSchedule() { - runSchedules.set(true); + private synchronized String resolveReservationQueueName(String queueName, + ApplicationId applicationId, ReservationId reservationID, + boolean isRecovering) { + CSQueue queue = getQueue(queueName); + // Check if the 
queue is a plan queue + if ((queue == null) || !(queue instanceof PlanQueue)) { + return queueName; + } + if (reservationID != null) { + String resQName = reservationID.toString(); + queue = getQueue(resQName); + if (queue == null) { + // reservation has terminated during failover + if (isRecovering && conf.getMoveOnExpiry( + getQueue(queueName).getQueuePath())) { + // move to the default child queue of the plan + return getDefaultReservationQueueName(queueName); + } + String message = "Application " + applicationId + + " submitted to a reservation which is not currently active: " + + resQName; + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, + message)); + return null; + } + if (!queue.getParent().getQueueName().equals(queueName)) { + String message = + "Application: " + applicationId + " submitted to a reservation " + + resQName + " which does not belong to the specified queue: " + + queueName; + this.rmContext.getDispatcher().getEventHandler().handle( + new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, + message)); + return null; + } + // use the reservation queue to run the app + queueName = resQName; + } else { + // use the default child queue of the plan for unreserved apps + queueName = getDefaultReservationQueueName(queueName); + } + return queueName; } - public void suspendSchedule() { - runSchedules.set(false); + @Override + public synchronized void removeQueue(String queueName) + throws SchedulerDynamicEditException { + LOG.info("Removing queue: " + queueName); + CSQueue q = this.getQueue(queueName); + if (!(AbstractAutoCreatedLeafQueue.class.isAssignableFrom( + q.getClass()))) { + throw new SchedulerDynamicEditException( + "The queue that we are asked " + "to remove (" + queueName + + ") is not a AutoCreatedLeafQueue or ReservationQueue"); + } + AbstractAutoCreatedLeafQueue disposableLeafQueue = + (AbstractAutoCreatedLeafQueue) q; + // at this point we should have no more apps + if (disposableLeafQueue.getNumApplications() > 0) { + throw new SchedulerDynamicEditException( + "The queue " + queueName + " is not empty " + disposableLeafQueue + .getApplications().size() + " active apps " + + disposableLeafQueue.getPendingApplications().size() + + " pending apps"); + } + + ((AbstractManagedParentQueue) disposableLeafQueue.getParent()) + .removeChildQueue(q); + this.queueManager.removeQueue(queueName); + LOG.info( + "Removal of AutoCreatedLeafQueue " + queueName + " has succeeded"); } - } + @Override + public synchronized void addQueue(Queue queue) + throws SchedulerDynamicEditException, IOException { + if (queue == null) { + throw new SchedulerDynamicEditException( + "Queue specified is null. Should be an implementation of " + + "AbstractAutoCreatedLeafQueue"); + } else if (!(AbstractAutoCreatedLeafQueue.class + .isAssignableFrom(queue.getClass()))) { + throw new SchedulerDynamicEditException( + "Queue is not an implementation of " + + "AbstractAutoCreatedLeafQueue : " + queue.getClass()); + } + + AbstractAutoCreatedLeafQueue newQueue = + (AbstractAutoCreatedLeafQueue) queue; + + if (newQueue.getParent() == null || !(AbstractManagedParentQueue.class. 
+ isAssignableFrom(newQueue.getParent().getClass()))) { + throw new SchedulerDynamicEditException( + "ParentQueue for " + newQueue + " is not properly set" + + " (should be set and be a PlanQueue or ManagedParentQueue)"); + } - static class ResourceCommitterService extends Thread { - private final CapacityScheduler cs; - private BlockingQueue> - backlogs = new LinkedBlockingQueue<>(); + AbstractManagedParentQueue parent = + (AbstractManagedParentQueue) newQueue.getParent(); + String queuename = newQueue.getQueueName(); + parent.addChildQueue(newQueue); + this.queueManager.addQueue(queuename, newQueue); - public ResourceCommitterService(CapacityScheduler cs) { - this.cs = cs; - setDaemon(true); + LOG.info("Creation of AutoCreatedLeafQueue " + newQueue + " succeeded"); } @Override - public void run() { - while (!Thread.currentThread().isInterrupted()) { - try { - ResourceCommitRequest request = - backlogs.take(); - - try { - cs.writeLock.lock(); - cs.tryCommit(cs.getClusterResource(), request, true); - } finally { - cs.writeLock.unlock(); - } - - } catch (InterruptedException e) { - LOG.error(e); - Thread.currentThread().interrupt(); - } - } - LOG.info("ResourceCommitterService exited!"); - } - - public void addNewCommitRequest( - ResourceCommitRequest proposal) { - backlogs.add(proposal); - } - - public int getPendingBacklogs() { - return backlogs.size(); - } - } - - @VisibleForTesting - public PlacementRule getUserGroupMappingPlacementRule() throws IOException { - try { - readLock.lock(); - UserGroupMappingPlacementRule ugRule = new UserGroupMappingPlacementRule(); - ugRule.initialize(this); - return ugRule; - } finally { - readLock.unlock(); - } - } - - public PlacementRule getAppNameMappingPlacementRule() throws IOException { - try { - readLock.lock(); - AppNameMappingPlacementRule anRule = new AppNameMappingPlacementRule(); - anRule.initialize(this); - return anRule; - } finally { - readLock.unlock(); - } - } - - @VisibleForTesting - public void updatePlacementRules() throws IOException { - // Initialize placement rules - Collection placementRuleStrs = conf.getStringCollection( - YarnConfiguration.QUEUE_PLACEMENT_RULES); - List placementRules = new ArrayList<>(); - Set distingushRuleSet = new HashSet<>(); - // fail the case if we get duplicate placementRule add in - for (String pls : placementRuleStrs) { - if (!distingushRuleSet.add(pls)) { - throw new IOException("Invalid PlacementRule inputs which " - + "contains duplicate rule strings"); - } - } - - // add UserGroupMappingPlacementRule if absent - distingushRuleSet.add(YarnConfiguration.USER_GROUP_PLACEMENT_RULE); - - placementRuleStrs = new ArrayList<>(distingushRuleSet); - - for (String placementRuleStr : placementRuleStrs) { - switch (placementRuleStr) { - case YarnConfiguration.USER_GROUP_PLACEMENT_RULE: - PlacementRule ugRule = getUserGroupMappingPlacementRule(); - if (null != ugRule) { - placementRules.add(ugRule); - } - break; - case YarnConfiguration.APP_NAME_PLACEMENT_RULE: - PlacementRule anRule = getAppNameMappingPlacementRule(); - if (null != anRule) { - placementRules.add(anRule); - } - break; - default: - boolean isMappingNotEmpty; - try { - PlacementRule rule = PlacementFactory.getPlacementRule( - placementRuleStr, conf); - if (null != rule) { + public synchronized void setEntitlement(String inQueue, QueueEntitlement entitlement) + throws YarnException { + LeafQueue queue = this.queueManager.getAndCheckLeafQueue(inQueue); + AbstractManagedParentQueue parent = + (AbstractManagedParentQueue) queue.getParent(); + + if 
(!(AbstractAutoCreatedLeafQueue.class.isAssignableFrom( + queue.getClass()))) { + throw new SchedulerDynamicEditException( + "Entitlement can not be" + " modified dynamically since queue " + + inQueue + " is not a AutoCreatedLeafQueue"); + } + + if (parent == null || !(AbstractManagedParentQueue.class.isAssignableFrom( + parent.getClass()))) { + throw new SchedulerDynamicEditException( + "The parent of AutoCreatedLeafQueue " + inQueue + + " must be a PlanQueue/ManagedParentQueue"); + } + + AbstractAutoCreatedLeafQueue newQueue = + (AbstractAutoCreatedLeafQueue) queue; + parent.validateQueueEntitlementChange(newQueue, entitlement); + + newQueue.setEntitlement(entitlement); + + LOG.info("Set entitlement for AutoCreatedLeafQueue " + inQueue + " to " + + queue.getCapacity() + " request was (" + entitlement.getCapacity() + + ")"); + } + + @Override + public synchronized String moveApplication(ApplicationId appId, + String targetQueueName) throws YarnException { + SchedulerApplication application = + applications.get(appId); + if (application == null) { + throw new YarnException("App to be moved " + appId + " not found."); + } + String sourceQueueName = application.getQueue().getQueueName(); + LeafQueue source = + this.queueManager.getAndCheckLeafQueue(sourceQueueName); + String destQueueName = handleMoveToPlanQueue(targetQueueName); + LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName); + + String user = application.getUser(); try { - isMappingNotEmpty = rule.initialize(this); - } catch (IOException ie) { - throw new IOException(ie); - } - if (isMappingNotEmpty) { - placementRules.add(rule); - } - } - } catch (ClassNotFoundException cnfe) { - throw new IOException(cnfe); - } - } - } - - rmContext.getQueuePlacementManager().updateRules(placementRules); - } - - @Lock(CapacityScheduler.class) - private void initializeQueues(CapacitySchedulerConfiguration conf) - throws IOException { - - this.queueManager.initializeQueues(conf); - - updatePlacementRules(); - - // Notify Preemption Manager - preemptionManager.refreshQueues(null, this.getRootQueue()); - } - - @Lock(CapacityScheduler.class) - private void reinitializeQueues(CapacitySchedulerConfiguration newConf) - throws IOException { - this.queueManager.reinitializeQueues(newConf); - updatePlacementRules(); - - // Notify Preemption Manager - preemptionManager.refreshQueues(null, this.getRootQueue()); - } - - @Override - public CSQueue getQueue(String queueName) { - if (queueName == null) { - return null; - } - return this.queueManager.getQueue(queueName); - } - - private void addApplicationOnRecovery(ApplicationId applicationId, - String queueName, String user, - Priority priority, ApplicationPlacementContext placementContext) { - try { - writeLock.lock(); - //check if the queue needs to be auto-created during recovery - CSQueue queue = getOrCreateQueueFromPlacementContext(applicationId, user, - queueName, placementContext, true); - - if (queue == null) { - //During a restart, this indicates a queue was removed, which is - //not presently supported - if (!getConfiguration().shouldAppFailFast(getConfig())) { - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.KILL, - "Application killed on recovery as it" - + " was submitted to queue " + queueName - + " which no longer exists after restart.")); - return; - } else{ - String queueErrorMsg = "Queue named " + queueName + " missing " - + "during application recovery." 
- + " Queue removal during recovery is not presently " - + "supported by the capacity scheduler, please " - + "restart with all queues configured" - + " which were present before shutdown/restart."; - LOG.fatal(queueErrorMsg); - throw new QueueInvalidException(queueErrorMsg); - } - } - if (!(queue instanceof LeafQueue)) { - // During RM restart, this means leaf queue was converted to a parent - // queue, which is not supported for running apps. - if (!getConfiguration().shouldAppFailFast(getConfig())) { - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.KILL, - "Application killed on recovery as it was " - + "submitted to queue " + queueName - + " which is no longer a leaf queue after restart.")); - return; - } else{ - String queueErrorMsg = "Queue named " + queueName - + " is no longer a leaf queue during application recovery." - + " Changing a leaf queue to a parent queue during recovery is" - + " not presently supported by the capacity scheduler. Please" - + " restart with leaf queues before shutdown/restart continuing" - + " as leaf queues."; - LOG.fatal(queueErrorMsg); - throw new QueueInvalidException(queueErrorMsg); - } - } - // When recovering apps in this queue but queue is in STOPPED state, - // that means its previous state was DRAINING. So we auto transit - // the state to DRAINING for recovery. - if (queue.getState() == QueueState.STOPPED) { - ((LeafQueue) queue).recoverDrainingState(); - } - // Submit to the queue - try { - queue.submitApplication(applicationId, user, queueName); - } catch (AccessControlException ace) { - // Ignore the exception for recovered app as the app was previously - // accepted. - } - queue.getMetrics().submitApp(user); - SchedulerApplication application = - new SchedulerApplication(queue, user, priority); - applications.put(applicationId, application); - LOG.info("Accepted application " + applicationId + " from user: " + user - + ", in queue: " + queueName); - if (LOG.isDebugEnabled()) { - LOG.debug( - applicationId + " is recovering. 
Skip notifying APP_ACCEPTED"); - } - } finally { - writeLock.unlock(); - } - } - - private CSQueue getOrCreateQueueFromPlacementContext(ApplicationId - applicationId, String user, String queueName, - ApplicationPlacementContext placementContext, - boolean isRecovery) { - - CSQueue queue = getQueue(queueName); - - if (queue == null) { - if (placementContext != null && placementContext.hasParentQueue()) { - try { - return autoCreateLeafQueue(placementContext); - } catch (YarnException | IOException e) { - if (isRecovery) { - if (!getConfiguration().shouldAppFailFast(getConfig())) { - LOG.error("Could not auto-create leaf queue " + queueName + - " due to : ", e); - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.KILL, - "Application killed on recovery" - + " as it was submitted to queue " + queueName - + " which could not be auto-created")); - } else{ - String queueErrorMsg = - "Queue named " + queueName + " could not be " - + "auto-created during application recovery."; - LOG.fatal(queueErrorMsg, e); - throw new QueueInvalidException(queueErrorMsg); - } - } else{ - LOG.error("Could not auto-create leaf queue due to : ", e); - final String message = - "Application " + applicationId + " submission by user : " - + user - + " to queue : " + queueName + " failed : " + e - .getMessage(); - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, - message)); - } - } - } - } - return queue; - } - - private void addApplication(ApplicationId applicationId, String queueName, - String user, Priority priority, - ApplicationPlacementContext placementContext) { - try { - writeLock.lock(); - if (isSystemAppsLimitReached()) { - String message = "Maximum system application limit reached," - + "cannot accept submission of application: " + applicationId; - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, - message)); - return; - } - - //Could be a potential auto-created leaf queue - CSQueue queue = getOrCreateQueueFromPlacementContext(applicationId, user, - queueName, placementContext, false); - - if (queue == null) { - final String message = - "Application " + applicationId + " submitted by user " + user - + " to unknown queue: " + queueName; - - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, - message)); - return; - } - - if (!(queue instanceof LeafQueue)) { - String message = - "Application " + applicationId + " submitted by user : " + user - + " to non-leaf queue : " + queueName; - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, - message)); - return; - } else if (queue instanceof AutoCreatedLeafQueue && queue - .getParent() instanceof ManagedParentQueue) { - - //If queue already exists and auto-queue creation was not required, - //placement context should not be null - if (placementContext == null) { - String message = - "Application " + applicationId + " submission by user : " + user - + " to specified queue : " + queueName + " is prohibited. 
" - + "Verify automatic queue mapping for user exists in " + - QUEUE_MAPPING; - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, - message)); - return; - // For a queue which exists already and - // not auto-created above, then its parent queue should match - // the parent queue specified in queue mapping - } else if (!queue.getParent().getQueueName().equals( - placementContext.getParentQueue())) { - String message = - "Auto created Leaf queue " + placementContext.getQueue() + " " - + "already exists under queue : " + queue - .getParent().getQueuePath() - + ".But Queue mapping configuration " + - CapacitySchedulerConfiguration.QUEUE_MAPPING + " has been " - + "updated to a different parent queue : " - + placementContext.getParentQueue() - + " for the specified user : " + user; - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, - message)); - return; - } - } - - // Submit to the queue - try { - queue.submitApplication(applicationId, user, queueName); - } catch (AccessControlException ace) { - LOG.info("Failed to submit application " + applicationId + " to queue " - + queueName + " from user " + user, ace); - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, - ace.toString())); - return; - } - // update the metrics - queue.getMetrics().submitApp(user); - SchedulerApplication application = - new SchedulerApplication(queue, user, priority); - applications.put(applicationId, application); - LOG.info("Accepted application " + applicationId + " from user: " + user - + ", in queue: " + queueName); - rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_ACCEPTED)); - } finally { - writeLock.unlock(); - } - } - - private void addApplicationAttempt( - ApplicationAttemptId applicationAttemptId, - boolean transferStateFromPreviousAttempt, - boolean isAttemptRecovering) { - try { - writeLock.lock(); - SchedulerApplication application = applications.get( - applicationAttemptId.getApplicationId()); - if (application == null) { - LOG.warn("Application " + applicationAttemptId.getApplicationId() - + " cannot be found in scheduler."); - return; - } - CSQueue queue = (CSQueue) application.getQueue(); - - FiCaSchedulerApp attempt = new FiCaSchedulerApp(applicationAttemptId, - application.getUser(), queue, queue.getAbstractUsersManager(), - rmContext, application.getPriority(), isAttemptRecovering, - activitiesManager); - if (transferStateFromPreviousAttempt) { - attempt.transferStateFromPreviousAttempt( - application.getCurrentAppAttempt()); - } - application.setCurrentAppAttempt(attempt); - - // Update attempt priority to the latest to avoid race condition i.e - // SchedulerApplicationAttempt is created with old priority but it is not - // set to SchedulerApplication#setCurrentAppAttempt. - // Scenario would occur is - // 1. SchdulerApplicationAttempt is created with old priority. - // 2. updateApplicationPriority() updates SchedulerApplication. Since - // currentAttempt is null, it just return. - // 3. ScheduelerApplcationAttempt is set in - // SchedulerApplication#setCurrentAppAttempt. 
- attempt.setPriority(application.getPriority()); - - queue.submitApplicationAttempt(attempt, application.getUser()); - LOG.info("Added Application Attempt " + applicationAttemptId - + " to scheduler from user " + application.getUser() + " in queue " - + queue.getQueueName()); - if (isAttemptRecovering) { - if (LOG.isDebugEnabled()) { - LOG.debug(applicationAttemptId - + " is recovering. Skipping notifying ATTEMPT_ADDED"); - } - } else{ - rmContext.getDispatcher().getEventHandler().handle( - new RMAppAttemptEvent(applicationAttemptId, - RMAppAttemptEventType.ATTEMPT_ADDED)); - } - } finally { - writeLock.unlock(); - } - } - - private void doneApplication(ApplicationId applicationId, - RMAppState finalState) { - try { - writeLock.lock(); - SchedulerApplication application = applications.get( - applicationId); - if (application == null) { - // The AppRemovedSchedulerEvent maybe sent on recovery for completed - // apps, ignore it. - LOG.warn("Couldn't find application " + applicationId); - return; - } - CSQueue queue = (CSQueue) application.getQueue(); - if (!(queue instanceof LeafQueue)) { - LOG.error("Cannot finish application " + "from non-leaf queue: " + queue - .getQueueName()); - } else{ - queue.finishApplication(applicationId, application.getUser()); - } - application.stop(finalState); - applications.remove(applicationId); - } finally { - writeLock.unlock(); - } - } - - private void doneApplicationAttempt( - ApplicationAttemptId applicationAttemptId, - RMAppAttemptState rmAppAttemptFinalState, boolean keepContainers) { - try { - writeLock.lock(); - LOG.info("Application Attempt " + applicationAttemptId + " is done." - + " finalState=" + rmAppAttemptFinalState); - - FiCaSchedulerApp attempt = getApplicationAttempt(applicationAttemptId); - SchedulerApplication application = applications.get( - applicationAttemptId.getApplicationId()); - - if (application == null || attempt == null) { - LOG.info( - "Unknown application " + applicationAttemptId + " has completed!"); - return; - } - - // Release all the allocated, acquired, running containers - for (RMContainer rmContainer : attempt.getLiveContainers()) { - if (keepContainers && rmContainer.getState().equals( - RMContainerState.RUNNING)) { - // do not kill the running container in the case of work-preserving AM - // restart. - LOG.info("Skip killing " + rmContainer.getContainerId()); - continue; - } - super.completedContainer(rmContainer, SchedulerUtils - .createAbnormalContainerStatus(rmContainer.getContainerId(), - SchedulerUtils.COMPLETED_APPLICATION), - RMContainerEventType.KILL); - } - - // Release all reserved containers - for (RMContainer rmContainer : attempt.getReservedContainers()) { - super.completedContainer(rmContainer, SchedulerUtils - .createAbnormalContainerStatus(rmContainer.getContainerId(), - "Application Complete"), RMContainerEventType.KILL); - } - - // Clean up pending requests, metrics etc. - attempt.stop(rmAppAttemptFinalState); - - // Inform the queue - String queueName = attempt.getQueue().getQueueName(); - CSQueue queue = this.getQueue(queueName); - if (!(queue instanceof LeafQueue)) { - LOG.error( - "Cannot finish application " + "from non-leaf queue: " + queueName); - } else{ - queue.finishApplicationAttempt(attempt, queue.getQueueName()); - } - } finally { - writeLock.unlock(); - } - } - - /** - * Normalize a list of SchedulingRequest. 
- * - * @param asks scheduling request - */ - private void normalizeSchedulingRequests(List asks) { - if (asks == null) { - return; - } - for (SchedulingRequest ask: asks) { - ResourceSizing sizing = ask.getResourceSizing(); - if (sizing != null && sizing.getResources() != null) { - sizing.setResources(getNormalizedResource(sizing.getResources())); - } - } - } - - @Override - @Lock(Lock.NoLock.class) - public Allocation allocate(ApplicationAttemptId applicationAttemptId, - List ask, List schedulingRequests, - List release, List blacklistAdditions, - List blacklistRemovals, ContainerUpdates updateRequests) { - FiCaSchedulerApp application = getApplicationAttempt(applicationAttemptId); - if (application == null) { - LOG.error("Calling allocate on removed or non existent application " + - applicationAttemptId.getApplicationId()); - return EMPTY_ALLOCATION; - } - - // The allocate may be the leftover from previous attempt, and it will - // impact current attempt, such as confuse the request and allocation for - // current attempt's AM container. - // Note outside precondition check for the attempt id may be - // outdated here, so double check it here is necessary. - if (!application.getApplicationAttemptId().equals(applicationAttemptId)) { - LOG.error("Calling allocate on previous or removed " + - "or non existent application attempt " + applicationAttemptId); - return EMPTY_ALLOCATION; - } - - // Handle all container updates - handleContainerUpdates(application, updateRequests); - - // Release containers - releaseContainers(release, application); - - LeafQueue updateDemandForQueue = null; - - // Sanity check for new allocation requests - normalizeResourceRequests(ask); - - // Normalize scheduling requests - normalizeSchedulingRequests(schedulingRequests); - - Allocation allocation; - - // make sure we aren't stopping/removing the application - // when the allocate comes in - try { - application.getWriteLock().lock(); - if (application.isStopped()) { - return EMPTY_ALLOCATION; - } - - // Process resource requests - if (!ask.isEmpty() || (schedulingRequests != null && !schedulingRequests - .isEmpty())) { - if (LOG.isDebugEnabled()) { - LOG.debug( - "allocate: pre-update " + applicationAttemptId + " ask size =" - + ask.size()); - application.showRequests(); + dest.submitApplication(appId, user, destQueueName); + } catch (AccessControlException e) { + throw new YarnException(e); + } + + FiCaSchedulerApp app = application.getCurrentAppAttempt(); + if (app != null) { + // Move all live containers even when stopped. + // For transferStateFromPreviousAttempt required + for (RMContainer rmContainer : app.getLiveContainers()) { + source.detachContainer(getClusterResource(), app, rmContainer); + // attach the Container to another queue + dest.attachContainer(getClusterResource(), app, rmContainer); + } + if (!app.isStopped()) { + source.finishApplicationAttempt(app, sourceQueueName); + // Submit to a new queue + dest.submitApplicationAttempt(app, user); + } + // Finish app & update metrics + app.move(dest); + } + source.appFinished(); + // Detach the application.. 
+ source.getParent().finishApplication(appId, user); + application.setQueue(dest); + LOG.info("App: " + appId + " successfully moved from " + sourceQueueName + + " to: " + destQueueName); + return targetQueueName; + } + + @Override + public synchronized void preValidateMoveApplication(ApplicationId appId, + String newQueue) throws YarnException { + SchedulerApplication application = + applications.get(appId); + if (application == null) { + throw new YarnException("App to be moved " + appId + " not found."); + } + String sourceQueueName = application.getQueue().getQueueName(); + this.queueManager.getAndCheckLeafQueue(sourceQueueName); + String destQueueName = handleMoveToPlanQueue(newQueue); + LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName); + // Validation check - ACLs, submission limits for user & queue + String user = application.getUser(); + // Check active partition only when attempt is available + FiCaSchedulerApp appAttempt = + getApplicationAttempt(ApplicationAttemptId.newInstance(appId, 0)); + if (null != appAttempt) { + checkQueuePartition(appAttempt, dest); + } + try { + dest.validateSubmitApplication(appId, user, destQueueName); + } catch (AccessControlException e) { + throw new YarnException(e); + } + } + + /** + * Check application can be moved to queue with labels enabled. All labels in + * application life time will be checked + * + * @param app + * @param dest + * @throws YarnException + */ + private void checkQueuePartition(FiCaSchedulerApp app, LeafQueue dest) + throws YarnException { + if (!YarnConfiguration.areNodeLabelsEnabled(conf)) { + return; + } + Set targetqueuelabels = dest.getAccessibleNodeLabels(); + AppSchedulingInfo schedulingInfo = app.getAppSchedulingInfo(); + Set appLabelexpressions = schedulingInfo.getRequestedPartitions(); + // default partition access always available remove empty label + appLabelexpressions.remove(RMNodeLabelsManager.NO_LABEL); + Set nonAccessiblelabels = new HashSet(); + for (String label : appLabelexpressions) { + if (!SchedulerUtils.checkQueueLabelExpression(targetqueuelabels, label, + null)) { + nonAccessiblelabels.add(label); + } + } + if (nonAccessiblelabels.size() > 0) { + throw new YarnException( + "Specified queue=" + dest.getQueueName() + " can't satisfy following " + + "apps label expressions =" + nonAccessiblelabels + + " accessible node labels =" + targetqueuelabels); + } + } + + /** {@inheritDoc} */ + @Override + public EnumSet getSchedulingResourceTypes() { + if (calculator.getClass().getName() + .equals(DefaultResourceCalculator.class.getName())) { + return EnumSet.of(SchedulerResourceTypes.MEMORY); } + return EnumSet.of(SchedulerResourceTypes.MEMORY, SchedulerResourceTypes.CPU); + } - // Update application requests - if (application.updateResourceRequests(ask) || application - .updateSchedulingRequests(schedulingRequests)) { - updateDemandForQueue = (LeafQueue) application.getQueue(); + @Override + public Resource getMaximumResourceCapability(String queueName) { + CSQueue queue = getQueue(queueName); + if (queue == null) { + LOG.error("Unknown queue: " + queueName); + return getMaximumResourceCapability(); + } + if (!(queue instanceof LeafQueue)) { + LOG.error("queue " + queueName + " is not an leaf queue"); + return getMaximumResourceCapability(); } - if (LOG.isDebugEnabled()) { - LOG.debug("allocate: post-update"); - application.showRequests(); - } - } - - application.updateBlacklist(blacklistAdditions, blacklistRemovals); - - allocation = application.getAllocation(getResourceCalculator(), - 
getClusterResource(), getMinimumResourceCapability()); - } finally { - application.getWriteLock().unlock(); - } - - if (updateDemandForQueue != null && !application - .isWaitingForAMContainer()) { - updateDemandForQueue.getOrderingPolicy().demandUpdated(application); - } - - if (LOG.isDebugEnabled()) { - LOG.info("Allocation for application " + applicationAttemptId + " : " - + allocation + " with cluster resource : " + getClusterResource()); - } - return allocation; - } - - @Override - @Lock(Lock.NoLock.class) - public QueueInfo getQueueInfo(String queueName, - boolean includeChildQueues, boolean recursive) - throws IOException { - CSQueue queue = null; - queue = this.getQueue(queueName); - if (queue == null) { - throw new IOException("Unknown queue: " + queueName); - } - return queue.getQueueInfo(includeChildQueues, recursive); - } - - @Override - @Lock(Lock.NoLock.class) - public List getQueueUserAclInfo() { - UserGroupInformation user = null; - try { - user = UserGroupInformation.getCurrentUser(); - } catch (IOException ioe) { - // should never happen - return new ArrayList(); - } - - return getRootQueue().getQueueUserAclInfo(user); - } - - @Override - protected void nodeUpdate(RMNode rmNode) { - long begin = System.nanoTime(); - try { - readLock.lock(); - setLastNodeUpdateTime(Time.now()); - super.nodeUpdate(rmNode); - } finally { - readLock.unlock(); - } - - // Try to do scheduling - if (!scheduleAsynchronously) { - try { - writeLock.lock(); - ActivitiesLogger.NODE.startNodeUpdateRecording(activitiesManager, - rmNode.getNodeID()); - - // reset allocation and reservation stats before we start doing any - // work - updateSchedulerHealth(lastNodeUpdateTime, rmNode.getNodeID(), - CSAssignment.NULL_ASSIGNMENT); - - allocateContainersToNode(rmNode.getNodeID(), true); - ActivitiesLogger.NODE.finishNodeUpdateRecording(activitiesManager, - rmNode.getNodeID()); - } finally { - writeLock.unlock(); - } - } - - long latency = System.nanoTime() - begin; - CapacitySchedulerMetrics.getMetrics().addNodeUpdate(latency); - } - - /** - * Process resource update on a node. - */ - private void updateNodeAndQueueResource(RMNode nm, - ResourceOption resourceOption) { - try { - writeLock.lock(); - updateNodeResource(nm, resourceOption); - Resource clusterResource = getClusterResource(); - getRootQueue().updateClusterResource(clusterResource, - new ResourceLimits(clusterResource)); - } finally { - writeLock.unlock(); - } - } - - /** - * Process node labels update on a node. - */ - private void updateLabelsOnNode(NodeId nodeId, - Set newLabels) { - FiCaSchedulerNode node = nodeTracker.getNode(nodeId); - if (null == node) { - return; - } - - // Get new partition, we have only one partition per node - String newPartition; - if (newLabels.isEmpty()) { - newPartition = RMNodeLabelsManager.NO_LABEL; - } else{ - newPartition = newLabels.iterator().next(); - } - - // old partition as well - String oldPartition = node.getPartition(); - - // Update resources of these containers - for (RMContainer rmContainer : node.getCopiedListOfRunningContainers()) { - FiCaSchedulerApp application = getApplicationAttempt( - rmContainer.getApplicationAttemptId()); - if (null != application) { - application.nodePartitionUpdated(rmContainer, oldPartition, - newPartition); - } else{ - LOG.warn("There's something wrong, some RMContainers running on" - + " a node, but we cannot find SchedulerApplicationAttempt " - + "for it. 
Node=" + node.getNodeID() + " applicationAttemptId=" - + rmContainer.getApplicationAttemptId()); - continue; - } - } - - // Unreserve container on this node - RMContainer reservedContainer = node.getReservedContainer(); - if (null != reservedContainer) { - killReservedContainer(reservedContainer); - } - - // Update node labels after we've done this - node.updateLabels(newLabels); - } - - private void updateSchedulerHealth(long now, NodeId nodeId, - CSAssignment assignment) { - List allocations = - assignment.getAssignmentInformation().getAllocationDetails(); - List reservations = - assignment.getAssignmentInformation().getReservationDetails(); - if (!allocations.isEmpty()) { - ContainerId allocatedContainerId = - allocations.get(allocations.size() - 1).containerId; - String allocatedQueue = allocations.get(allocations.size() - 1).queue; - schedulerHealth.updateAllocation(now, nodeId, allocatedContainerId, - allocatedQueue); - } - if (!reservations.isEmpty()) { - ContainerId reservedContainerId = - reservations.get(reservations.size() - 1).containerId; - String reservedQueue = reservations.get(reservations.size() - 1).queue; - schedulerHealth.updateReservation(now, nodeId, reservedContainerId, - reservedQueue); - } - schedulerHealth.updateSchedulerReservationCounts(assignment - .getAssignmentInformation().getNumReservations()); - schedulerHealth.updateSchedulerAllocationCounts(assignment - .getAssignmentInformation().getNumAllocations()); - schedulerHealth.updateSchedulerRunDetails(now, assignment - .getAssignmentInformation().getAllocated(), assignment - .getAssignmentInformation().getReserved()); - } - - private boolean canAllocateMore(CSAssignment assignment, int offswitchCount, - int assignedContainers) { - // Current assignment shouldn't be empty - if (assignment == null - || Resources.equals(assignment.getResource(), Resources.none())) { - return false; - } - - // offswitch assignment should be under threshold - if (offswitchCount >= offswitchPerHeartbeatLimit) { - return false; - } - - // And it should not be a reserved container - if (assignment.getAssignmentInformation().getNumReservations() > 0) { - return false; - } - - // assignMultipleEnabled should be ON, - // and assignedContainers should be under threshold - return assignMultipleEnabled - && (maxAssignPerHeartbeat == -1 - || assignedContainers < maxAssignPerHeartbeat); - } - - /** - * We need to make sure when doing allocation, Node should be existed - * And we will construct a {@link CandidateNodeSet} before proceeding - */ - private void allocateContainersToNode(NodeId nodeId, - boolean withNodeHeartbeat) { - FiCaSchedulerNode node = getNode(nodeId); - if (null != node) { - int offswitchCount = 0; - int assignedContainers = 0; - - CandidateNodeSet candidates = - new SimpleCandidateNodeSet<>(node); - CSAssignment assignment = allocateContainersToNode(candidates, - withNodeHeartbeat); - // Only check if we can allocate more container on the same node when - // scheduling is triggered by node heartbeat - if (null != assignment && withNodeHeartbeat) { - if (assignment.getType() == NodeType.OFF_SWITCH) { - offswitchCount++; + // queue.getMaxAllocation returns *configured* maximum allocation. + // getMaximumResourceCapability() returns maximum allocation considers + // per-node maximum resources. So return (component-wise) min of the two. 
+
+ Resource queueMaxAllocation = ((LeafQueue) queue).getMaximumAllocation();
+ Resource clusterMaxAllocationConsiderNodeMax =
+ getMaximumResourceCapability();
+
+ return Resources.componentwiseMin(queueMaxAllocation,
+ clusterMaxAllocationConsiderNodeMax);
+ }
+
+ private String handleMoveToPlanQueue(String targetQueueName) {
+ CSQueue dest = getQueue(targetQueueName);
+ if (dest != null && dest instanceof PlanQueue) {
+ // use the default child reservation queue of the plan
+ targetQueueName = targetQueueName + ReservationConstants.DEFAULT_QUEUE_SUFFIX;
+ }
+ return targetQueueName;
+ }
- if (Resources.greaterThan(calculator, getClusterResource(),
- assignment.getResource(), Resources.none())) {
- assignedContainers++;
+ @Override
+ public Set getPlanQueues() {
+ Set ret = new HashSet();
+ for (Map.Entry l : queueManager.getQueues().entrySet()) {
+ if (l.getValue() instanceof PlanQueue) {
+ ret.add(l.getKey());
+ }
+ }
+ return ret;
+ }
- while (canAllocateMore(assignment, offswitchCount,
- assignedContainers)) {
- // Try to see if it is possible to allocate multiple container for
- // the same node heartbeat
- assignment = allocateContainersToNode(candidates, true);
+ @Override
+ public Priority checkAndGetApplicationPriority(
+ Priority priorityRequestedByApp, UserGroupInformation user,
+ String queueName, ApplicationId applicationId) throws YarnException {
+ Priority appPriority = priorityRequestedByApp;
- if (null != assignment
- && assignment.getType() == NodeType.OFF_SWITCH) {
- offswitchCount++;
- }
+ // Verify the scenario where priority is null from submissionContext.
+ if (null == appPriority) {
+ // Verify whether submitted user has any default priority set. If so,
+ // user's default priority will get precedence over queue default.
+ // for updateApplicationPriority call flow, this check is done in
+ // ClientRMService itself.
+ appPriority = this.appPriorityACLManager.getDefaultPriority(queueName,
+ user);
+
+ // Get the default priority for the Queue. If the Queue is non-existent,
+ // then use the default priority. Do it only if the user has no default.
+ if (null == appPriority) {
+ appPriority = this.queueManager.getDefaultPriorityForQueue(queueName);
+ }
- if (null != assignment
- && Resources.greaterThan(calculator, getClusterResource(),
- assignment.getResource(), Resources.none())) {
- assignedContainers++;
- }
+ LOG.info(
+ "Application '" + applicationId + "' is submitted without priority "
+ + "hence considering default queue/cluster priority: "
+ + appPriority.getPriority());
+ }
- if (offswitchCount >= offswitchPerHeartbeatLimit) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Assigned maximum number of off-switch containers: "
- + offswitchCount + ", assignments so far: " + assignment);
- }
- }
- }
- }
- }
-
- /*
- * Logics of allocate container on a single node (Old behavior)
- */
- private CSAssignment allocateContainerOnSingleNode(
- CandidateNodeSet candidates, FiCaSchedulerNode node,
- boolean withNodeHeartbeat) {
- if (LOG.isDebugEnabled()) {
- LOG.debug(
- "Trying to schedule on node: " + node.getNodeName() + ", available: "
- + node.getUnallocatedResource());
- }
-
- // Backward compatible way to make sure previous behavior which allocation
- // driven by node heartbeat works.
- if (getNode(node.getNodeID()) != node) {
- LOG.error("Trying to schedule on a removed node, please double check, "
- + "nodeId=" + node.getNodeID());
- return null;
- }
-
- CSAssignment assignment;
-
- // Assign new containers...
- // 1. Check for reserved applications
- // 2.
Schedule if there are no reservations - RMContainer reservedContainer = node.getReservedContainer(); - if (reservedContainer != null) { - FiCaSchedulerApp reservedApplication = getCurrentAttemptForContainer( - reservedContainer.getContainerId()); - if (reservedApplication == null) { - LOG.error( - "Trying to schedule for a finished app, please double check. nodeId=" - + node.getNodeID() + " container=" + reservedContainer - .getContainerId()); - return null; - } - - // Try to fulfill the reservation - if (LOG.isDebugEnabled()) { - LOG.debug("Trying to fulfill reservation for application " - + reservedApplication.getApplicationId() + " on node: " + node - .getNodeID()); - } - - LeafQueue queue = ((LeafQueue) reservedApplication.getQueue()); - assignment = queue.assignContainers(getClusterResource(), candidates, - // TODO, now we only consider limits for parent for non-labeled - // resources, should consider labeled resources as well. - new ResourceLimits(labelManager - .getResourceByLabel(RMNodeLabelsManager.NO_LABEL, - getClusterResource())), - SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); - - if (assignment.isFulfilledReservation()) { - if (withNodeHeartbeat) { - // Only update SchedulerHealth in sync scheduling, existing - // Data structure of SchedulerHealth need to be updated for - // Async mode - updateSchedulerHealth(lastNodeUpdateTime, node.getNodeID(), - assignment); - } - - schedulerHealth.updateSchedulerFulfilledReservationCounts(1); - - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - queue.getParent().getQueueName(), queue.getQueueName(), - ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY); - ActivitiesLogger.NODE.finishAllocatedNodeAllocation(activitiesManager, - node, reservedContainer.getContainerId(), - AllocationState.ALLOCATED_FROM_RESERVED); - } else{ - ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node, - queue.getParent().getQueueName(), queue.getQueueName(), - ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY); - ActivitiesLogger.NODE.finishAllocatedNodeAllocation(activitiesManager, - node, reservedContainer.getContainerId(), AllocationState.SKIPPED); - } - - assignment.setSchedulingMode( - SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); - submitResourceCommitRequest(getClusterResource(), assignment); - } - - // Do not schedule if there are any reservations to fulfill on the node - if (node.getReservedContainer() != null) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipping scheduling since node " + node.getNodeID() - + " is reserved by application " + node.getReservedContainer() - .getContainerId().getApplicationAttemptId()); - } - return null; - } - - // First check if we can schedule - // When this time look at one node only, try schedule if the node - // has any available or killable resource - if (calculator.computeAvailableContainers(Resources - .add(node.getUnallocatedResource(), node.getTotalKillableResources()), - minimumAllocation) <= 0) { - if (LOG.isDebugEnabled()) { - LOG.debug("This node or this node partition doesn't have available or" - + "killable resource"); - } - return null; - } - - return allocateOrReserveNewContainers(candidates, withNodeHeartbeat); - } - - private CSAssignment allocateOrReserveNewContainers( - CandidateNodeSet candidates, - boolean withNodeHeartbeat) { - CSAssignment assignment = getRootQueue().assignContainers( - getClusterResource(), candidates, new ResourceLimits(labelManager - .getResourceByLabel(candidates.getPartition(), - getClusterResource())), - 
SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); - - assignment.setSchedulingMode(SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY); - submitResourceCommitRequest(getClusterResource(), assignment); - - if (Resources.greaterThan(calculator, getClusterResource(), - assignment.getResource(), Resources.none())) { - if (withNodeHeartbeat) { - updateSchedulerHealth(lastNodeUpdateTime, - CandidateNodeSetUtils.getSingleNode(candidates).getNodeID(), - assignment); - } - return assignment; - } - - // Only do non-exclusive allocation when node has node-labels. - if (StringUtils.equals(candidates.getPartition(), - RMNodeLabelsManager.NO_LABEL)) { - return null; - } - - // Only do non-exclusive allocation when the node-label supports that - try { - if (rmContext.getNodeLabelManager().isExclusiveNodeLabel( - candidates.getPartition())) { - return null; - } - } catch (IOException e) { - LOG.warn( - "Exception when trying to get exclusivity of node label=" + candidates - .getPartition(), e); - return null; - } - - // Try to use NON_EXCLUSIVE - assignment = getRootQueue().assignContainers(getClusterResource(), - candidates, - // TODO, now we only consider limits for parent for non-labeled - // resources, should consider labeled resources as well. - new ResourceLimits(labelManager - .getResourceByLabel(RMNodeLabelsManager.NO_LABEL, - getClusterResource())), - SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY); - assignment.setSchedulingMode(SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY); - submitResourceCommitRequest(getClusterResource(), assignment); - - return assignment; - } - - /* - * New behavior, allocate containers considering multiple nodes - */ - private CSAssignment allocateContainersOnMultiNodes( - CandidateNodeSet candidates) { - // When this time look at multiple nodes, try schedule if the - // partition has any available resource or killable resource - if (getRootQueue().getQueueCapacities().getUsedCapacity( - candidates.getPartition()) >= 1.0f - && preemptionManager.getKillableResource( - CapacitySchedulerConfiguration.ROOT, candidates.getPartition()) - == Resources.none()) { - if (LOG.isDebugEnabled()) { - LOG.debug("This node or this node partition doesn't have available or" - + "killable resource"); - } - return null; - } - - return allocateOrReserveNewContainers(candidates, false); - } - - @VisibleForTesting - CSAssignment allocateContainersToNode( - CandidateNodeSet candidates, - boolean withNodeHeartbeat) { - if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext - .isSchedulerReadyForAllocatingContainers()) { - return null; - } - - long startTime = System.nanoTime(); - - // Backward compatible way to make sure previous behavior which allocation - // driven by node heartbeat works. - FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates); - - // We have two different logics to handle allocation on single node / multi - // nodes. 
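The allocateOrReserveNewContainers hunk above is a two-pass scheme: try the partition while respecting exclusivity, and only for labelled, non-exclusive partitions retry while ignoring it. A minimal model of that fallback, assuming a caller-supplied assignment function (all names hypothetical; in YARN the default partition is the empty label):

import java.util.function.BiFunction;

final class PartitionFallbackSketch {
  enum Mode { RESPECT_PARTITION_EXCLUSIVITY, IGNORE_PARTITION_EXCLUSIVITY }

  // Returns the amount of resource assigned (0 if nothing could be placed).
  static long assign(String partition, boolean exclusiveLabel,
      BiFunction<String, Mode, Long> tryAssign) {
    // Pass 1: only requests that name this partition may use it.
    long assigned = tryAssign.apply(partition,
        Mode.RESPECT_PARTITION_EXCLUSIVITY);
    if (assigned > 0) {
      return assigned;
    }
    // Pass 2 only applies to labelled, non-exclusive partitions: their idle
    // capacity may then serve default-partition requests.
    if (partition.isEmpty() || exclusiveLabel) {
      return 0;
    }
    return tryAssign.apply(partition, Mode.IGNORE_PARTITION_EXCLUSIVITY);
  }
}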
- CSAssignment assignment; - if (null != node) { - assignment = allocateContainerOnSingleNode(candidates, - node, withNodeHeartbeat); - } else{ - assignment = allocateContainersOnMultiNodes(candidates); - } - - if (assignment != null && assignment.getAssignmentInformation() != null - && assignment.getAssignmentInformation().getNumAllocations() > 0) { - long allocateTime = System.nanoTime() - startTime; - CapacitySchedulerMetrics.getMetrics().addAllocate(allocateTime); - } - return assignment; - } - - @Override - public void handle(SchedulerEvent event) { - switch(event.getType()) { - case NODE_ADDED: - { - NodeAddedSchedulerEvent nodeAddedEvent = (NodeAddedSchedulerEvent)event; - addNode(nodeAddedEvent.getAddedRMNode()); - recoverContainersOnNode(nodeAddedEvent.getContainerReports(), - nodeAddedEvent.getAddedRMNode()); - } - break; - case NODE_REMOVED: - { - NodeRemovedSchedulerEvent nodeRemovedEvent = (NodeRemovedSchedulerEvent)event; - removeNode(nodeRemovedEvent.getRemovedRMNode()); - } - break; - case NODE_RESOURCE_UPDATE: - { - NodeResourceUpdateSchedulerEvent nodeResourceUpdatedEvent = - (NodeResourceUpdateSchedulerEvent)event; - updateNodeAndQueueResource(nodeResourceUpdatedEvent.getRMNode(), - nodeResourceUpdatedEvent.getResourceOption()); - } - break; - case NODE_LABELS_UPDATE: - { - NodeLabelsUpdateSchedulerEvent labelUpdateEvent = - (NodeLabelsUpdateSchedulerEvent) event; - - updateNodeLabelsAndQueueResource(labelUpdateEvent); - } - break; - case NODE_UPDATE: - { - NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event; - nodeUpdate(nodeUpdatedEvent.getRMNode()); - } - break; - case APP_ADDED: - { - AppAddedSchedulerEvent appAddedEvent = (AppAddedSchedulerEvent) event; - String queueName = resolveReservationQueueName(appAddedEvent.getQueue(), - appAddedEvent.getApplicationId(), appAddedEvent.getReservationID(), - appAddedEvent.getIsAppRecovering()); - if (queueName != null) { - if (!appAddedEvent.getIsAppRecovering()) { - addApplication(appAddedEvent.getApplicationId(), queueName, - appAddedEvent.getUser(), appAddedEvent.getApplicatonPriority(), - appAddedEvent.getPlacementContext()); - } else { - addApplicationOnRecovery(appAddedEvent.getApplicationId(), queueName, - appAddedEvent.getUser(), appAddedEvent.getApplicatonPriority(), - appAddedEvent.getPlacementContext()); - } - } - } - break; - case APP_REMOVED: - { - AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent)event; - doneApplication(appRemovedEvent.getApplicationID(), - appRemovedEvent.getFinalState()); - } - break; - case APP_ATTEMPT_ADDED: - { - AppAttemptAddedSchedulerEvent appAttemptAddedEvent = - (AppAttemptAddedSchedulerEvent) event; - addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId(), - appAttemptAddedEvent.getTransferStateFromPreviousAttempt(), - appAttemptAddedEvent.getIsAttemptRecovering()); - } - break; - case APP_ATTEMPT_REMOVED: - { - AppAttemptRemovedSchedulerEvent appAttemptRemovedEvent = - (AppAttemptRemovedSchedulerEvent) event; - doneApplicationAttempt(appAttemptRemovedEvent.getApplicationAttemptID(), - appAttemptRemovedEvent.getFinalAttemptState(), - appAttemptRemovedEvent.getKeepContainersAcrossAppAttempts()); - } - break; - case CONTAINER_EXPIRED: - { - ContainerExpiredSchedulerEvent containerExpiredEvent = - (ContainerExpiredSchedulerEvent) event; - ContainerId containerId = containerExpiredEvent.getContainerId(); - if (containerExpiredEvent.isIncrease()) { - rollbackContainerUpdate(containerId); - } else { - 
completedContainer(getRMContainer(containerId), - SchedulerUtils.createAbnormalContainerStatus( - containerId, - SchedulerUtils.EXPIRED_CONTAINER), - RMContainerEventType.EXPIRE); - } - } - break; - case RELEASE_CONTAINER: - { - RMContainer container = ((ReleaseContainerEvent) event).getContainer(); - completedContainer(container, - SchedulerUtils.createAbnormalContainerStatus( - container.getContainerId(), - SchedulerUtils.RELEASED_CONTAINER), - RMContainerEventType.RELEASED); - } - break; - case KILL_RESERVED_CONTAINER: - { - ContainerPreemptEvent killReservedContainerEvent = - (ContainerPreemptEvent) event; - RMContainer container = killReservedContainerEvent.getContainer(); - killReservedContainer(container); - } - break; - case MARK_CONTAINER_FOR_PREEMPTION: - { - ContainerPreemptEvent preemptContainerEvent = - (ContainerPreemptEvent)event; - ApplicationAttemptId aid = preemptContainerEvent.getAppId(); - RMContainer containerToBePreempted = preemptContainerEvent.getContainer(); - markContainerForPreemption(aid, containerToBePreempted); - } - break; - case MARK_CONTAINER_FOR_KILLABLE: - { - ContainerPreemptEvent containerKillableEvent = (ContainerPreemptEvent)event; - RMContainer killableContainer = containerKillableEvent.getContainer(); - markContainerForKillable(killableContainer); - } - break; - case MARK_CONTAINER_FOR_NONKILLABLE: - { - if (isLazyPreemptionEnabled) { - ContainerPreemptEvent cancelKillContainerEvent = - (ContainerPreemptEvent) event; - markContainerForNonKillable(cancelKillContainerEvent.getContainer()); - } - } - break; - case MANAGE_QUEUE: - { - QueueManagementChangeEvent queueManagementChangeEvent = - (QueueManagementChangeEvent) event; - ParentQueue parentQueue = queueManagementChangeEvent.getParentQueue(); - try { - final List queueManagementChanges = - queueManagementChangeEvent.getQueueManagementChanges(); - ((ManagedParentQueue) parentQueue) - .validateAndApplyQueueManagementChanges(queueManagementChanges); - } catch (SchedulerDynamicEditException sde) { - LOG.error("Queue Management Change event cannot be applied for " - + "parent queue : " + parentQueue.getQueueName(), sde); - } catch (IOException ioe) { - LOG.error("Queue Management Change event cannot be applied for " - + "parent queue : " + parentQueue.getQueueName(), ioe); - } - } - break; - default: - LOG.error("Invalid eventtype " + event.getType() + ". Ignoring!"); - } - } - - /** - * Process node labels update. 
- */ - private void updateNodeLabelsAndQueueResource( - NodeLabelsUpdateSchedulerEvent labelUpdateEvent) { - try { - writeLock.lock(); - for (Entry> entry : labelUpdateEvent - .getUpdatedNodeToLabels().entrySet()) { - NodeId id = entry.getKey(); - Set labels = entry.getValue(); - updateLabelsOnNode(id, labels); - } - Resource clusterResource = getClusterResource(); - getRootQueue().updateClusterResource(clusterResource, - new ResourceLimits(clusterResource)); - } finally { - writeLock.unlock(); - } - } - - private void addNode(RMNode nodeManager) { - try { - writeLock.lock(); - FiCaSchedulerNode schedulerNode = new FiCaSchedulerNode(nodeManager, - usePortForNodeName, nodeManager.getNodeLabels()); - nodeTracker.addNode(schedulerNode); - - // update this node to node label manager - if (labelManager != null) { - labelManager.activateNode(nodeManager.getNodeID(), - schedulerNode.getTotalResource()); - } - - Resource clusterResource = getClusterResource(); - getRootQueue().updateClusterResource(clusterResource, - new ResourceLimits(clusterResource)); - - LOG.info( - "Added node " + nodeManager.getNodeAddress() + " clusterResource: " - + clusterResource); - - if (scheduleAsynchronously && getNumClusterNodes() == 1) { - for (AsyncScheduleThread t : asyncSchedulerThreads) { - t.beginSchedule(); - } - } - } finally { - writeLock.unlock(); - } - } - - private void removeNode(RMNode nodeInfo) { - try { - writeLock.lock(); - // update this node to node label manager - if (labelManager != null) { - labelManager.deactivateNode(nodeInfo.getNodeID()); - } - - NodeId nodeId = nodeInfo.getNodeID(); - FiCaSchedulerNode node = nodeTracker.getNode(nodeId); - if (node == null) { - LOG.error("Attempting to remove non-existent node " + nodeId); - return; - } - - // Remove running containers - List runningContainers = - node.getCopiedListOfRunningContainers(); - for (RMContainer container : runningContainers) { - super.completedContainer(container, SchedulerUtils - .createAbnormalContainerStatus(container.getContainerId(), - SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL); - } - - // Remove reservations, if any - RMContainer reservedContainer = node.getReservedContainer(); - if (reservedContainer != null) { - super.completedContainer(reservedContainer, SchedulerUtils - .createAbnormalContainerStatus(reservedContainer.getContainerId(), - SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL); - } - - nodeTracker.removeNode(nodeId); - Resource clusterResource = getClusterResource(); - getRootQueue().updateClusterResource(clusterResource, - new ResourceLimits(clusterResource)); - int numNodes = nodeTracker.nodeCount(); - - if (scheduleAsynchronously && numNodes == 0) { - for (AsyncScheduleThread t : asyncSchedulerThreads) { - t.suspendSchedule(); - } - } - - LOG.info( - "Removed node " + nodeInfo.getNodeAddress() + " clusterResource: " - + getClusterResource()); - } finally { - writeLock.unlock(); - } - } - - @Override - protected void completedContainerInternal( - RMContainer rmContainer, ContainerStatus containerStatus, - RMContainerEventType event) { - Container container = rmContainer.getContainer(); - ContainerId containerId = container.getId(); - - // Get the application for the finished container - FiCaSchedulerApp application = getCurrentAttemptForContainer( - container.getId()); - ApplicationId appId = - containerId.getApplicationAttemptId().getApplicationId(); - if (application == null) { - LOG.info( - "Container " + container + " of" + " finished application " + appId - + " completed with 
event " + event); - return; - } - - // Get the node on which the container was allocated - FiCaSchedulerNode node = getNode(container.getNodeId()); - if (null == node) { - LOG.info("Container " + container + " of" + " removed node " + container - .getNodeId() + " completed with event " + event); - return; - } - - // Inform the queue - LeafQueue queue = (LeafQueue) application.getQueue(); - queue.completedContainer(getClusterResource(), application, node, - rmContainer, containerStatus, event, null, true); - if (ContainerExitStatus.PREEMPTED == containerStatus.getExitStatus()) { - updateQueuePreemptionMetrics(queue, rmContainer); - } - } - - private void updateQueuePreemptionMetrics( - CSQueue queue, RMContainer rmc) { - QueueMetrics qMetrics = queue.getMetrics(); - long usedMillis = rmc.getFinishTime() - rmc.getCreationTime(); - Resource containerResource = rmc.getAllocatedResource(); - qMetrics.preemptContainer(); - long mbSeconds = (containerResource.getMemorySize() * usedMillis) - / DateUtils.MILLIS_PER_SECOND; - long vcSeconds = (containerResource.getVirtualCores() * usedMillis) - / DateUtils.MILLIS_PER_SECOND; - qMetrics.updatePreemptedMemoryMBSeconds(mbSeconds); - qMetrics.updatePreemptedVcoreSeconds(vcSeconds); - } - - @Lock(Lock.NoLock.class) - @VisibleForTesting - @Override - public FiCaSchedulerApp getApplicationAttempt( - ApplicationAttemptId applicationAttemptId) { - return super.getApplicationAttempt(applicationAttemptId); - } - - @Lock(Lock.NoLock.class) - public FiCaSchedulerNode getNode(NodeId nodeId) { - return nodeTracker.getNode(nodeId); - } - - @Lock(Lock.NoLock.class) - public List getAllNodes() { - return nodeTracker.getAllNodes(); - } - - @Override - @Lock(Lock.NoLock.class) - public void recover(RMState state) throws Exception { - // NOT IMPLEMENTED - } - - @Override - public void killReservedContainer(RMContainer container) { - if(LOG.isDebugEnabled()){ - LOG.debug(SchedulerEventType.KILL_RESERVED_CONTAINER + ":" - + container.toString()); - } - // To think: What happens if this is no longer a reserved container, for - // e.g if the reservation became an allocation. - super.completedContainer(container, - SchedulerUtils.createAbnormalContainerStatus( - container.getContainerId(), - SchedulerUtils.UNRESERVED_CONTAINER), - RMContainerEventType.KILL); - } - - @Override - public void markContainerForPreemption(ApplicationAttemptId aid, - RMContainer cont) { - if(LOG.isDebugEnabled()){ - LOG.debug(SchedulerEventType.MARK_CONTAINER_FOR_PREEMPTION - + ": appAttempt:" + aid.toString() + " container: " - + cont.toString()); - } - FiCaSchedulerApp app = getApplicationAttempt(aid); - if (app != null) { - app.markContainerForPreemption(cont.getContainerId()); - } - } - - @VisibleForTesting - @Override - public void killContainer(RMContainer container) { - markContainerForKillable(container); - } - - public void markContainerForKillable( - RMContainer killableContainer) { - try { - writeLock.lock(); - if (LOG.isDebugEnabled()) { - LOG.debug(SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE + ": container" - + killableContainer.toString()); - } - - if (!isLazyPreemptionEnabled) { - super.completedContainer(killableContainer, SchedulerUtils - .createPreemptedContainerStatus(killableContainer.getContainerId(), - SchedulerUtils.PREEMPTED_CONTAINER), RMContainerEventType.KILL); - } else{ - FiCaSchedulerNode node = (FiCaSchedulerNode) getSchedulerNode( - killableContainer.getAllocatedNode()); + // Verify whether submitted priority is lesser than max priority + // in the cluster. 
If it is out of bounds, define a max cap.
+ if (appPriority.getPriority() > getMaxClusterLevelAppPriority()
+ .getPriority()) {
+ appPriority = Priority
+ .newInstance(getMaxClusterLevelAppPriority().getPriority());
+ }
- FiCaSchedulerApp application = getCurrentAttemptForContainer(
- killableContainer.getContainerId());
+ // Let's check for ACLs here.
+ if (!appPriorityACLManager.checkAccess(user, queueName, appPriority)) {
+ throw new YarnException(new AccessControlException(
+ "User " + user + " does not have permission to submit/update "
+ + applicationId + " for " + appPriority));
+ }
- node.markContainerToKillable(killableContainer.getContainerId());
+ LOG.info("Priority '" + appPriority.getPriority()
+ + "' is acceptable in queue : " + queueName + " for application: "
+ + applicationId);
- // notify PreemptionManager
- // Get the application for the finished container
- if (null != application) {
- String leafQueueName = application.getCSLeafQueue().getQueueName();
- getPreemptionManager().addKillableContainer(
- new KillableContainer(killableContainer, node.getPartition(),
- leafQueueName));
- }
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- private void markContainerForNonKillable(
- RMContainer nonKillableContainer) {
- try {
- writeLock.lock();
- if (LOG.isDebugEnabled()) {
- LOG.debug(
- SchedulerEventType.MARK_CONTAINER_FOR_NONKILLABLE + ": container"
- + nonKillableContainer.toString());
- }
-
- FiCaSchedulerNode node = (FiCaSchedulerNode) getSchedulerNode(
- nonKillableContainer.getAllocatedNode());
-
- FiCaSchedulerApp application = getCurrentAttemptForContainer(
- nonKillableContainer.getContainerId());
-
- node.markContainerToNonKillable(nonKillableContainer.getContainerId());
-
- // notify PreemptionManager
- // Get the application for the finished container
- if (null != application) {
- String leafQueueName = application.getCSLeafQueue().getQueueName();
- getPreemptionManager().removeKillableContainer(
- new KillableContainer(nonKillableContainer, node.getPartition(),
- leafQueueName));
- }
- } finally {
- writeLock.unlock();
- }
- }
-
- @Override
- public boolean checkAccess(UserGroupInformation callerUGI,
- QueueACL acl, String queueName) {
- CSQueue queue = getQueue(queueName);
- if (queue == null) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("ACL not found for queue access-type " + acl + " for queue "
- + queueName);
- }
- return false;
- }
- return queue.hasAccess(acl, callerUGI);
- }
-
- @Override
- public List getAppsInQueue(String queueName) {
- CSQueue queue = getQueue(queueName);
- if (queue == null) {
- return null;
- }
- List apps = new ArrayList();
- queue.collectSchedulerApplications(apps);
- return apps;
- }
-
- public boolean isSystemAppsLimitReached() {
- if (getRootQueue().getNumApplications() < conf
- .getMaximumSystemApplications()) {
- return false;
- }
- return true;
- }
-
- private String getDefaultReservationQueueName(String planQueueName) {
- return planQueueName + ReservationConstants.DEFAULT_QUEUE_SUFFIX;
- }
-
- private String resolveReservationQueueName(String queueName,
- ApplicationId applicationId, ReservationId reservationID,
- boolean isRecovering) {
- try {
- readLock.lock();
- CSQueue queue = getQueue(queueName);
- // Check if the queue is a plan queue
- if ((queue == null) || !(queue instanceof PlanQueue)) {
- return queueName;
- }
- if (reservationID != null) {
- String resQName = reservationID.toString();
- queue = getQueue(resQName);
- if (queue == null) {
- // reservation has terminated during failover
- if (isRecovering &&
conf.getMoveOnExpiry( - getQueue(queueName).getQueuePath())) { - // move to the default child queue of the plan - return getDefaultReservationQueueName(queueName); - } - String message = "Application " + applicationId - + " submitted to a reservation which is not currently active: " - + resQName; - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, - message)); - return null; - } - if (!queue.getParent().getQueueName().equals(queueName)) { - String message = - "Application: " + applicationId + " submitted to a reservation " - + resQName + " which does not belong to the specified queue: " - + queueName; - this.rmContext.getDispatcher().getEventHandler().handle( - new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, - message)); - return null; - } - // use the reservation queue to run the app - queueName = resQName; - } else{ - // use the default child queue of the plan for unreserved apps - queueName = getDefaultReservationQueueName(queueName); - } - return queueName; - } finally { - readLock.unlock(); - } - - } - - @Override - public void removeQueue(String queueName) - throws SchedulerDynamicEditException { - try { - writeLock.lock(); - LOG.info("Removing queue: " + queueName); - CSQueue q = this.getQueue(queueName); - if (!(AbstractAutoCreatedLeafQueue.class.isAssignableFrom( - q.getClass()))) { - throw new SchedulerDynamicEditException( - "The queue that we are asked " + "to remove (" + queueName - + ") is not a AutoCreatedLeafQueue or ReservationQueue"); - } - AbstractAutoCreatedLeafQueue disposableLeafQueue = - (AbstractAutoCreatedLeafQueue) q; - // at this point we should have no more apps - if (disposableLeafQueue.getNumApplications() > 0) { - throw new SchedulerDynamicEditException( - "The queue " + queueName + " is not empty " + disposableLeafQueue - .getApplications().size() + " active apps " - + disposableLeafQueue.getPendingApplications().size() - + " pending apps"); - } - - ((AbstractManagedParentQueue) disposableLeafQueue.getParent()) - .removeChildQueue(q); - this.queueManager.removeQueue(queueName); - LOG.info( - "Removal of AutoCreatedLeafQueue " + queueName + " has succeeded"); - } finally { - writeLock.unlock(); - } - } - - @Override - public void addQueue(Queue queue) - throws SchedulerDynamicEditException, IOException { - try { - writeLock.lock(); - if (queue == null) { - throw new SchedulerDynamicEditException( - "Queue specified is null. Should be an implementation of " - + "AbstractAutoCreatedLeafQueue"); - } else if (!(AbstractAutoCreatedLeafQueue.class - .isAssignableFrom(queue.getClass()))) { - throw new SchedulerDynamicEditException( - "Queue is not an implementation of " - + "AbstractAutoCreatedLeafQueue : " + queue.getClass()); - } - - AbstractAutoCreatedLeafQueue newQueue = - (AbstractAutoCreatedLeafQueue) queue; - - if (newQueue.getParent() == null || !(AbstractManagedParentQueue.class. 
- isAssignableFrom(newQueue.getParent().getClass()))) { - throw new SchedulerDynamicEditException( - "ParentQueue for " + newQueue + " is not properly set" - + " (should be set and be a PlanQueue or ManagedParentQueue)"); - } - - AbstractManagedParentQueue parent = - (AbstractManagedParentQueue) newQueue.getParent(); - String queuename = newQueue.getQueueName(); - parent.addChildQueue(newQueue); - this.queueManager.addQueue(queuename, newQueue); - - LOG.info("Creation of AutoCreatedLeafQueue " + newQueue + " succeeded"); - } finally { - writeLock.unlock(); - } - } - - @Override - public void setEntitlement(String inQueue, QueueEntitlement entitlement) - throws YarnException { - try { - writeLock.lock(); - LeafQueue queue = this.queueManager.getAndCheckLeafQueue(inQueue); - AbstractManagedParentQueue parent = - (AbstractManagedParentQueue) queue.getParent(); - - if (!(AbstractAutoCreatedLeafQueue.class.isAssignableFrom( - queue.getClass()))) { - throw new SchedulerDynamicEditException( - "Entitlement can not be" + " modified dynamically since queue " - + inQueue + " is not a AutoCreatedLeafQueue"); - } - - if (parent == null || !(AbstractManagedParentQueue.class.isAssignableFrom( - parent.getClass()))) { - throw new SchedulerDynamicEditException( - "The parent of AutoCreatedLeafQueue " + inQueue - + " must be a PlanQueue/ManagedParentQueue"); - } - - AbstractAutoCreatedLeafQueue newQueue = - (AbstractAutoCreatedLeafQueue) queue; - parent.validateQueueEntitlementChange(newQueue, entitlement); - - newQueue.setEntitlement(entitlement); - - LOG.info("Set entitlement for AutoCreatedLeafQueue " + inQueue + " to " - + queue.getCapacity() + " request was (" + entitlement.getCapacity() - + ")"); - } finally { - writeLock.unlock(); - } - } - - @Override - public String moveApplication(ApplicationId appId, - String targetQueueName) throws YarnException { - try { - writeLock.lock(); - SchedulerApplication application = - applications.get(appId); - if (application == null) { - throw new YarnException("App to be moved " + appId + " not found."); - } - String sourceQueueName = application.getQueue().getQueueName(); - LeafQueue source = - this.queueManager.getAndCheckLeafQueue(sourceQueueName); - String destQueueName = handleMoveToPlanQueue(targetQueueName); - LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName); - - String user = application.getUser(); - try { - dest.submitApplication(appId, user, destQueueName); - } catch (AccessControlException e) { - throw new YarnException(e); - } - - FiCaSchedulerApp app = application.getCurrentAppAttempt(); - if (app != null) { - // Move all live containers even when stopped. - // For transferStateFromPreviousAttempt required - for (RMContainer rmContainer : app.getLiveContainers()) { - source.detachContainer(getClusterResource(), app, rmContainer); - // attach the Container to another queue - dest.attachContainer(getClusterResource(), app, rmContainer); - } - if (!app.isStopped()) { - source.finishApplicationAttempt(app, sourceQueueName); - // Submit to a new queue - dest.submitApplicationAttempt(app, user); - } - // Finish app & update metrics - app.move(dest); - } - source.appFinished(); - // Detach the application.. 
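The loop above moves an application by detaching each live container from the source queue and attaching it to the destination, so nothing running is disturbed while usage accounting changes hands. A compact sketch of that accounting, with hypothetical types:

import java.util.List;

final class QueueMoveSketch {
  static final class Queue {
    final String name;
    long usedMb;
    Queue(String name) { this.name = name; }
  }

  // Move an app's live containers: usage leaves the source queue and is
  // charged to the destination, but no container is stopped.
  static void move(Queue source, Queue dest, List<Long> liveContainerMb) {
    for (long mb : liveContainerMb) {
      source.usedMb -= mb; // detachContainer
      dest.usedMb += mb;   // attachContainer
    }
  }
}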
- source.getParent().finishApplication(appId, user); - application.setQueue(dest); - LOG.info("App: " + appId + " successfully moved from " + sourceQueueName - + " to: " + destQueueName); - return targetQueueName; - } finally { - writeLock.unlock(); - } - } - - @Override - public void preValidateMoveApplication(ApplicationId appId, - String newQueue) throws YarnException { - try { - writeLock.lock(); - SchedulerApplication application = - applications.get(appId); - if (application == null) { - throw new YarnException("App to be moved " + appId + " not found."); - } - String sourceQueueName = application.getQueue().getQueueName(); - this.queueManager.getAndCheckLeafQueue(sourceQueueName); - String destQueueName = handleMoveToPlanQueue(newQueue); - LeafQueue dest = this.queueManager.getAndCheckLeafQueue(destQueueName); - // Validation check - ACLs, submission limits for user & queue - String user = application.getUser(); - // Check active partition only when attempt is available - FiCaSchedulerApp appAttempt = - getApplicationAttempt(ApplicationAttemptId.newInstance(appId, 0)); - if (null != appAttempt) { - checkQueuePartition(appAttempt, dest); - } - try { - dest.validateSubmitApplication(appId, user, destQueueName); - } catch (AccessControlException e) { - throw new YarnException(e); - } - } finally { - writeLock.unlock(); - } - } - - /** - * Check application can be moved to queue with labels enabled. All labels in - * application life time will be checked - * - * @param app - * @param dest - * @throws YarnException - */ - private void checkQueuePartition(FiCaSchedulerApp app, LeafQueue dest) - throws YarnException { - if (!YarnConfiguration.areNodeLabelsEnabled(conf)) { - return; - } - Set targetqueuelabels = dest.getAccessibleNodeLabels(); - AppSchedulingInfo schedulingInfo = app.getAppSchedulingInfo(); - Set appLabelexpressions = schedulingInfo.getRequestedPartitions(); - // default partition access always available remove empty label - appLabelexpressions.remove(RMNodeLabelsManager.NO_LABEL); - Set nonAccessiblelabels = new HashSet(); - for (String label : appLabelexpressions) { - if (!SchedulerUtils.checkQueueLabelExpression(targetqueuelabels, label, - null)) { - nonAccessiblelabels.add(label); - } - } - if (nonAccessiblelabels.size() > 0) { - throw new YarnException( - "Specified queue=" + dest.getQueueName() + " can't satisfy following " - + "apps label expressions =" + nonAccessiblelabels - + " accessible node labels =" + targetqueuelabels); - } - } - - /** {@inheritDoc} */ - @Override - public EnumSet getSchedulingResourceTypes() { - if (calculator.getClass().getName() - .equals(DefaultResourceCalculator.class.getName())) { - return EnumSet.of(SchedulerResourceTypes.MEMORY); - } - return EnumSet.of(SchedulerResourceTypes.MEMORY, SchedulerResourceTypes.CPU); - } - - @Override - public Resource getMaximumResourceCapability(String queueName) { - CSQueue queue = getQueue(queueName); - if (queue == null) { - LOG.error("Unknown queue: " + queueName); - return getMaximumResourceCapability(); - } - if (!(queue instanceof LeafQueue)) { - LOG.error("queue " + queueName + " is not an leaf queue"); - return getMaximumResourceCapability(); - } - - // queue.getMaxAllocation returns *configured* maximum allocation. - // getMaximumResourceCapability() returns maximum allocation considers - // per-node maximum resources. So return (component-wise) min of the two. 
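One step back: the rejection condition in checkQueuePartition above boils down to a set difference between the labels the app has requested and the labels the target queue can access. A sketch (wildcard/ANY-label handling omitted; names hypothetical):

import java.util.HashSet;
import java.util.Set;

final class LabelAccessSketch {
  // Labels the app has requested that the target queue cannot reach.
  // A non-empty result means the move must be rejected.
  static Set<String> inaccessible(Set<String> requestedPartitions,
      Set<String> queueAccessibleLabels) {
    Set<String> bad = new HashSet<>(requestedPartitions);
    bad.remove("");                      // default partition always allowed
    bad.removeAll(queueAccessibleLabels);
    return bad;
  }
}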
- - Resource queueMaxAllocation = ((LeafQueue)queue).getMaximumAllocation(); - Resource clusterMaxAllocationConsiderNodeMax = - getMaximumResourceCapability(); - - return Resources.componentwiseMin(queueMaxAllocation, - clusterMaxAllocationConsiderNodeMax); - } - - private String handleMoveToPlanQueue(String targetQueueName) { - CSQueue dest = getQueue(targetQueueName); - if (dest != null && dest instanceof PlanQueue) { - // use the default child reservation queue of the plan - targetQueueName = targetQueueName + ReservationConstants.DEFAULT_QUEUE_SUFFIX; - } - return targetQueueName; - } - - @Override - public Set getPlanQueues() { - Set ret = new HashSet(); - for (Map.Entry l : queueManager.getQueues().entrySet()) { - if (l.getValue() instanceof PlanQueue) { - ret.add(l.getKey()); - } - } - return ret; - } - - @Override - public Priority checkAndGetApplicationPriority( - Priority priorityRequestedByApp, UserGroupInformation user, - String queueName, ApplicationId applicationId) throws YarnException { - try { - readLock.lock(); - Priority appPriority = priorityRequestedByApp; - - // Verify the scenario where priority is null from submissionContext. - if (null == appPriority) { - // Verify whether submitted user has any default priority set. If so, - // user's default priority will get precedence over queue default. - // for updateApplicationPriority call flow, this check is done in - // CientRMService itself. - appPriority = this.appPriorityACLManager.getDefaultPriority(queueName, - user); - - // Get the default priority for the Queue. If Queue is non-existent, - // then - // use default priority. Do it only if user doesn't have any default. - if (null == appPriority) { - appPriority = this.queueManager.getDefaultPriorityForQueue(queueName); - } - - LOG.info( - "Application '" + applicationId + "' is submitted without priority " - + "hence considering default queue/cluster priority: " - + appPriority.getPriority()); - } - - // Verify whether submitted priority is lesser than max priority - // in the cluster. If it is out of found, defining a max cap. - if (appPriority.getPriority() > getMaxClusterLevelAppPriority() - .getPriority()) { - appPriority = Priority - .newInstance(getMaxClusterLevelAppPriority().getPriority()); - } - - // Lets check for ACLs here. 
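The checkAndGetApplicationPriority logic above resolves a missing priority through a chain of defaults and then clamps to the cluster maximum: the requested value, else the user's default, else the queue default. Condensed into one hypothetical method:

final class PrioritySketch {
  static int resolve(Integer requested, Integer userDefault,
      Integer queueDefault, int clusterMax) {
    int p = requested != null ? requested
        : userDefault != null ? userDefault
        : queueDefault != null ? queueDefault
        : 0;                        // sketch-only fallback: lowest priority
    return Math.min(p, clusterMax); // never exceed the cluster-level cap
  }
}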
- if (!appPriorityACLManager.checkAccess(user, queueName, appPriority)) { - throw new YarnException(new AccessControlException( - "User " + user + " does not have permission to submit/update " - + applicationId + " for " + appPriority)); - } - - LOG.info("Priority '" + appPriority.getPriority() - + "' is acceptable in queue : " + queueName + " for application: " - + applicationId); - - return appPriority; - } finally { - readLock.unlock(); - } - } - - @Override - public Priority updateApplicationPriority(Priority newPriority, - ApplicationId applicationId, SettableFuture future, - UserGroupInformation user) - throws YarnException { - try { - writeLock.lock(); - Priority appPriority = null; - SchedulerApplication application = applications - .get(applicationId); - - if (application == null) { - throw new YarnException("Application '" + applicationId - + "' is not present, hence could not change priority."); - } - - RMApp rmApp = rmContext.getRMApps().get(applicationId); - - appPriority = checkAndGetApplicationPriority(newPriority, user, - rmApp.getQueue(), applicationId); - - if (application.getPriority().equals(appPriority)) { - future.set(null); return appPriority; - } - - // Update new priority in Submission Context to update to StateStore. - rmApp.getApplicationSubmissionContext().setPriority(appPriority); - - // Update to state store - ApplicationStateData appState = ApplicationStateData.newInstance( - rmApp.getSubmitTime(), rmApp.getStartTime(), - rmApp.getApplicationSubmissionContext(), rmApp.getUser(), - rmApp.getCallerContext()); - appState.setApplicationTimeouts(rmApp.getApplicationTimeouts()); - rmContext.getStateStore().updateApplicationStateSynchronously(appState, - false, future); - - // As we use iterator over a TreeSet for OrderingPolicy, once we change - // priority then reinsert back to make order correct. 
- LeafQueue queue = (LeafQueue) getQueue(rmApp.getQueue()); - queue.updateApplicationPriority(application, appPriority); - - LOG.info("Priority '" + appPriority + "' is updated in queue :" - + rmApp.getQueue() + " for application: " + applicationId - + " for the user: " + rmApp.getUser()); - return appPriority; - } finally { - writeLock.unlock(); - } - } - - @Override - public PreemptionManager getPreemptionManager() { - return preemptionManager; - } - - @Override - public ResourceUsage getClusterResourceUsage() { - return getRootQueue().getQueueResourceUsage(); - } - - private SchedulerContainer getSchedulerContainer( - RMContainer rmContainer, boolean allocated) { - if (null == rmContainer) { - return null; - } - - FiCaSchedulerApp app = getApplicationAttempt( - rmContainer.getApplicationAttemptId()); - if (null == app) { return null; } - - NodeId nodeId; - // Get nodeId - if (rmContainer.getState() == RMContainerState.RESERVED) { - nodeId = rmContainer.getReservedNode(); - } else { - nodeId = rmContainer.getNodeId(); - } - - FiCaSchedulerNode node = getNode(nodeId); - if (null == node) { - return null; - } - return new SchedulerContainer<>(app, node, rmContainer, - // TODO, node partition should come from CSAssignment to avoid partition - // get updated before submitting the commit - node.getPartition(), allocated); - } - - private List> - getSchedulerContainersToRelease( - CSAssignment csAssignment) { - List> list = null; - - if (csAssignment.getContainersToKill() != null && !csAssignment - .getContainersToKill().isEmpty()) { - list = new ArrayList<>(); - for (RMContainer rmContainer : csAssignment.getContainersToKill()) { - list.add(getSchedulerContainer(rmContainer, false)); - } - } - - if (csAssignment.getExcessReservation() != null) { - if (null == list) { - list = new ArrayList<>(); - } - list.add( - getSchedulerContainer(csAssignment.getExcessReservation(), false)); - } - - return list; - } - - @VisibleForTesting - public void submitResourceCommitRequest(Resource cluster, - CSAssignment csAssignment) { - ResourceCommitRequest request = - createResourceCommitRequest(csAssignment); - - if (null == request) { - return; - } - - if (scheduleAsynchronously) { - // Submit to a commit thread and commit it async-ly - resourceCommitterService.addNewCommitRequest(request); - } else{ - // Otherwise do it sync-ly. - tryCommit(cluster, request, true); - } - } - - @Override - public boolean attemptAllocationOnNode(SchedulerApplicationAttempt appAttempt, - SchedulingRequest schedulingRequest, SchedulerNode schedulerNode) { - if (schedulingRequest.getResourceSizing() != null) { - if (schedulingRequest.getResourceSizing().getNumAllocations() > 1) { - LOG.warn("The SchedulingRequest has requested more than 1 allocation," + - " but only 1 will be attempted !!"); - } - if (!appAttempt.isStopped()) { - ResourceCommitRequest - resourceCommitRequest = createResourceCommitRequest( - appAttempt, schedulingRequest, schedulerNode); - - // Validate placement constraint is satisfied before - // committing the request. 
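attemptAllocationOnNode validates the placement constraint against the live constraint and tag managers before tryCommit is ever invoked, so a violating proposal is rejected up front instead of rolled back. The shape of that check-then-commit pattern, in generic form (hypothetical):

import java.util.function.BiPredicate;

final class CheckThenCommitSketch {
  static <R, N> boolean tryPlace(R request, N node,
      BiPredicate<R, N> constraintsSatisfied, BiPredicate<R, N> commit) {
    if (!constraintsSatisfied.test(request, node)) {
      return false; // reject up front; nothing to undo
    }
    return commit.test(request, node); // commit may still fail and say so
  }
}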
- try { - if (!PlacementConstraintsUtil.canSatisfyConstraints( - appAttempt.getApplicationId(), - schedulingRequest, schedulerNode, - rmContext.getPlacementConstraintManager(), - rmContext.getAllocationTagsManager())) { - LOG.debug("Failed to allocate container for application " - + appAttempt.getApplicationId() + " on node " - + schedulerNode.getNodeName() - + " because this allocation violates the" - + " placement constraint."); - return false; - } - } catch (InvalidAllocationTagsQueryException e) { - LOG.warn("Unable to allocate container", e); - return false; - } - return tryCommit(getClusterResource(), resourceCommitRequest, false); - } - } - return false; - } - - // This assumes numContainers = 1 for the request. - private ResourceCommitRequest - createResourceCommitRequest(SchedulerApplicationAttempt appAttempt, - SchedulingRequest schedulingRequest, SchedulerNode schedulerNode) { - ContainerAllocationProposal allocated = - null; - Resource resource = schedulingRequest.getResourceSizing().getResources(); - if (Resources.greaterThan(calculator, getClusterResource(), - resource, Resources.none())) { - ContainerId cId = - ContainerId.newContainerId(appAttempt.getApplicationAttemptId(), - appAttempt.getAppSchedulingInfo().getNewContainerId()); - Container container = BuilderUtils.newContainer( - cId, schedulerNode.getNodeID(), schedulerNode.getHttpAddress(), - resource, schedulingRequest.getPriority(), null, - ExecutionType.GUARANTEED, - schedulingRequest.getAllocationRequestId()); - RMContainer rmContainer = new RMContainerImpl(container, - SchedulerRequestKey.extractFrom(container), - appAttempt.getApplicationAttemptId(), container.getNodeId(), - appAttempt.getUser(), rmContext, false); - ((RMContainerImpl)rmContainer).setAllocationTags( - new HashSet<>(schedulingRequest.getAllocationTags())); - - allocated = new ContainerAllocationProposal<>( - getSchedulerContainer(rmContainer, true), - null, null, NodeType.NODE_LOCAL, NodeType.NODE_LOCAL, - SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, - resource); - } - - if (null != allocated) { - List> - allocationsList = new ArrayList<>(); - allocationsList.add(allocated); - - return new ResourceCommitRequest<>(allocationsList, null, null); - } - return null; - } - - @VisibleForTesting - public ResourceCommitRequest - createResourceCommitRequest(CSAssignment csAssignment) { - ContainerAllocationProposal allocated = - null; - ContainerAllocationProposal reserved = - null; - List> released = - null; - - if (Resources.greaterThan(calculator, getClusterResource(), - csAssignment.getResource(), Resources.none())) { - // Allocated something - List allocations = - csAssignment.getAssignmentInformation().getAllocationDetails(); - if (!allocations.isEmpty()) { - RMContainer rmContainer = allocations.get(0).rmContainer; - allocated = new ContainerAllocationProposal<>( - getSchedulerContainer(rmContainer, true), - getSchedulerContainersToRelease(csAssignment), - getSchedulerContainer(csAssignment.getFulfilledReservedContainer(), - false), csAssignment.getType(), - csAssignment.getRequestLocalityType(), - csAssignment.getSchedulingMode() != null ? 
- csAssignment.getSchedulingMode() : - SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, - csAssignment.getResource()); - } - - // Reserved something - List reservation = - csAssignment.getAssignmentInformation().getReservationDetails(); - if (!reservation.isEmpty()) { - RMContainer rmContainer = reservation.get(0).rmContainer; - reserved = new ContainerAllocationProposal<>( - getSchedulerContainer(rmContainer, false), - getSchedulerContainersToRelease(csAssignment), - getSchedulerContainer(csAssignment.getFulfilledReservedContainer(), - false), csAssignment.getType(), - csAssignment.getRequestLocalityType(), - csAssignment.getSchedulingMode() != null ? - csAssignment.getSchedulingMode() : - SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, - csAssignment.getResource()); - } - } - - // When we don't need to allocate/reserve anything, we can feel free to - // kill all to-release containers in the request. - if (null == allocated && null == reserved) { - released = getSchedulerContainersToRelease(csAssignment); - } - - if (null != allocated || null != reserved || (null != released && !released - .isEmpty())) { - List> - allocationsList = null; - if (allocated != null) { - allocationsList = new ArrayList<>(); - allocationsList.add(allocated); - } - - List> - reservationsList = null; - if (reserved != null) { - reservationsList = new ArrayList<>(); - reservationsList.add(reserved); - } - - return new ResourceCommitRequest<>(allocationsList, reservationsList, - released); - } - - return null; - } - - @Override - public boolean tryCommit(Resource cluster, ResourceCommitRequest r, - boolean updatePending) { - long commitStart = System.nanoTime(); - ResourceCommitRequest request = - (ResourceCommitRequest) r; - - ApplicationAttemptId attemptId = null; - - // We need to update unconfirmed allocated resource of application when - // any container allocated. 
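That unconfirmed-resource bookkeeping is an optimistic charge: an application is debited when a proposal is generated and credited back once the commit is decided, which is what the decUnconfirmedRes call later in tryCommit undoes. A minimal sketch of such a counter (hypothetical, memory only):

final class UnconfirmedResourceSketch {
  private long unconfirmedMb;

  // Charged when an allocation proposal is generated (pre-commit).
  synchronized void onProposal(long mb) { unconfirmedMb += mb; }

  // Credited back once the proposal is accepted or rejected.
  synchronized void onCommitDecided(long mb) { unconfirmedMb -= mb; }

  synchronized long pendingMb() { return unconfirmedMb; }
}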
- boolean updateUnconfirmedAllocatedResource =
- request.getContainersToAllocate() != null && !request
- .getContainersToAllocate().isEmpty();
-
- // find the application to accept and apply the ResourceCommitRequest
- if (request.anythingAllocatedOrReserved()) {
- ContainerAllocationProposal c =
- request.getFirstAllocatedOrReservedContainer();
- attemptId =
- c.getAllocatedOrReservedContainer().getSchedulerApplicationAttempt()
- .getApplicationAttemptId();
- } else {
- if (!request.getContainersToRelease().isEmpty()) {
- attemptId = request.getContainersToRelease().get(0)
- .getSchedulerApplicationAttempt().getApplicationAttemptId();
- }
- }
-
- if (LOG.isDebugEnabled()) {
- LOG.debug("Try to commit allocation proposal=" + request);
- }
-
- boolean isSuccess = false;
- if (attemptId != null) {
- FiCaSchedulerApp app = getApplicationAttempt(attemptId);
- // Required sanity check for attemptId - when async-scheduling enabled,
- // proposal might be outdated if AM failover just finished
- // and proposal queue was not be consumed in time
- if (app != null && attemptId.equals(app.getApplicationAttemptId())) {
- if (app.accept(cluster, request, updatePending)
- && app.apply(cluster, request, updatePending)) {
- long commitSuccess = System.nanoTime() - commitStart;
- CapacitySchedulerMetrics.getMetrics()
- .addCommitSuccess(commitSuccess);
- LOG.info("Allocation proposal accepted");
- isSuccess = true;
- } else{
- long commitFailed = System.nanoTime() - commitStart;
- CapacitySchedulerMetrics.getMetrics()
- .addCommitFailure(commitFailed);
- LOG.info("Failed to accept allocation proposal");
+
+ return appPriority;
+ }
+
+ @Override
+ public synchronized Priority updateApplicationPriority(Priority newPriority,
+ ApplicationId applicationId, SettableFuture future,
+ UserGroupInformation user)
+ throws YarnException {
+ Priority appPriority = null;
+ SchedulerApplication application = applications
+ .get(applicationId);
+
+ if (application == null) {
+ throw new YarnException("Application '" + applicationId
+ + "' is not present, hence could not change priority.");
+ }
+
+ RMApp rmApp = rmContext.getRMApps().get(applicationId);
+
+ appPriority = checkAndGetApplicationPriority(newPriority, user,
+ rmApp.getQueue(), applicationId);
+
+ if (application.getPriority().equals(appPriority)) {
+ future.set(null);
+ return appPriority;
+ }
+
+ // Update the new priority in the submission context so it is persisted
+ // to the state store.
+ rmApp.getApplicationSubmissionContext().setPriority(appPriority);
+
+ // Update to state store
+ ApplicationStateData appState = ApplicationStateData.newInstance(
+ rmApp.getSubmitTime(), rmApp.getStartTime(),
+ rmApp.getApplicationSubmissionContext(), rmApp.getUser(),
+ rmApp.getCallerContext());
+ appState.setApplicationTimeouts(rmApp.getApplicationTimeouts());
+ rmContext.getStateStore().updateApplicationStateSynchronously(appState,
+ false, future);
+
+ // As we use an iterator over a TreeSet for the OrderingPolicy, once the
+ // priority changes we must reinsert the app to keep the ordering correct.
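The remove-then-reinsert above is forced by how comparator-ordered collections work: a TreeSet indexes an element by its sort key at insertion time, so mutating the key in place silently corrupts ordering and lookups. In miniature:

import java.util.TreeSet;

final class OrderedReinsertSketch {
  // Safely change comparator-relevant state of an element in a TreeSet.
  static <T> void updateKey(TreeSet<T> set, T element, Runnable mutateKey) {
    set.remove(element); // the old ordering can still locate the element
    mutateKey.run();     // now it is safe to change the sort key
    set.add(element);    // reinsert under the new ordering
  }
}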
+ LeafQueue queue = (LeafQueue) getQueue(rmApp.getQueue()); + queue.updateApplicationPriority(application, appPriority); + + LOG.info("Priority '" + appPriority + "' is updated in queue :" + + rmApp.getQueue() + " for application: " + applicationId + + " for the user: " + rmApp.getUser()); + return appPriority; + } + + @Override + public PreemptionManager getPreemptionManager() { + return preemptionManager; + } + + @Override + public ResourceUsage getClusterResourceUsage() { + return getRootQueue().getQueueResourceUsage(); + } + + private SchedulerContainer getSchedulerContainer( + RMContainer rmContainer, boolean allocated) { + if (null == rmContainer) { + return null; } - if (LOG.isDebugEnabled()) { - LOG.debug("Allocation proposal accepted=" + isSuccess + ", proposal=" - + request); - } - - // Update unconfirmed allocated resource. - if (updateUnconfirmedAllocatedResource) { - app.decUnconfirmedRes(request.getTotalAllocatedResource()); - } - } - } - return isSuccess; - } - - public int getAsyncSchedulingPendingBacklogs() { - if (scheduleAsynchronously) { - return resourceCommitterService.getPendingBacklogs(); - } - return 0; - } - - @Override - public CapacitySchedulerQueueManager getCapacitySchedulerQueueManager() { - return this.queueManager; - } - - /** - * Try to move a reserved container to a targetNode. - * If the targetNode is reserved by another application (other than this one). - * The previous reservation will be cancelled. - * - * @param toBeMovedContainer reserved container will be moved - * @param targetNode targetNode - * @return true if move succeeded. Return false if the targetNode is reserved by - * a different container or move failed because of any other reasons. - */ - public boolean moveReservedContainer(RMContainer toBeMovedContainer, - FiCaSchedulerNode targetNode) { - try { - writeLock.lock(); - - if (LOG.isDebugEnabled()) { - LOG.debug("Trying to move container=" + toBeMovedContainer + " to node=" - + targetNode.getNodeID()); - } - - FiCaSchedulerNode sourceNode = getNode(toBeMovedContainer.getNodeId()); - if (null == sourceNode) { - if (LOG.isDebugEnabled()) { - LOG.debug("Failed to move reservation, cannot find source node=" - + toBeMovedContainer.getNodeId()); + FiCaSchedulerApp app = getApplicationAttempt( + rmContainer.getApplicationAttemptId()); + if (null == app) { + return null; } - return false; - } - // Target node updated? - if (getNode(targetNode.getNodeID()) != targetNode) { - if (LOG.isDebugEnabled()) { - LOG.debug( - "Failed to move reservation, node updated or removed, moving " - + "cancelled."); + NodeId nodeId; + // Get nodeId + if (rmContainer.getState() == RMContainerState.RESERVED) { + nodeId = rmContainer.getReservedNode(); + } else { + nodeId = rmContainer.getNodeId(); } - return false; - } - // Target node's reservation status changed? 
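moveReservedContainer re-validates every piece of state it is about to touch, because the move was planned against a possibly stale snapshot under async scheduling. The guards reduce to roughly this (hypothetical, simplified; the identity comparison mirrors the original's node check):

final class MoveReservationGuardSketch {
  // All three preconditions re-read live scheduler state.
  static <N> boolean canMove(N liveSourceNode, N targetNode, N liveTargetNode,
      Object liveTargetReservation) {
    if (liveSourceNode == null) return false;       // source node removed
    if (liveTargetNode != targetNode) return false; // target updated/removed
    return liveTargetReservation == null;           // target still unreserved
  }
}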
- if (targetNode.getReservedContainer() != null) { - if (LOG.isDebugEnabled()) { - LOG.debug( - "Target node's reservation status changed, moving cancelled."); + FiCaSchedulerNode node = getNode(nodeId); + if (null == node) { + return null; + } + return new SchedulerContainer<>(app, node, rmContainer, + // TODO, node partition should come from CSAssignment to avoid partition + // get updated before submitting the commit + node.getPartition(), allocated); + } + + private List> + getSchedulerContainersToRelease( + CSAssignment csAssignment) { + List> list = null; + + if (csAssignment.getContainersToKill() != null && !csAssignment + .getContainersToKill().isEmpty()) { + list = new ArrayList<>(); + for (RMContainer rmContainer : csAssignment.getContainersToKill()) { + list.add(getSchedulerContainer(rmContainer, false)); + } + } + + if (csAssignment.getExcessReservation() != null) { + if (null == list) { + list = new ArrayList<>(); + } + list.add( + getSchedulerContainer(csAssignment.getExcessReservation(), false)); + } + + return list; + } + + @VisibleForTesting + public void submitResourceCommitRequest(Resource cluster, + CSAssignment csAssignment) { + ResourceCommitRequest request = + createResourceCommitRequest(csAssignment); + + if (null == request) { + return; + } + + if (scheduleAsynchronously) { + // Submit to a commit thread and commit it async-ly + resourceCommitterService.addNewCommitRequest(request); + } else { + // Otherwise do it sync-ly. + tryCommit(cluster, request, true); + } + } + + @Override + public boolean attemptAllocationOnNode(SchedulerApplicationAttempt appAttempt, + SchedulingRequest schedulingRequest, SchedulerNode schedulerNode) { + if (schedulingRequest.getResourceSizing() != null) { + if (schedulingRequest.getResourceSizing().getNumAllocations() > 1) { + LOG.warn("The SchedulingRequest has requested more than 1 allocation," + + " but only 1 will be attempted !!"); + } + if (!appAttempt.isStopped()) { + ResourceCommitRequest + resourceCommitRequest = createResourceCommitRequest( + appAttempt, schedulingRequest, schedulerNode); + + // Validate placement constraint is satisfied before + // committing the request. + try { + if (!PlacementConstraintsUtil.canSatisfyConstraints( + appAttempt.getApplicationId(), + schedulingRequest, schedulerNode, + rmContext.getPlacementConstraintManager(), + rmContext.getAllocationTagsManager())) { + LOG.debug("Failed to allocate container for application " + + appAttempt.getApplicationId() + " on node " + + schedulerNode.getNodeName() + + " because this allocation violates the" + + " placement constraint."); + return false; + } + } catch (InvalidAllocationTagsQueryException e) { + LOG.warn("Unable to allocate container", e); + return false; + } + return tryCommit(getClusterResource(), resourceCommitRequest, false); + } } return false; - } + } + + // This assumes numContainers = 1 for the request. 
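Given that single-container assumption, a resource sizing that asks for several allocations is clamped to one attempt, which is why attemptAllocationOnNode above logs a warning first. Schematically (hypothetical helper):

final class SingleAllocationSketch {
  // Only one container per call is ever attempted.
  static int allocationsToAttempt(int numAllocationsRequested) {
    if (numAllocationsRequested > 1) {
      System.err.println("Requested " + numAllocationsRequested
          + " allocations; only 1 will be attempted");
    }
    return Math.min(1, Math.max(0, numAllocationsRequested));
  }
}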
+  private ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>
+      createResourceCommitRequest(SchedulerApplicationAttempt appAttempt,
+      SchedulingRequest schedulingRequest, SchedulerNode schedulerNode) {
+    ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> allocated =
+        null;
+    Resource resource = schedulingRequest.getResourceSizing().getResources();
+    if (Resources.greaterThan(calculator, getClusterResource(),
+        resource, Resources.none())) {
+      ContainerId cId =
+          ContainerId.newContainerId(appAttempt.getApplicationAttemptId(),
+              appAttempt.getAppSchedulingInfo().getNewContainerId());
+      Container container = BuilderUtils.newContainer(
+          cId, schedulerNode.getNodeID(), schedulerNode.getHttpAddress(),
+          resource, schedulingRequest.getPriority(), null,
+          ExecutionType.GUARANTEED,
+          schedulingRequest.getAllocationRequestId());
+      RMContainer rmContainer = new RMContainerImpl(container,
+          SchedulerRequestKey.extractFrom(container),
+          appAttempt.getApplicationAttemptId(), container.getNodeId(),
+          appAttempt.getUser(), rmContext, false);
+      ((RMContainerImpl) rmContainer).setAllocationTags(
+          new HashSet<>(schedulingRequest.getAllocationTags()));
+
+      allocated = new ContainerAllocationProposal<>(
+          getSchedulerContainer(rmContainer, true),
+          null, null, NodeType.NODE_LOCAL, NodeType.NODE_LOCAL,
+          SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY,
+          resource);
+    }
+
+    if (null != allocated) {
+      List<ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode>>
+          allocationsList = new ArrayList<>();
+      allocationsList.add(allocated);
+
+      return new ResourceCommitRequest<>(allocationsList, null, null);
+    }
+    return null;
+  }
+
+  @VisibleForTesting
+  public ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>
+      createResourceCommitRequest(CSAssignment csAssignment) {
+    ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> allocated =
+        null;
+    ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> reserved =
+        null;
+    List<SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>> released =
+        null;
+
+    if (Resources.greaterThan(calculator, getClusterResource(),
+        csAssignment.getResource(), Resources.none())) {
+      // Allocated something
+      List<AssignmentInformation.AssignmentDetails> allocations =
+          csAssignment.getAssignmentInformation().getAllocationDetails();
+      if (!allocations.isEmpty()) {
+        RMContainer rmContainer = allocations.get(0).rmContainer;
+        allocated = new ContainerAllocationProposal<>(
+            getSchedulerContainer(rmContainer, true),
+            getSchedulerContainersToRelease(csAssignment),
+            getSchedulerContainer(csAssignment.getFulfilledReservedContainer(),
+                false), csAssignment.getType(),
+            csAssignment.getRequestLocalityType(),
+            csAssignment.getSchedulingMode() != null ?
+                csAssignment.getSchedulingMode() :
+                SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY,
+            csAssignment.getResource());
+      }
+
+      // Reserved something
+      List<AssignmentInformation.AssignmentDetails> reservation =
+          csAssignment.getAssignmentInformation().getReservationDetails();
+      if (!reservation.isEmpty()) {
+        RMContainer rmContainer = reservation.get(0).rmContainer;
+        reserved = new ContainerAllocationProposal<>(
+            getSchedulerContainer(rmContainer, false),
+            getSchedulerContainersToRelease(csAssignment),
+            getSchedulerContainer(csAssignment.getFulfilledReservedContainer(),
+                false), csAssignment.getType(),
+            csAssignment.getRequestLocalityType(),
+            csAssignment.getSchedulingMode() != null ?
+                csAssignment.getSchedulingMode() :
+                SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY,
+            csAssignment.getResource());
+      }
+    }
+
+    // When we don't need to allocate/reserve anything, we can feel free to
+    // kill all to-release containers in the request.
+    if (null == allocated && null == reserved) {
+      released = getSchedulerContainersToRelease(csAssignment);
+    }
+
+    if (null != allocated || null != reserved || (null != released && !released
+        .isEmpty())) {
+      List<ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode>>
+          allocationsList = null;
+      if (allocated != null) {
+        allocationsList = new ArrayList<>();
+        allocationsList.add(allocated);
+      }
+
+      List<ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode>>
+          reservationsList = null;
+      if (reserved != null) {
+        reservationsList = new ArrayList<>();
+        reservationsList.add(reserved);
+      }
+
+      return new ResourceCommitRequest<>(allocationsList, reservationsList,
+          released);
+    }
+
+    return null;
+  }
+
+  @Override
+  public boolean tryCommit(Resource cluster, ResourceCommitRequest r,
+      boolean updatePending) {
+    long commitStart = System.nanoTime();
+    ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request =
+        (ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode>) r;
+
+    ApplicationAttemptId attemptId = null;
+
+    // We need to update the application's unconfirmed allocated resource
+    // when any container is allocated.
+    boolean updateUnconfirmedAllocatedResource =
+        request.getContainersToAllocate() != null && !request
+            .getContainersToAllocate().isEmpty();
+
+    // find the application to accept and apply the ResourceCommitRequest
+    if (request.anythingAllocatedOrReserved()) {
+      ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> c =
+          request.getFirstAllocatedOrReservedContainer();
+      attemptId =
+          c.getAllocatedOrReservedContainer().getSchedulerApplicationAttempt()
+              .getApplicationAttemptId();
+    } else {
+      if (!request.getContainersToRelease().isEmpty()) {
+        attemptId = request.getContainersToRelease().get(0)
+            .getSchedulerApplicationAttempt().getApplicationAttemptId();
+      }
+    }
 
-      FiCaSchedulerApp app = getApplicationAttempt(
-          toBeMovedContainer.getApplicationAttemptId());
-      if (null == app) {
     if (LOG.isDebugEnabled()) {
-        LOG.debug("Cannot find to-be-moved container's application="
-            + toBeMovedContainer.getApplicationAttemptId());
+      LOG.debug("Try to commit allocation proposal=" + request);
     }
-        return false;
-      }
-
-      // finally, move the reserved container
-      return app.moveReservation(toBeMovedContainer, sourceNode, targetNode);
-    } finally {
-      writeLock.unlock();
-    }
-  }
-
-  @Override
-  public long checkAndGetApplicationLifetime(String queueName,
-      long lifetimeRequestedByApp) {
-    try {
-      readLock.lock();
-      CSQueue queue = getQueue(queueName);
-      if (queue == null || !(queue instanceof LeafQueue)) {
-        return lifetimeRequestedByApp;
-      }
-      long defaultApplicationLifetime =
-          ((LeafQueue) queue).getDefaultApplicationLifetime();
-      long maximumApplicationLifetime =
-          ((LeafQueue) queue).getMaximumApplicationLifetime();
 
+    boolean isSuccess = false;
+    if (attemptId != null) {
+      FiCaSchedulerApp app = getApplicationAttempt(attemptId);
+      // Required sanity check for attemptId - when async scheduling is
+      // enabled, the proposal might be outdated if AM failover just finished
+      // and the proposal queue was not consumed in time
+      if (app != null && attemptId.equals(app.getApplicationAttemptId())) {
+        if (app.accept(cluster, request, updatePending)
+            && app.apply(cluster, request, updatePending)) {
+          long commitSuccess = System.nanoTime() - commitStart;
+          CapacitySchedulerMetrics.getMetrics()
+              .addCommitSuccess(commitSuccess);
+          LOG.info("Allocation proposal accepted");
+          isSuccess = true;
+        } else {
+          long commitFailed = System.nanoTime() - commitStart;
+          CapacitySchedulerMetrics.getMetrics()
+              .addCommitFailure(commitFailed);
+          LOG.info("Failed to accept allocation proposal");
+        }
+
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Allocation proposal accepted=" + isSuccess + ", proposal=" +
request); + } + + // Update unconfirmed allocated resource. + if (updateUnconfirmedAllocatedResource) { + app.decUnconfirmedRes(request.getTotalAllocatedResource()); + } + } + } + return isSuccess; + } + + public int getAsyncSchedulingPendingBacklogs() { + if (scheduleAsynchronously) { + return resourceCommitterService.getPendingBacklogs(); + } + return 0; + } + + @Override + public CapacitySchedulerQueueManager getCapacitySchedulerQueueManager() { + return this.queueManager; + } + + /** + * Try to move a reserved container to a targetNode. + * If the targetNode is reserved by another application (other than this one). + * The previous reservation will be cancelled. + * + * @param toBeMovedContainer reserved container will be moved + * @param targetNode targetNode + * @return true if move succeeded. Return false if the targetNode is reserved by + * a different container or move failed because of any other reasons. + */ + public synchronized boolean moveReservedContainer(RMContainer toBeMovedContainer, + FiCaSchedulerNode targetNode) { + if (LOG.isDebugEnabled()) { + LOG.debug("Trying to move container=" + toBeMovedContainer + " to node=" + + targetNode.getNodeID()); + } + + FiCaSchedulerNode sourceNode = getNode(toBeMovedContainer.getNodeId()); + if (null == sourceNode) { + if (LOG.isDebugEnabled()) { + LOG.debug("Failed to move reservation, cannot find source node=" + + toBeMovedContainer.getNodeId()); + } + return false; + } + + // Target node updated? + if (getNode(targetNode.getNodeID()) != targetNode) { + if (LOG.isDebugEnabled()) { + LOG.debug( + "Failed to move reservation, node updated or removed, moving " + + "cancelled."); + } + return false; + } + + // Target node's reservation status changed? + if (targetNode.getReservedContainer() != null) { + if (LOG.isDebugEnabled()) { + LOG.debug( + "Target node's reservation status changed, moving cancelled."); + } + return false; + } + + FiCaSchedulerApp app = getApplicationAttempt( + toBeMovedContainer.getApplicationAttemptId()); + if (null == app) { + if (LOG.isDebugEnabled()) { + LOG.debug("Cannot find to-be-moved container's application=" + + toBeMovedContainer.getApplicationAttemptId()); + } + return false; + } + + // finally, move the reserved container + return app.moveReservation(toBeMovedContainer, sourceNode, targetNode); + } + + @Override + public long checkAndGetApplicationLifetime(String queueName, + long lifetimeRequestedByApp) { + CSQueue queue = getQueue(queueName); + if (queue == null || !(queue instanceof LeafQueue)) { + return lifetimeRequestedByApp; + } + + long defaultApplicationLifetime = + ((LeafQueue) queue).getDefaultApplicationLifetime(); + long maximumApplicationLifetime = + ((LeafQueue) queue).getMaximumApplicationLifetime(); + + // check only for maximum, that's enough because default can't + // exceed maximum + if (maximumApplicationLifetime <= 0) { + return lifetimeRequestedByApp; + } - // check only for maximum, that's enough because default can't - // exceed maximum - if (maximumApplicationLifetime <= 0) { + if (lifetimeRequestedByApp <= 0) { + return defaultApplicationLifetime; + } else if (lifetimeRequestedByApp > maximumApplicationLifetime) { + return maximumApplicationLifetime; + } return lifetimeRequestedByApp; - } - - if (lifetimeRequestedByApp <= 0) { - return defaultApplicationLifetime; - } else if (lifetimeRequestedByApp > maximumApplicationLifetime) { - return maximumApplicationLifetime; - } - return lifetimeRequestedByApp; - } finally { - readLock.unlock(); - } - } - - @Override - public long 
getMaximumApplicationLifetime(String queueName) { - CSQueue queue = getQueue(queueName); - if (queue == null || !(queue instanceof LeafQueue)) { - LOG.error("Unknown queue: " + queueName); - return -1; - } - // In seconds - return ((LeafQueue) queue).getMaximumApplicationLifetime(); - } - - @Override - public boolean isConfigurationMutable() { - return csConfProvider instanceof MutableConfigurationProvider; - } - - @Override - public MutableConfigurationProvider getMutableConfProvider() { - if (isConfigurationMutable()) { - return (MutableConfigurationProvider) csConfProvider; - } - return null; - } - - private LeafQueue autoCreateLeafQueue( - ApplicationPlacementContext placementContext) - throws IOException, YarnException { - - AutoCreatedLeafQueue autoCreatedLeafQueue = null; - - String leafQueueName = placementContext.getQueue(); - String parentQueueName = placementContext.getParentQueue(); - - if (!StringUtils.isEmpty(parentQueueName)) { - CSQueue parentQueue = getQueue(parentQueueName); - - if (parentQueue != null && conf.isAutoCreateChildQueueEnabled( - parentQueue.getQueuePath())) { - - ManagedParentQueue autoCreateEnabledParentQueue = - (ManagedParentQueue) parentQueue; - autoCreatedLeafQueue = new AutoCreatedLeafQueue(this, leafQueueName, - autoCreateEnabledParentQueue); - - addQueue(autoCreatedLeafQueue); - - } else{ - throw new SchedulerDynamicEditException( - "Could not auto-create leaf queue for " + leafQueueName - + ". Queue mapping specifies an invalid parent queue " - + "which does not exist " - + parentQueueName); - } - } else{ - throw new SchedulerDynamicEditException( - "Could not auto-create leaf queue for " + leafQueueName - + ". Queue mapping does not specify" - + " which parent queue it needs to be created under."); - } - return autoCreatedLeafQueue; - } - - @Override - public void resetSchedulerMetrics() { - CapacitySchedulerMetrics.destroy(); - } + } + + @Override + public long getMaximumApplicationLifetime(String queueName) { + CSQueue queue = getQueue(queueName); + if (queue == null || !(queue instanceof LeafQueue)) { + LOG.error("Unknown queue: " + queueName); + return -1; + } + // In seconds + return ((LeafQueue) queue).getMaximumApplicationLifetime(); + } + + @Override + public boolean isConfigurationMutable() { + return csConfProvider instanceof MutableConfigurationProvider; + } + + @Override + public MutableConfigurationProvider getMutableConfProvider() { + if (isConfigurationMutable()) { + return (MutableConfigurationProvider) csConfProvider; + } + return null; + } + + private LeafQueue autoCreateLeafQueue( + ApplicationPlacementContext placementContext) + throws IOException, YarnException { + + AutoCreatedLeafQueue autoCreatedLeafQueue = null; + + String leafQueueName = placementContext.getQueue(); + String parentQueueName = placementContext.getParentQueue(); + + if (!StringUtils.isEmpty(parentQueueName)) { + CSQueue parentQueue = getQueue(parentQueueName); + + if (parentQueue != null && conf.isAutoCreateChildQueueEnabled( + parentQueue.getQueuePath())) { + + ManagedParentQueue autoCreateEnabledParentQueue = + (ManagedParentQueue) parentQueue; + autoCreatedLeafQueue = new AutoCreatedLeafQueue(this, leafQueueName, + autoCreateEnabledParentQueue); + + addQueue(autoCreatedLeafQueue); + + } else { + throw new SchedulerDynamicEditException( + "Could not auto-create leaf queue for " + leafQueueName + + ". 
Queue mapping specifies an invalid parent queue "
+              + "which does not exist "
+              + parentQueueName);
+      }
+    } else {
+      throw new SchedulerDynamicEditException(
+          "Could not auto-create leaf queue for " + leafQueueName
+              + ". Queue mapping does not specify"
+              + " which parent queue it needs to be created under.");
+    }
+    return autoCreatedLeafQueue;
+  }
+
+  @Override
+  public void resetSchedulerMetrics() {
+    CapacitySchedulerMetrics.destroy();
+  }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
index 5c08251..6c96e20 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
@@ -168,11 +168,9 @@ protected void setupQueueConfigs(Resource clusterResource)
     setupQueueConfigs(clusterResource, csContext.getConfiguration());
   }
 
-  protected void setupQueueConfigs(Resource clusterResource,
+  protected synchronized void setupQueueConfigs(Resource clusterResource,
       CapacitySchedulerConfiguration conf) throws IOException {
-    try {
-      writeLock.lock();
     CapacitySchedulerConfiguration schedConf = csContext.getConfiguration();
     super.setupQueueConfigs(clusterResource, conf);
@@ -335,9 +333,6 @@ protected void setupQueueConfigs(Resource clusterResource,
         + "\nmaxLifetime = " + maxApplicationLifetime + " seconds"
         + "\ndefaultLifetime = " + defaultApplicationLifetime + " seconds");
-    } finally {
-      writeLock.unlock();
-    }
   }
 
   /**
@@ -405,61 +400,42 @@ void setUserLimitFactor(float userLimitFactor) {
 
   @Override
-  public int getNumApplications() {
-    try {
-      readLock.lock();
+  public synchronized int getNumApplications() {
     return getNumPendingApplications() + getNumActiveApplications();
-    } finally {
-      readLock.unlock();
-    }
   }
 
-  public int getNumPendingApplications() {
-    try {
-      readLock.lock();
+  public synchronized int getNumPendingApplications() {
     return pendingOrderingPolicy.getNumSchedulableEntities();
-    } finally {
-      readLock.unlock();
-    }
   }
 
-  public int getNumActiveApplications() {
-    try {
-      readLock.lock();
+  public synchronized int getNumActiveApplications() {
     return orderingPolicy.getNumSchedulableEntities();
-    } finally {
-      readLock.unlock();
-    }
   }
 
   @Private
-  public int getNumPendingApplications(String user) {
-    try {
-      readLock.lock();
+  public synchronized int getNumPendingApplications(String user) {
     User u = getUser(user);
     if (null == u) {
       return 0;
     }
     return u.getPendingApplications();
-    } finally {
-      readLock.unlock();
-    }
   }
 
   @Private
-  public int getNumActiveApplications(String user) {
-    try {
-      readLock.lock();
+  public synchronized int getNumActiveApplications(String user) {
     User u = getUser(user);
     if (null == u) {
       return 0;
     }
     return u.getActiveApplications();
-    } finally {
-      readLock.unlock();
-    }
   }
 
   @Private
-  public int getUserLimit() {
+  public synchronized int getUserLimit() {
     return usersManager.getUserLimit();
   }
 
@@ -476,10 +452,8 @@ public QueueInfo getQueueInfo(
   }
 
   @Override
-  public List<QueueUserACLInfo>
+  public synchronized List<QueueUserACLInfo>
   getQueueUserAclInfo(UserGroupInformation user) {
-    try {
-      readLock.lock();
     QueueUserACLInfo userAclInfo = recordFactory.newRecordInstance(
         QueueUserACLInfo.class);
     List<QueueACL> operations = new ArrayList<>();
@@ -492,15 +466,9 @@ public QueueInfo getQueueInfo(
     userAclInfo.setQueueName(getQueueName());
     userAclInfo.setUserAcls(operations);
     return Collections.singletonList(userAclInfo);
-    } finally {
-      readLock.unlock();
-    }
   }
 
   public String toString() {
-    try {
-      readLock.lock();
     return queueName + ": " + "capacity=" + queueCapacities.getCapacity() + ", "
         + "absoluteCapacity=" + queueCapacities.getAbsoluteCapacity() + ", "
         + "usedResources=" + queueUsage.getUsed() + ", "
@@ -511,10 +479,6 @@ public String toString() {
         getEffectiveCapacity(CommonNodeLabelsManager.NO_LABEL)
         + " , effectiveMaxResource="
         + getEffectiveMaxCapacity(CommonNodeLabelsManager.NO_LABEL);
-    } finally {
-      readLock.unlock();
-    }
   }
 
   @VisibleForTesting
@@ -523,22 +487,14 @@ public User getUser(String userName) {
   }
 
   @Private
-  public List<AppPriorityACLGroup> getPriorityACLs() {
-    try {
-      readLock.lock();
+  public synchronized List<AppPriorityACLGroup> getPriorityACLs() {
     return new ArrayList<>(priorityAcls);
-    } finally {
-      readLock.unlock();
-    }
   }
 
-  protected void reinitialize(
+  protected synchronized void reinitialize(
       CSQueue newlyParsedQueue, Resource clusterResource,
       CapacitySchedulerConfiguration configuration) throws IOException {
-
-    try {
-      writeLock.lock();
     // Sanity check
     if (!(newlyParsedQueue instanceof LeafQueue) || !newlyParsedQueue
         .getQueuePath().equals(getQueuePath())) {
@@ -566,10 +522,6 @@ protected void reinitialize(
     // queue metrics are updated, more resource may be available
     // activate the pending applications if possible
     activateApplications();
-
-    } finally {
-      writeLock.unlock();
-    }
   }
 
   @Override
@@ -584,17 +536,13 @@ public void reinitialize(
   public void submitApplicationAttempt(FiCaSchedulerApp application,
       String userName) {
     // Careful! Locking order is important!
-    try {
-      writeLock.lock();
-
+    synchronized (this) {
       // TODO, should use getUser, use this method just to avoid UT failure
       // which is caused by wrong invoking order, will fix UT separately
       User user = usersManager.getUserAndAddIfAbsent(userName);
 
       // Add the attempt to our data-structures
       addApplicationAttempt(application, user);
-    } finally {
-      writeLock.unlock();
     }
 
     // We don't want to update metrics for move app
@@ -624,8 +572,7 @@ public void submitApplication(ApplicationId applicationId, String userName,
 
   public void validateSubmitApplication(ApplicationId applicationId,
       String userName, String queue) throws AccessControlException {
-    try {
-      writeLock.lock();
+    synchronized (this) {
       // Check if the queue is accepting jobs
       if (getState() != QueueState.RUNNING) {
         String msg = "Queue " + getQueuePath()
@@ -654,10 +601,7 @@ public void validateSubmitApplication(ApplicationId applicationId,
         LOG.info(msg);
         throw new AccessControlException(msg);
       }
-    } finally {
-      writeLock.unlock();
     }
-
     try {
       getParent().validateSubmitApplication(applicationId, userName, queue);
     } catch (AccessControlException ace) {
@@ -687,14 +631,12 @@ public Resource getUserAMResourceLimit() {
        null);
   }
 
-  public Resource getUserAMResourceLimitPerPartition(
+  public synchronized Resource getUserAMResourceLimitPerPartition(
       String nodePartition, String userName) {
     float userWeight = 1.0f;
     if (userName != null && getUser(userName) != null) {
       userWeight = getUser(userName).getWeight();
     }
-    try {
-      readLock.lock();
     /*
      * The user am resource limit is based on the same approach as the user
      * limit (as it should represent a subset of that). This means that it uses
@@ -735,16 +677,10 @@ public Resource getUserAMResourceLimitPerPartition(
         + userAMLimit + ". 
User weight: " + userWeight); } return userAMLimit; - } finally { - readLock.unlock(); - } - } - public Resource calculateAndGetAMResourceLimitPerPartition( + public synchronized Resource calculateAndGetAMResourceLimitPerPartition( String nodePartition) { - try { - writeLock.lock(); /* * For non-labeled partition, get the max value from resources currently * available to the queue and the absolute resources guaranteed for the @@ -790,14 +726,9 @@ public Resource calculateAndGetAMResourceLimitPerPartition( + " amResourceLimit : " + amResouceLimit); } return amResouceLimit; - } finally { - writeLock.unlock(); - } } - protected void activateApplications() { - try { - writeLock.lock(); + protected synchronized void activateApplications() { // limit of allowed resource usage for application masters Map userAmPartitionLimit = new HashMap(); @@ -911,15 +842,10 @@ protected void activateApplications() { LOG.info("Application " + applicationId + " from user: " + application .getUser() + " activated in queue: " + getQueueName()); } - } finally { - writeLock.unlock(); - } } - private void addApplicationAttempt(FiCaSchedulerApp application, + private synchronized void addApplicationAttempt(FiCaSchedulerApp application, User user) { - try { - writeLock.lock(); // Accept user.submitApplication(); getPendingAppsOrderingPolicy().addSchedulableEntity(application); @@ -946,9 +872,6 @@ private void addApplicationAttempt(FiCaSchedulerApp application, .getActiveApplications() + " #queue-pending-applications: " + getNumPendingApplications() + " #queue-active-applications: " + getNumActiveApplications()); - } finally { - writeLock.unlock(); - } } @Override @@ -965,15 +888,14 @@ public void finishApplication(ApplicationId application, String user) { @Override public void finishApplicationAttempt(FiCaSchedulerApp application, String queue) { // Careful! Locking order is important! 
- removeApplicationAttempt(application, application.getUser()); + synchronized (this) { + removeApplicationAttempt(application, application.getUser()); + } getParent().finishApplicationAttempt(application, queue); } - private void removeApplicationAttempt( + private synchronized void removeApplicationAttempt( FiCaSchedulerApp application, String userName) { - try { - writeLock.lock(); - // TODO, should use getUser, use this method just to avoid UT failure // which is caused by wrong invoking order, will fix UT separately User user = usersManager.getUserAndAddIfAbsent(userName); @@ -1007,12 +929,9 @@ private void removeApplicationAttempt( + " #user-active-applications: " + user.getActiveApplications() + " #queue-pending-applications: " + getNumPendingApplications() + " #queue-active-applications: " + getNumActiveApplications()); - } finally { - writeLock.unlock(); - } } - private FiCaSchedulerApp getApplication( + private synchronized FiCaSchedulerApp getApplication( ApplicationAttemptId applicationAttemptId) { return applicationAttemptMap.get(applicationAttemptId); } @@ -1227,8 +1146,7 @@ public boolean accept(Resource cluster, // Do not check limits when allocation from a reserved container if (allocation.getAllocateFromReservedContainer() == null) { - try { - readLock.lock(); + synchronized (this){ FiCaSchedulerApp app = schedulerContainer.getSchedulerApplicationAttempt(); String username = app.getUser(); @@ -1258,8 +1176,6 @@ public boolean accept(Resource cluster, } return false; } - } finally { - readLock.unlock(); } } @@ -1328,9 +1244,7 @@ public void apply(Resource cluster, releaseContainers(cluster, request); - try { - writeLock.lock(); - + synchronized (this){ if (request.anythingAllocatedOrReserved()) { ContainerAllocationProposal allocation = request.getFirstAllocatedOrReservedContainer(); @@ -1361,10 +1275,7 @@ public void apply(Resource cluster, request.getTotalReservedResource()); } } - } finally { - writeLock.unlock(); } - if (parent != null && applyToParentQueue) { parent.apply(cluster, request); } @@ -1545,11 +1456,9 @@ public Resource getResourceLimitForAllUsers(String userName, } @Private - protected boolean canAssignToUser(Resource clusterResource, + protected synchronized boolean canAssignToUser(Resource clusterResource, String userName, Resource limit, FiCaSchedulerApp application, String nodePartition, ResourceLimits currentResourceLimits) { - try { - readLock.lock(); User user = getUser(userName); if (user == null) { if (LOG.isDebugEnabled()) { @@ -1595,9 +1504,6 @@ protected boolean canAssignToUser(Resource clusterResource, return false; } return true; - } finally { - readLock.unlock(); - } } private void updateSchedulerHealthForCompletedContainer( @@ -1628,12 +1534,9 @@ private void updateSchedulerHealthForCompletedContainer( * @param nodePartition * Partition */ - public void recalculateQueueUsageRatio(Resource clusterResource, + public synchronized void recalculateQueueUsageRatio(Resource clusterResource, String nodePartition) { - try { - writeLock.lock(); ResourceUsage queueResourceUsage = getQueueResourceUsage(); - if (nodePartition == null) { for (String partition : Sets.union( getQueueCapacities().getNodePartitionsSet(), @@ -1643,9 +1546,6 @@ public void recalculateQueueUsageRatio(Resource clusterResource, } else { usersManager.updateUsageRatio(nodePartition, clusterResource); } - } finally { - writeLock.unlock(); - } } @Override @@ -1659,8 +1559,7 @@ public void completedContainer(Resource clusterResource, if (application != null) { boolean removed = false; 
// Careful! Locking order is important! - writeLock.lock(); - try { + synchronized (this){ Container container = rmContainer.getContainer(); // Inform the application & the node // Note: It's safe to assume that all state changes to RMContainer @@ -1685,11 +1584,8 @@ public void completedContainer(Resource clusterResource, releaseResource(clusterResource, application, container.getResource(), node.getPartition(), rmContainer); } - } finally { - writeLock.unlock(); } - if (removed) { // Inform the parent queue _outside_ of the leaf-queue lock getParent().completedContainer(clusterResource, application, node, @@ -1702,11 +1598,9 @@ public void completedContainer(Resource clusterResource, new KillableContainer(rmContainer, node.getPartition(), queueName)); } - void allocateResource(Resource clusterResource, + synchronized void allocateResource(Resource clusterResource, SchedulerApplicationAttempt application, Resource resource, String nodePartition, RMContainer rmContainer) { - try { - writeLock.lock(); super.allocateResource(clusterResource, resource, nodePartition); // handle ignore exclusivity container @@ -1742,16 +1636,11 @@ void allocateResource(Resource clusterResource, + numContainers + " headroom = " + application.getHeadroom() + " user-resources=" + user.getUsed()); } - } finally { - writeLock.unlock(); - } } - void releaseResource(Resource clusterResource, + synchronized void releaseResource(Resource clusterResource, FiCaSchedulerApp application, Resource resource, String nodePartition, RMContainer rmContainer) { - try { - writeLock.lock(); super.releaseResource(clusterResource, resource, nodePartition); // handle ignore exclusivity container @@ -1782,9 +1671,6 @@ void releaseResource(Resource clusterResource, + numContainers + " user=" + userName + " user-resources=" + user.getUsed()); } - } finally { - writeLock.unlock(); - } } private void updateCurrentResourceLimits( @@ -1804,10 +1690,8 @@ private void updateCurrentResourceLimits( } @Override - public void updateClusterResource(Resource clusterResource, + public synchronized void updateClusterResource(Resource clusterResource, ResourceLimits currentResourceLimits) { - try { - writeLock.lock(); updateCurrentResourceLimits(currentResourceLimits, clusterResource); lastClusterResource = clusterResource; @@ -1838,9 +1722,6 @@ public void updateClusterResource(Resource clusterResource, RMNodeLabelsManager.NO_LABEL, SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, null); } - } finally { - writeLock.unlock(); - } } @Override @@ -1885,15 +1766,12 @@ public void recoverContainer(Resource clusterResource, return; } // Careful! Locking order is important! - try { - writeLock.lock(); + synchronized (this) { FiCaSchedulerNode node = scheduler.getNode( rmContainer.getContainer().getNodeId()); allocateResource(clusterResource, attempt, rmContainer.getContainer().getResource(), node.getPartition(), rmContainer); - } finally { - writeLock.unlock(); } getParent().recoverContainer(clusterResource, attempt, rmContainer); @@ -1946,11 +1824,9 @@ public void recoverContainer(Resource clusterResource, * excessive preemption. 
   * @return Total pending resource considering user limit
   */
-  public Resource getTotalPendingResourcesConsideringUserLimit(
+  public synchronized Resource getTotalPendingResourcesConsideringUserLimit(
       Resource clusterResources, String partition,
       boolean deductReservedFromPending) {
-    try {
-      readLock.lock();
     Map<String, Resource> userNameToHeadroom = new HashMap<>();
     Resource totalPendingConsideringUserLimit = Resource.newInstance(0, 0);
@@ -1984,17 +1860,11 @@ public Resource getTotalPendingResourcesConsideringUserLimit(
           minpendingConsideringUserLimit);
     }
     return totalPendingConsideringUserLimit;
-    } finally {
-      readLock.unlock();
-    }
   }
 
   @Override
-  public void collectSchedulerApplications(
+  public synchronized void collectSchedulerApplications(
       Collection<ApplicationAttemptId> apps) {
-    try {
-      readLock.lock();
     for (FiCaSchedulerApp pendingApp : pendingOrderingPolicy
         .getSchedulableEntities()) {
       apps.add(pendingApp.getApplicationAttemptId());
@@ -2002,10 +1872,6 @@ public void collectSchedulerApplications(
     for (FiCaSchedulerApp app : orderingPolicy.getSchedulableEntities()) {
       apps.add(app.getApplicationAttemptId());
     }
-    } finally {
-      readLock.unlock();
-    }
   }
 
   @Override
@@ -2050,12 +1916,9 @@ public void detachContainer(Resource clusterResource,
    * @return all ignored partition exclusivity RMContainers in the LeafQueue,
    *         this will be used by preemption policy.
    */
-  public Map<String, TreeSet<RMContainer>>
+  public synchronized Map<String, TreeSet<RMContainer>>
   getIgnoreExclusivityRMContainers() {
     Map<String, TreeSet<RMContainer>> clonedMap = new HashMap<>();
-    try {
-      readLock.lock();
-
     for (Map.Entry<String, TreeSet<RMContainer>> entry
         : ignorePartitionExclusivityRMContainers.entrySet()) {
       clonedMap.put(entry.getKey(), new TreeSet<>(entry.getValue()));
@@ -2063,9 +1926,6 @@
 
     return clonedMap;
-    } finally {
-      readLock.unlock();
-    }
   }
 
   public void setCapacity(float capacity) {
@@ -2102,18 +1962,13 @@ public void setMaxAMResourcePerQueuePercent(
     return orderingPolicy;
   }
 
-  void setOrderingPolicy(
+  synchronized void setOrderingPolicy(
      OrderingPolicy<FiCaSchedulerApp> orderingPolicy) {
-    try {
-      writeLock.lock();
     if (null != this.orderingPolicy) {
       orderingPolicy.addAllSchedulableEntities(
           this.orderingPolicy.getSchedulableEntities());
     }
     this.orderingPolicy = orderingPolicy;
-    } finally {
-      writeLock.unlock();
-    }
   }
 
   @Override
@@ -2121,10 +1976,9 @@ public Priority getDefaultApplicationPriority() {
     return defaultAppPriorityPerQueue;
   }
 
-  public void updateApplicationPriority(SchedulerApplication<FiCaSchedulerApp> app,
+  public synchronized void updateApplicationPriority(SchedulerApplication<FiCaSchedulerApp> app,
       Priority newAppPriority) {
-    try {
-      writeLock.lock();
+
     FiCaSchedulerApp attempt = app.getCurrentAppAttempt();
     boolean isActive = orderingPolicy.removeSchedulableEntity(attempt);
     if (!isActive) {
@@ -2138,9 +1992,6 @@ public void updateApplicationPriority(SchedulerApplication<FiCaSchedulerApp> app
     } else {
       pendingOrderingPolicy.addSchedulableEntity(attempt);
     }
-    } finally {
-      writeLock.unlock();
-    }
   }
 
   public OrderingPolicy<FiCaSchedulerApp>
@@ -2174,17 +2025,12 @@ public Resource getClusterResource() {
   }
 
   @Override
-  public void stopQueue() {
-    try {
-      writeLock.lock();
+  public synchronized void stopQueue() {
     if (getNumApplications() > 0) {
       updateQueueState(QueueState.DRAINING);
     } else {
       updateQueueState(QueueState.STOPPED);
     }
-    } finally {
-      writeLock.unlock();
-    }
   }
 
   /**
-- 
2.23.0.windows.1
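Note for reviewers: the minimal sketch below contrasts the two locking styles this patch converts between; it is an illustrative, self-contained example, and the class and member names (QueueStatsWithLock, QueueStatsSynchronized, pending, active) are hypothetical, not taken from the patch.

import java.util.concurrent.locks.ReentrantReadWriteLock;

// The style the patch removes: an explicit read/write lock, where multiple
// readers may hold the read lock concurrently.
class QueueStatsWithLock {
  private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
  private int pending;
  private int active;

  // Read path: taken under the shared read lock.
  public int getNumApplications() {
    try {
      lock.readLock().lock();
      return pending + active;
    } finally {
      lock.readLock().unlock();
    }
  }

  // Write path: the write lock is exclusive.
  public void activateOne() {
    try {
      lock.writeLock().lock();
      pending--;
      active++;
    } finally {
      lock.writeLock().unlock();
    }
  }
}

// The style the patch introduces, following the Hadoop 2.7 code: every
// path, readers included, serializes on the instance monitor. A read path
// accidentally left unsynchronized would observe the counters mid-update,
// which is why the getNumApplications hunks above must also be marked
// synchronized rather than merely dropping the lock calls.
class QueueStatsSynchronized {
  private int pending;
  private int active;

  public synchronized int getNumApplications() {
    return pending + active;
  }

  public synchronized void activateOne() {
    pending--;
    active++;
  }
}

The trade-off, under these assumptions: synchronized methods give up reader concurrency on read-heavy paths (metrics, queue info queries) but simplify the code and remove the risk of a missed unlock() on an early-return or exception path.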