diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java index fe61ddd..e1256bd 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java @@ -83,7 +83,7 @@ // Preemption related variables private Resource fairshareStarvation = Resources.none(); private Resource minshareStarvation = Resources.none(); - private Resource preemptedResources = Resources.createResource(0); + private Resource preemptedResources = Resources.clone(Resources.none()); private final Set containersToPreempt = new HashSet<>(); private long lastTimeAtFairShare; @@ -149,7 +149,7 @@ void containerCompleted(RMContainer rmContainer, // Remove from the list of containers liveContainers.remove(rmContainer.getContainerId()); - containersToPreempt.remove(rmContainer); + removePreemption(rmContainer); Resource containerResource = rmContainer.getContainer().getResource(); RMAuditLogger.logSuccess(getUser(), AuditConstants.RELEASE_CONTAINER, @@ -497,6 +497,12 @@ void addPreemption(RMContainer container) { Resources.addTo(preemptedResources, container.getAllocatedResource()); } + void removePreemption(RMContainer container) { + Resources.subtractFrom(preemptedResources, + container.getAllocatedResource()); + containersToPreempt.remove(container); + } + Set getPreemptionContainers() { return containersToPreempt; } @@ -506,18 +512,6 @@ private Resource getPreemptedResources() { return 
preemptedResources; } - void resetPreemptedResources() { - preemptedResources = Resources.createResource(0); - for (RMContainer container : getPreemptionContainers()) { - Resources.addTo(preemptedResources, container.getAllocatedResource()); - } - } - - void clearPreemptedResources() { - preemptedResources.setMemorySize(0); - preemptedResources.setVirtualCores(0); - } - boolean canContainerBePreempted(RMContainer container) { // Sanity check that the app owns this container if (!getLiveContainersMap().containsKey(container.getContainerId()) && @@ -527,11 +521,17 @@ boolean canContainerBePreempted(RMContainer container) { return false; } + if (containersToPreempt.contains(container)) { + // The container is already under consideration for preemption + return false; + } + // Check if any of the parent queues are not preemptable // TODO (KK): Propagate the "preemptable" flag all the way down to the app // to avoid recursing up every time. - FSQueue queue = getQueue(); - while (!queue.getQueueName().equals("root")) { + for (FSQueue queue = getQueue(); + !queue.getQueueName().equals("root"); + queue = queue.getParent()) { if (!queue.isPreemptable()) { return false; } @@ -539,8 +539,12 @@ boolean canContainerBePreempted(RMContainer container) { // Check if the app's allocation will be over its fairshare even // after preempting this container + Resource currentUsage = getResourceUsage(); + Resource fairshare = getFairShare(); + Resource overFairShareBy = Resources.subtract(currentUsage, fairshare); + return (Resources.fitsIn(container.getAllocatedResource(), - Resources.subtract(getResourceUsage(), getFairShare()))); + overFairShareBy)); } /** diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java index 905b6f2..853fad4 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java @@ -83,8 +83,8 @@ public void run() { * @param starvedApp * @return */ - private List<RMContainer> identifyContainersToPreempt(FSAppAttempt - starvedApp) { + private List<RMContainer> identifyContainersToPreempt( + FSAppAttempt starvedApp) { List<RMContainer> containers = new ArrayList<>(); // return value // Find the nodes that match the next resource request @@ -113,13 +113,22 @@ public void run() { // is okay to unreserve it if we find enough resources. 
} + // Figure out list of containers to consider + List<RMContainer> containersToCheck = + node.getCopiedListOfRunningContainers(); + containersToCheck.removeAll(node.getContainersForPreemption()); + // Initialize potential with unallocated resources Resource potential = Resources.clone(node.getUnallocatedResource()); - for (RMContainer container : node.getCopiedListOfRunningContainers()) { + for (RMContainer container : containersToCheck) { FSAppAttempt app = scheduler.getSchedulerApp(container.getApplicationAttemptId()); + if (app.canContainerBePreempted(container)) { + // Flag container for preemption + containers.add(container); + node.addContainerForPreemption(container); Resources.addTo(potential, container.getAllocatedResource()); } @@ -166,6 +176,10 @@ public void run() { LOG.info("Killing container " + container); scheduler.completedContainer( container, status, RMContainerEventType.KILL); + + FSSchedulerNode containerNode = (FSSchedulerNode) + scheduler.getNodeTracker().getNode(container.getAllocatedNode()); + containerNode.removeContainerForPreemption(container); } } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSSchedulerNode.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSSchedulerNode.java index 024ec67..70cf036 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSSchedulerNode.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSSchedulerNode.java @@ -29,6 +29,9 @@ import
org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerRequestKey; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import java.util.HashSet; +import java.util.Set; + @Private @Unstable public class FSSchedulerNode extends SchedulerNode { @@ -36,6 +39,7 @@ private static final Log LOG = LogFactory.getLog(FSSchedulerNode.class); private FSAppAttempt reservedAppSchedulable; + private Set containersForPreemption = new HashSet<>(); public FSSchedulerNode(RMNode node, boolean usePortForNodeName) { super(node, usePortForNodeName); @@ -103,4 +107,15 @@ public synchronized FSAppAttempt getReservedAppSchedulable() { return reservedAppSchedulable; } + public void addContainerForPreemption(RMContainer container) { + containersForPreemption.add(container); + } + + public Set getContainersForPreemption() { + return containersForPreemption; + } + + public void removeContainerForPreemption(RMContainer container) { + containersForPreemption.remove(container); + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairSchedulerPreemption.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairSchedulerPreemption.java new file mode 100644 index 0000000..3e30169 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairSchedulerPreemption.java @@ -0,0 +1,322 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair; + +import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.server.resourcemanager.MockNodes; +import org.apache.hadoop.yarn.server.resourcemanager.MockRM; +import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; +import org.apache.hadoop.yarn.util.resource.Resources; +import org.junit.After; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +/** + * Tests to verify fairshare and minshare preemption, using parameterization. 
+ */ +@RunWith(Parameterized.class) +public class TestFairSchedulerPreemption extends FairSchedulerTestBase { + private static final File ALLOC_FILE = new File(TEST_DIR, "test-queues"); + + // Node Capacity = NODE_CAPACITY_MULTIPLE * (1 GB or 1 vcore) + private static final int NODE_CAPACITY_MULTIPLE = 4; + + private final boolean fairsharePreemption; + + // App that takes up the entire cluster + private FSAppAttempt app1; + + // Starving app that is expected to instigate preemption + private FSAppAttempt app2; + + private final List rmNodes = new ArrayList<>(); + + @Parameterized.Parameters + public static Collection getParameters() { + return Arrays.asList(new Boolean[][] { + {true}, {false}}); + } + + public TestFairSchedulerPreemption(Boolean fairshare) throws IOException { + fairsharePreemption = fairshare; + if (fairshare) { + writeFairshareAllocFile(); + } else { + writeMinshareAllocFile(); + } + } + + @Before + public void setup() { + createConfiguration(); + conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, + ALLOC_FILE.getAbsolutePath()); + conf.setBoolean(FairSchedulerConfiguration.PREEMPTION, true); + conf.setFloat(FairSchedulerConfiguration.PREEMPTION_THRESHOLD, 0f); + conf.setInt(FairSchedulerConfiguration.WAIT_TIME_BEFORE_KILL, 0); + } + + @After + public void teardown() { + ALLOC_FILE.delete(); + conf = null; + if (resourceManager != null) { + resourceManager.stop(); + resourceManager = null; + } + } + + private void writeFairshareAllocFile() throws IOException { + /* + * Queue hierarchy: + * root + * |--- allowed + * |--- child-1 + * |--- child-2 + * |--- disallowed + * |--- child-1 + * |--- child-2 + */ + PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE)); + out.println(""); + out.println(""); + + out.println(""); + out.println("1" + + ""); + out.println("0" + + ""); + + out.println(""); + out.println(""); + + out.println(""); + + // Queue with fairshare preemption enabled + out.println(""); + out.println("1" + + ""); + 
out.println("0" + + ""); + out.println("false" + + ""); + + out.println(""); + out.println(""); + + out.println(""); + + out.println(""); + out.close(); + + assertTrue("Allocation file does not exist, not running the test", + ALLOC_FILE.exists()); + } + + private void writeMinshareAllocFile() throws IOException { + /* + * Queue hierarchy: + * root + * |--- allowed + * |--- child-1 + * |--- child-2 + * |--- disallowed + * |--- child-1 + * |--- child-2 + */ + PrintWriter out = new PrintWriter(new FileWriter(ALLOC_FILE)); + out.println(""); + out.println(""); + + out.println(""); + out.println("0" + + ""); + + out.println(""); + out.println("4096mb,4vcores"); + out.println(""); + + out.println(""); + out.println("4096mb,4vcores"); + out.println(""); + + out.println(""); + + out.println(""); + out.println("0" + + ""); + out.println("false" + + ""); + + out.println(""); + out.println("4096mb,4vcores"); + out.println(""); + + out.println(""); + out.println("4096mb,4vcores"); + out.println(""); + + out.println(""); + + out.println(""); + out.close(); + + assertTrue("Allocation file does not exist, not running the test", + ALLOC_FILE.exists()); + } + + private void setupCluster() throws IOException { + resourceManager = new MockRM(conf); + resourceManager.start(); + scheduler = (FairScheduler) resourceManager.getResourceScheduler(); + + // Create and add two nodes to the cluster + addNode(NODE_CAPACITY_MULTIPLE * 1024, NODE_CAPACITY_MULTIPLE); + addNode(NODE_CAPACITY_MULTIPLE * 1024, NODE_CAPACITY_MULTIPLE); + } + + private void addNode(int memory, int cores) { + int id = rmNodes.size() + 1; + RMNode node = + MockNodes.newNodeInfo(1, Resources.createResource(memory, cores), id, + "127.0.0." 
+ id); + scheduler.handle(new NodeAddedSchedulerEvent(node)); + rmNodes.add(node); + } + + private void sendNodeUpdateEvents() { + for (RMNode node : rmNodes) { + NodeUpdateSchedulerEvent nodeUpdateSchedulerEvent = + new NodeUpdateSchedulerEvent(node); + for (int i = 0; i < NODE_CAPACITY_MULTIPLE; i++) { + scheduler.handle(nodeUpdateSchedulerEvent); + } + } + } + + /** + * Submit application to queue1 and take over the entire cluster. Submit + * application with larger containers to queue2 that requires preemption + * from the first application. + * + * @param queue1 first queue + * @param queue2 second queue + * @throws InterruptedException - if interrupted while waiting + */ + private void submitApps(String queue1, String queue2) + throws InterruptedException { + // Create an app that takes up all the resources on the cluster + ApplicationAttemptId appAttemptId1 + = createSchedulingRequest(1024, 1, queue1, "default", + NODE_CAPACITY_MULTIPLE * rmNodes.size()); + app1 = scheduler.getSchedulerApp(appAttemptId1); + scheduler.update(); + sendNodeUpdateEvents(); + assertEquals(8, app1.getLiveContainers().size()); + + // Create an app that takes up all the resources on the cluster + ApplicationAttemptId appAttemptId2 + = createSchedulingRequest(2048, 2, queue2, "default", + NODE_CAPACITY_MULTIPLE * rmNodes.size() / 2); + app2 = scheduler.getSchedulerApp(appAttemptId2); + + // Sleep long enough to pass + Thread.sleep(10); + + scheduler.update(); + } + + private void verifyPreemption() throws InterruptedException { + // Sleep long enough for four containers to be preempted. Note that the + // starved app must be queued four times for containers to be preempted. 
+ for (int i = 0; i < 10000; i++) { + if (app1.getLiveContainers().size() == 4) { + break; + } + Thread.sleep(10); + } + + // Verify the right amount of containers are preempted from app1 + assertEquals(4, app1.getLiveContainers().size()); + + sendNodeUpdateEvents(); + + // Verify the preempted containers are assigned to app2 + assertEquals(2, app2.getLiveContainers().size()); + } + + private void verifyNoPreemption() throws InterruptedException { + // Sleep long enough to ensure not even one container is preempted. + for (int i = 0; i < 600; i++) { + if (app1.getLiveContainers().size() != 8) { + break; + } + Thread.sleep(10); + } + assertEquals(8, app1.getLiveContainers().size()); + } + + @Test + public void testPreemptionWithinSameLeafQueue() throws Exception { + setupCluster(); + String queue = "root.allowed.child-1"; + submitApps(queue, queue); + if (fairsharePreemption) { + verifyPreemption(); + } else { + verifyNoPreemption(); + } + } + + @Test + public void testPreemptionBetweenTwoSiblingLeafQueues() throws Exception { + setupCluster(); + submitApps("root.allowed.child-1", "root.allowed.child-2"); + verifyPreemption(); + } + + @Test + public void testPreemptionBetweenNonSiblingQueues() throws Exception { + setupCluster(); + submitApps("root.allowed.child-1", "root.disallowed.child-1"); + verifyPreemption(); + } + + @Test + public void testNoPreemptionFromDisallowedQueue() throws Exception { + setupCluster(); + submitApps("root.disallowed.child-1", "root.allowed.child-1"); + verifyNoPreemption(); + } +}