From 1ec61de8dee2c00e0c69cf06e0d75837cacb1dc2 Mon Sep 17 00:00:00 2001 From: Sunil G Date: Fri, 15 Jun 2018 00:32:51 +0530 Subject: [PATCH] YARN-8423 --- .../linux/resources/gpu/GpuResourceAllocator.java | 58 ++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java index 5bdffc369b2..f32ccad161c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java @@ -29,6 +29,7 @@ import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.AssignedGpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; @@ -36,10 +37,8 @@ import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; -import java.util.Collection; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; 
import java.util.Iterator; import java.util.List; import java.util.Map; @@ -54,6 +53,7 @@ */ public class GpuResourceAllocator { final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class); + private static final int WAIT_MS_PER_LOOP = 1000; private Set allowedGpuDevices = new TreeSet<>(); private Map usedDevices = new TreeMap<>(); @@ -168,13 +168,61 @@ public static int getRequestedGpus(Resource requestedResource) { * @return allocation results. * @throws ResourceHandlerException When failed to assign GPUs. */ - public synchronized GpuAllocation assignGpus(Container container) + public GpuAllocation assignGpus(Container container) + throws ResourceHandlerException { + GpuAllocation allocation = internalAssignGpus(container); + + // Wait for a maximum of 120 seconds for GPU devices which are held by + // containers that are yet to release them. + final int timeoutMsecs = 120 * WAIT_MS_PER_LOOP; + int timeWaiting = 0; + while (allocation == null) { + if (timeWaiting >= timeoutMsecs) { + break; + } + + // Sleep for WAIT_MS_PER_LOOP (1 second) so that releasing containers can + // free their GPU devices, then retry the allocation. + try { + LOG.info("Container : " + container.getContainerId() + + " is waiting for free GPU devices."); + Thread.sleep(WAIT_MS_PER_LOOP); + timeWaiting += WAIT_MS_PER_LOOP; + allocation = internalAssignGpus(container); + } catch (InterruptedException e) { + // Restore the interrupt status for callers and stop waiting. + Thread.currentThread().interrupt(); + break; + } + } + + // Never hand a null allocation back to the caller: fail explicitly when + // the wait timed out or was interrupted. + if (allocation == null) { + String message = "Could not get valid GPU device for container '" + + container.getContainerId() + + "' as some other containers might not be releasing GPUs."; + LOG.warn(message); + throw new ResourceHandlerException(message); + } + return allocation; + } + + private synchronized GpuAllocation internalAssignGpus(Container container) throws ResourceHandlerException { Resource requestedResource = container.getResource(); ContainerId containerId = container.getContainerId(); int numRequestedGpuDevices = getRequestedGpus(requestedResource); // Assign Gpus to container if requested some. if (numRequestedGpuDevices > 0) { + if (numRequestedGpuDevices > getAvailableGpus()) { + // If there are some devices which are getting released, wait for few + // seconds to get it. 
+ if (numRequestedGpuDevices <= getReleasingGpus() + getAvailableGpus()) { + return null; + } + } + if (numRequestedGpuDevices > getAvailableGpus()) { throw new ResourceHandlerException( getResourceHandlerExceptionMessage(numRequestedGpuDevices, @@ -211,6 +247,24 @@ public synchronized GpuAllocation assignGpus(Container container) return new GpuAllocation(null, allowedGpuDevices); } + /** + * Count the entries of usedDevices whose owning container is in the + * KILLING state. + * @return number of such entries. + */ + private synchronized int getReleasingGpus() { + int releasingGpus = 0; + for (ContainerId containerId : usedDevices.values()) { + Container container = nmContext.getContainers().get(containerId); + // Containers that are no longer known to the NM context are skipped. + if (container != null + && container.getContainerState() == ContainerState.KILLING) { + releasingGpus++; + } + } + return releasingGpus; + } + /** * Clean up all Gpus assigned to containerId * @param containerId containerId -- 2.14.3 (Apple Git-98)