diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java index 6d158be71a8..869917ca5d1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -51,13 +51,16 @@ private GpuResourceAllocator gpuAllocator; private CGroupsHandler cGroupsHandler; private PrivilegedOperationExecutor privilegedOperationExecutor; + private final GpuDiscoverer gpuDiscoverer; public GpuResourceHandlerImpl(Context nmContext, CGroupsHandler cGroupsHandler, - PrivilegedOperationExecutor privilegedOperationExecutor) { + PrivilegedOperationExecutor privilegedOperationExecutor, + GpuDiscoverer gpuDiscoverer) { this.cGroupsHandler = cGroupsHandler; this.privilegedOperationExecutor = privilegedOperationExecutor; - gpuAllocator = new GpuResourceAllocator(nmContext); + this.gpuAllocator = new GpuResourceAllocator(nmContext); + this.gpuDiscoverer = gpuDiscoverer; } @Override @@ -65,7 +68,7 @@ public GpuResourceHandlerImpl(Context nmContext, throws ResourceHandlerException { List usableGpus; try { - usableGpus = GpuDiscoverer.getInstance().getGpusUsableByYarn(); + usableGpus = gpuDiscoverer.getGpusUsableByYarn(); if (usableGpus == null || usableGpus.isEmpty()) { String message = "GPU is enabled on the NodeManager, but couldn't find " + "any usable GPU devices, please double check configuration!"; @@ -175,4 +178,11 @@ public GpuResourceAllocator getGpuAllocator() { public List teardown() throws ResourceHandlerException { return null; } + + @Override + public String toString() { + return GpuResourceHandlerImpl.class.getName() + "{" + + "gpuAllocator=" + gpuAllocator + + '}'; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java index f28aad206a6..224dcc3157f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java @@ -25,6 +25,8 @@ import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuNodeResourceUpdateHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -77,10 +79,11 @@ public synchronized void initialize(Context context) ResourcePlugin plugin = null; if (resourceName.equals(GPU_URI)) { - plugin = new GpuResourcePlugin(); - } - - if (resourceName.equals(FPGA_URI)) { + final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer(); + final GpuNodeResourceUpdateHandler updateHandler = + new GpuNodeResourceUpdateHandler(gpuDiscoverer); + plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer); + } else if (resourceName.equals(FPGA_URI)) { plugin = new FpgaResourcePlugin(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index 92792b7ba64..334a86c2c82 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -58,11 +58,6 @@ // command should not run more than 10 sec. private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; private static final int MAX_REPEATED_ERROR_ALLOWED = 10; - private static GpuDiscoverer instance; - - static { - instance = new GpuDiscoverer(); - } private Configuration conf = null; private String pathOfGpuBinary = null; @@ -293,8 +288,4 @@ public synchronized void initialize(Configuration conf) { String getPathOfGpuBinary() { return pathOfGpuBinary; } - - public static GpuDiscoverer getInstance() { - return instance; - } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java index 8b19048c8be..4b2258d557f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -35,16 +35,20 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { private static final Logger LOG = LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class); + private final GpuDiscoverer gpuDiscoverer; + + public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) { + this.gpuDiscoverer = gpuDiscoverer; + } @Override public void updateConfiguredResource(Resource res) throws YarnException { LOG.info("Initializing configured GPU resources for the NodeManager."); - List usableGpus = GpuDiscoverer.getInstance() - .getGpusUsableByYarn(); + List usableGpus = gpuDiscoverer.getGpusUsableByYarn(); if (usableGpus == null || usableGpus.isEmpty()) { String message = "GPU is enabled, " + - "but couldn't find any usable GPUs on the NodeManager!"; + "but could not find any usable GPUs on the NodeManager!"; LOG.error(message); // No gpu can be used by YARN. throw new YarnException(message); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java index f28218de224..8b7970587ce 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java @@ -18,7 +18,6 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; -import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; @@ -34,18 +33,23 @@ import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; import java.util.List; -import java.util.Map; public class GpuResourcePlugin implements ResourcePlugin { + private final GpuNodeResourceUpdateHandler resourceDiscoverHandler; + private final GpuDiscoverer gpuDiscoverer; private GpuResourceHandlerImpl gpuResourceHandler = null; - private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null; private DockerCommandPlugin dockerCommandPlugin = null; + public GpuResourcePlugin(GpuNodeResourceUpdateHandler resourceDiscoverHandler, + GpuDiscoverer gpuDiscoverer) { + this.resourceDiscoverHandler = resourceDiscoverHandler; + this.gpuDiscoverer = gpuDiscoverer; + } + @Override public synchronized void initialize(Context context) throws YarnException { - resourceDiscoverHandler = new GpuNodeResourceUpdateHandler(); - GpuDiscoverer.getInstance().initialize(context.getConf()); - dockerCommandPlugin = + this.gpuDiscoverer.initialize(context.getConf()); + this.dockerCommandPlugin = GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin( context.getConf()); } @@ -56,7 +60,7 @@ public synchronized ResourceHandler createResourceHandler( PrivilegedOperationExecutor privilegedOperationExecutor) { if (gpuResourceHandler == null) { gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler, - privilegedOperationExecutor); + privilegedOperationExecutor, gpuDiscoverer); } return gpuResourceHandler; @@ -77,9 +81,9 @@ public DockerCommandPlugin getDockerCommandPluginInstance() { } @Override - public NMResourceInfo getNMResourceInfo() throws YarnException { + public synchronized NMResourceInfo getNMResourceInfo() throws YarnException { GpuDeviceInformation gpuDeviceInformation = - GpuDiscoverer.getInstance().getGpuDeviceInformation(); + gpuDiscoverer.getGpuDeviceInformation(); GpuResourceAllocator gpuResourceAllocator = gpuResourceHandler.getGpuAllocator(); List totalGpus = gpuResourceAllocator.getAllowedGpusCopy(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java index 9a8a4c9f284..0141c72af90 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -71,6 +71,7 @@ private GpuResourceHandlerImpl gpuResourceHandler; private NMStateStoreService mockNMStateStore; private ConcurrentHashMap runningContainersMap; + private GpuDiscoverer gpuDiscoverer; @Before public void setup() { @@ -85,8 +86,9 @@ public void setup() { runningContainersMap = new ConcurrentHashMap<>(); when(nmctx.getContainers()).thenReturn(runningContainersMap); + gpuDiscoverer = new GpuDiscoverer(); gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler, - mockPrivilegedExecutor); + mockPrivilegedExecutor, gpuDiscoverer); } @Test @@ -94,7 +96,7 @@ public void testBootStrap() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); verify(mockCGroupsHandler, times(1)).initializeCGroupController( @@ -157,7 +159,7 @@ private void commonTestAllocation(boolean dockerContainerEnabled) throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -246,7 +248,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -275,7 +277,7 @@ public void testAssignedGpuWillBeCleanedupWhenStoreOpFails() public void testAllocationWithoutAllowedGpus() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); try { gpuResourceHandler.bootstrap(conf); @@ -310,7 +312,7 @@ public void testAllocationWithoutAllowedGpus() throws Exception { public void testAllocationStored() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -352,11 +354,11 @@ public void testAllocationStoredWithNULLStateStore() throws Exception { GpuResourceHandlerImpl gpuNULLStateResourceHandler = new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, - mockPrivilegedExecutor); + mockPrivilegedExecutor, gpuDiscoverer); Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuNULLStateResourceHandler.bootstrap(conf); Assert.assertEquals(4, @@ -376,7 +378,7 @@ public void testAllocationStoredWithNULLStateStore() throws Exception { public void testRecoverResourceAllocation() throws Exception { Configuration conf = new YarnConfiguration(); conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); - GpuDiscoverer.getInstance().initialize(conf); + gpuDiscoverer.initialize(conf); gpuResourceHandler.bootstrap(conf); Assert.assertEquals(4,