From cc4f2c805aeb7823b44fb6f6d2e6288b14cf6922 Mon Sep 17 00:00:00 2001 From: Adam Antal Date: Thu, 11 Apr 2019 16:57:29 +0200 Subject: [PATCH] YARN-9337. GPU auto-discovery script runs even when the resource is given by hand --- .../resourceplugin/gpu/GpuDiscoverer.java | 60 +++++++++++-------- .../resourceplugin/gpu/TestGpuDiscoverer.java | 4 +- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java index d4734183e8e9c9ff054680e905c535eb35017564..4d08c2a5f419c2aebb6b7284dc432d171f770d9d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -70,6 +70,8 @@ private int numOfErrorExecutionSinceLastSucceed = 0; private GpuDeviceInformation lastDiscoveredGpuInformation = null; + private List gpuDevicesFromUser; + private void validateConfOrThrowException() throws YarnException { if (conf == null) { throw new YarnException("Please initialize (call initialize) before use " @@ -142,6 +144,14 @@ synchronized GpuDeviceInformation getGpuDeviceInformation() } } + private boolean isAutoDiscoveryEnabled() { + String allowedDevicesStr = conf.get( + YarnConfiguration.NM_GPU_ALLOWED_DEVICES, + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); + return allowedDevicesStr.equals( + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); + } + 
/** * Get list of GPU devices usable by YARN. * @@ -152,15 +162,13 @@ synchronized GpuDeviceInformation getGpuDeviceInformation() throws YarnException { validateConfOrThrowException(); - String allowedDevicesStr = conf.get( - YarnConfiguration.NM_GPU_ALLOWED_DEVICES, - YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); - - if (allowedDevicesStr.equals( - YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) { + if (isAutoDiscoveryEnabled()) { return parseGpuDevicesFromAutoDiscoveredGpuInfo(); } else { - return parseGpuDevicesFromUserDefinedValues(allowedDevicesStr); + if (gpuDevicesFromUser == null) { + gpuDevicesFromUser = parseGpuDevicesFromUserDefinedValues(); + } + return gpuDevicesFromUser; } } @@ -192,16 +200,16 @@ synchronized GpuDeviceInformation getGpuDeviceInformation() } /** - * @param devices allowed devices coming from the config. - * Individual devices should be separated by commas. - *
The format of individual devices should be: - * <index:><minorNumber> * @return List of GpuDevices * @throws YarnException when a GPU device is defined as a duplicate. * The first duplicate GPU device will be added to the exception message. */ - private List parseGpuDevicesFromUserDefinedValues(String devices) + private List parseGpuDevicesFromUserDefinedValues() throws YarnException { + String devices = conf.get( + YarnConfiguration.NM_GPU_ALLOWED_DEVICES, + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); + if (devices.trim().isEmpty()) { throw GpuDeviceSpecificationException.createWithEmptyValueSpecified(); } @@ -243,19 +251,21 @@ private GpuDevice parseGpuDevice(String device, String[] splitByColon, public synchronized void initialize(Configuration config) throws YarnException { this.conf = config; - numOfErrorExecutionSinceLastSucceed = 0; - lookUpAutoDiscoveryBinary(config); - - // Try to discover GPU information once and print - try { - LOG.info("Trying to discover GPU information ..."); - GpuDeviceInformation info = getGpuDeviceInformation(); - LOG.info("Discovered GPU information: " + info.toString()); - } catch (YarnException e) { - String msg = - "Failed to discover GPU information from system, exception message:" - + e.getMessage() + " continue..."; - LOG.warn(msg); + if (isAutoDiscoveryEnabled()) { + numOfErrorExecutionSinceLastSucceed = 0; + lookUpAutoDiscoveryBinary(config); + + // Try to discover GPU information once and print + try { + LOG.info("Trying to discover GPU information ..."); + GpuDeviceInformation info = getGpuDeviceInformation(); + LOG.info("Discovered GPU information: " + info.toString()); + } catch (YarnException e) { + String msg = + "Failed to discover GPU information from system, exception message:" + + e.getMessage() + " continue..."; + LOG.warn(msg); + } } } diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java index 0f67214348235d298e2ad011a92896f6869191a8..a70e668146e98f54360c544bc734c2668087e93c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -50,7 +50,9 @@ import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -import static org.mockito.Mockito.*; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.verify; public class TestGpuDiscoverer { private static final Logger LOG = LoggerFactory.getLogger( -- 2.21.0