diff --git hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml index 45aa868830c..e6dcefb2099 100644 --- hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml +++ hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml @@ -633,4 +633,12 @@ + + + + + + + + diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java index 530d8c91ae5..724fd7f14de 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/Resource.java @@ -18,10 +18,12 @@ package org.apache.hadoop.yarn.api.records; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import java.util.Map; import org.apache.commons.lang.NotImplementedException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.classification.InterfaceAudience.Public; @@ -232,6 +234,22 @@ public void setMemorySize(long memory) { return resources; } + /** + * Get a copy of the list of resource information; this will be used by JAXB. + * @return copy of the list of resources. + */ + @InterfaceAudience.Private + @InterfaceStability.Unstable + public List<ResourceInformation> getAllResourcesListCopy() { + List<ResourceInformation> list = new ArrayList<>(); + for (ResourceInformation i : resources) { + ResourceInformation ri = new ResourceInformation(); + ResourceInformation.copy(i, ri); + list.add(ri); + } + return list; + } + /** + * Get ResourceInformation for a specified resource. + * diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java index e8280ba9cac..a4c1f6c6de3 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceInformation.java @@ -18,10 +18,13 @@ package org.apache.hadoop.yarn.api.records; +import com.google.common.collect.ImmutableMap; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes; import org.apache.hadoop.yarn.util.UnitsConversionUtil; + +import java.util.Map; + /** * Class to encapsulate information about a Resource - the name of the resource, * the units(milli, micro, etc), the type(countable), and the value.
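A minimal usage sketch of the Resource#getAllResourcesListCopy() accessor added above (not part of the patch; the ResourceInfoDump class name is hypothetical). Because the method deep-copies each ResourceInformation, a JAXB/REST layer can marshal the returned list without aliasing the scheduler's live resources array:

import java.util.List;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceInformation;

public class ResourceInfoDump {
  // Prints every resource type of a Resource, e.g. "memory-mb=1024Mi";
  // mutating the returned copies leaves the original Resource untouched.
  public static void dump(Resource res) {
    for (ResourceInformation ri : res.getAllResourcesListCopy()) {
      System.out.println(ri.getName() + "=" + ri.getValue() + ri.getUnits());
    }
  }
}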
@@ -35,13 +38,20 @@ private long minimumAllocation; private long maximumAllocation; + // Known resource types public static final String MEMORY_URI = "memory-mb"; public static final String VCORES_URI = "vcores"; + public static final String GPU_URI = "yarn.io/gpu"; public static final ResourceInformation MEMORY_MB = ResourceInformation.newInstance(MEMORY_URI, "Mi"); public static final ResourceInformation VCORES = ResourceInformation.newInstance(VCORES_URI); + public static final ResourceInformation GPUS = + ResourceInformation.newInstance(GPU_URI); + + public static final Map<String, ResourceInformation> MANDATORY_RESOURCES = + ImmutableMap.of(MEMORY_URI, MEMORY_MB, VCORES_URI, VCORES, GPU_URI, GPUS); /** * Get the name for the resource. @@ -215,6 +225,12 @@ public static ResourceInformation newInstance(String name, String units, Long.MAX_VALUE); } + public static ResourceInformation newInstance(String name, String units, + long minRes, long maxRes) { + return ResourceInformation.newInstance(name, units, 0L, + ResourceTypes.COUNTABLE, minRes, maxRes); + } + public static ResourceInformation newInstance(String name, long value) { return ResourceInformation .newInstance(name, "", value, ResourceTypes.COUNTABLE, 0L, diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 6c65b197981..4bde72009ec 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1430,6 +1430,39 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_NETWORK_RESOURCE_OUTBOUND_BANDWIDTH_YARN_MBIT = NM_NETWORK_RESOURCE_PREFIX + "outbound-bandwidth-yarn-mbit"; + /** + * Prefix for computation resources such as GPU, FPGA, TPU, etc. + */ + @Private + public static final String NM_RESOURCE_PLUGINS = + NM_PREFIX + "resource-plugins"; + + /** + * Prefix for GPU configurations. Work in progress: this configuration + * parameter may be changed/removed in the future.
+ */ + @Private + public static final String NM_GPU_RESOURCE_PREFIX = + NM_RESOURCE_PLUGINS + ".gpu."; + + @Private + public static final String NM_GPU_ALLOWED_DEVICES = + NM_GPU_RESOURCE_PREFIX + "allowed-gpu-devices"; + @Private + public static final String AUTOMATICALLY_DISCOVER_GPU_DEVICES = "auto"; + + /** + * This setting controls where to find and how to invoke GPU binaries. + */ + @Private + public static final String NM_GPU_PATH_TO_EXEC = + NM_GPU_RESOURCE_PREFIX + "path-to-discovery-executables"; + + @Private + public static final String DEFAULT_NM_GPU_PATH_TO_EXEC = ""; + + /** NM Webapp address.**/ public static final String NM_WEBAPP_ADDRESS = NM_PREFIX + "webapp.address"; public static final int DEFAULT_NM_WEBAPP_PORT = 8042; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java index b1d0b754c4f..6e8eb8115c1 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/util/resource/ResourceUtils.java @@ -47,6 +47,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; + /** * Helper class to read the resource-types to be supported by the system. */ @@ -89,33 +91,32 @@ private static void checkMandatoryResources( */ String key = "memory"; if (resourceInformationMap.containsKey(key)) { - LOG.warn("Attempt to define resource '" + key + - "', but it is not allowed."); - throw new YarnRuntimeException("Attempt to re-define mandatory resource '" - + key + "'."); - } - - if (resourceInformationMap.containsKey(MEMORY)) { - ResourceInformation memInfo = resourceInformationMap.get(MEMORY); - String memUnits = ResourceInformation.MEMORY_MB.getUnits(); - ResourceTypes memType = ResourceInformation.MEMORY_MB.getResourceType(); - if (!memInfo.getUnits().equals(memUnits) || !memInfo.getResourceType() - .equals(memType)) { - throw new YarnRuntimeException( - "Attempt to re-define mandatory resource 'memory-mb'. It can only" - + " be of type 'COUNTABLE' and have units 'Mi'."); - } + LOG.warn( + "Attempt to define resource '" + key + "', but it is not allowed."); + throw new YarnRuntimeException( + "Attempt to re-define mandatory resource '" + key + "'."); } - if (resourceInformationMap.containsKey(VCORES)) { - ResourceInformation vcoreInfo = resourceInformationMap.get(VCORES); - String vcoreUnits = ResourceInformation.VCORES.getUnits(); - ResourceTypes vcoreType = ResourceInformation.VCORES.getResourceType(); - if (!vcoreInfo.getUnits().equals(vcoreUnits) || !vcoreInfo - .getResourceType().equals(vcoreType)) { - throw new YarnRuntimeException( - "Attempt to re-define mandatory resource 'vcores'.
It can only be" - + " of type 'COUNTABLE' and have units ''(no units)."); + for (Map.Entry<String, ResourceInformation> mandatoryResourceEntry : + ResourceInformation.MANDATORY_RESOURCES.entrySet()) { + key = mandatoryResourceEntry.getKey(); + ResourceInformation mandatoryRI = mandatoryResourceEntry.getValue(); + + ResourceInformation newDefinedRI = resourceInformationMap.get(key); + if (newDefinedRI != null) { + String expectedUnit = mandatoryRI.getUnits(); + ResourceTypes expectedType = mandatoryRI.getResourceType(); + String actualUnit = newDefinedRI.getUnits(); + ResourceTypes actualType = newDefinedRI.getResourceType(); + + if (!expectedUnit.equals(actualUnit) || !expectedType.equals( + actualType)) { + throw new YarnRuntimeException("Defined mandatory resource type=" + + key + " inside resource-types.xml, however its type or " + + "unit conflicts with the mandatory resource definition; " + + "expected type=" + expectedType + ", unit=" + expectedUnit + + "; actual type=" + actualType + ", actual unit=" + actualUnit); + } } } } @@ -200,9 +201,23 @@ static void validateNameOfResourceNameAndThrowException(String resourceName) } } - @VisibleForTesting - static void initializeResourcesMap(Configuration conf) { + /** + * Get maximum allocation from config. *THIS WILL NOT UPDATE INTERNAL DATA* + * @param conf config + * @return maximum allocation + */ + public static Resource fetchMaximumAllocationFromConfig(Configuration conf) { + Map<String, ResourceInformation> resourceInformationMap = + getResourceInformationMapFromConfig(conf); + Resource ret = Resource.newInstance(0, 0); + for (ResourceInformation entry : resourceInformationMap.values()) { + ret.setResourceValue(entry.getName(), entry.getMaximumAllocation()); + } + return ret; + } + private static Map<String, ResourceInformation> getResourceInformationMapFromConfig( + Configuration conf) { Map<String, ResourceInformation> resourceInformationMap = new HashMap<>(); String[] resourceNames = conf.getStrings(YarnConfiguration.RESOURCE_TYPES); @@ -248,6 +263,13 @@ static void initializeResourcesMap(Configuration conf) { setAllocationForMandatoryResources(resourceInformationMap, conf); + return resourceInformationMap; + } + + @VisibleForTesting + static void initializeResourcesMap(Configuration conf) { + Map<String, ResourceInformation> resourceInformationMap = + getResourceInformationMapFromConfig(conf); initializeResourcesFromResourceInformationMap(resourceInformationMap); } @@ -545,19 +567,8 @@ public static Resource getResourceTypesMinimumAllocation() { public static Resource getResourceTypesMaximumAllocation() { Resource ret = Resource.newInstance(0, 0); for (ResourceInformation entry : resourceTypesArray) { - String name = entry.getName(); - if (name.equals(ResourceInformation.MEMORY_MB.getName())) { - ret.setMemorySize(entry.getMaximumAllocation()); - } else if (name.equals(ResourceInformation.VCORES.getName())) { - Long tmp = entry.getMaximumAllocation(); - if (tmp > Integer.MAX_VALUE) { - tmp = (long) Integer.MAX_VALUE; - } - ret.setVirtualCores(tmp.intValue()); - continue; - } else { - ret.setResourceValue(name, entry.getMaximumAllocation()); - } + ret.setResourceValue(entry.getName(), + entry.getMaximumAllocation()); } return ret; } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 6d69a10032d..91935adce85 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -3442,6 +3442,45 @@ + <property> + <description> + When
yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices=auto is specified, + the YARN NodeManager needs to run a GPU discovery binary (currently only + nvidia-smi is supported) to get GPU-related information. + When this value is empty (default), the YARN NodeManager will try to locate + the discovery executable itself. + An example of the config value is: /usr/local/bin/nvidia-smi + </description> + <name>yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables</name> + <value></value> + </property> + + <property> + <description> + Enable additional discovery/isolation of resources on the NodeManager, + split by comma. By default, this is empty. Acceptable values: { "yarn.io/gpu" }. + </description> + <name>yarn.nodemanager.resource-plugins</name> + <value></value> + </property> + + <property> + <description> + Specify GPU devices which can be managed by the YARN NodeManager, split by comma. + The number of GPU devices will be reported to the RM to make scheduling decisions. + Set to auto (default) to let YARN automatically discover GPU resources from + the system. + Manually specify GPU devices if auto-detection fails or the admin + only wants a subset of GPU devices to be managed by YARN. GPU devices are identified + by their minor device numbers. A common approach to get the minor device numbers + of GPUs is using "nvidia-smi -q" and searching for "Minor Number" in the output. An + example of manual specification is "0,1,2,4", which allows the YARN NodeManager + to manage GPU devices with minor numbers 0/1/2/4. + </description> + <name>yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices</name> + <value>auto</value> + </property> + Provides an option for client to load supported resource types from RM diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java index 0e5e8a80d41..0ad029c67d3 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/resource/TestResourceUtils.java @@ -53,6 +53,23 @@ public ResourceFileInformation(String name, int count) { } } + public static void addNewTypesToResources(String... resourceTypes) { + // Initialize resource map + Map<String, ResourceInformation> riMap = new HashMap<>(); + + // Initialize mandatory resources + riMap.put(ResourceInformation.MEMORY_URI, ResourceInformation.MEMORY_MB); + riMap.put(ResourceInformation.VCORES_URI, ResourceInformation.VCORES); + + for (String newResource : resourceTypes) { + riMap.put(newResource, ResourceInformation + .newInstance(newResource, "", 0, ResourceTypes.COUNTABLE, 0, + Integer.MAX_VALUE)); + } + + ResourceUtils.initializeResourcesFromResourceInformationMap(riMap); + } + @Before public void setup() { ResourceUtils.resetResourceTypes(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java index 3b532c9b430..f43b1ee0105 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java @@ -113,9 +113,10 @@ public Configuration getConf() { * Run the executor initialization steps.
* Verify that the necessary configs and permissions are in place. * + * @param nmContext Context of NM * @throws IOException if initialization fails */ - public abstract void init() throws IOException; + public abstract void init(Context nmContext) throws IOException; /** * This function localizes the JAR file on-demand. diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java index a2d00a4cc24..a1c474f958d 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java @@ -34,6 +34,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; @@ -122,4 +123,6 @@ ContainerExecutor getContainerExecutor(); ContainerStateTransitionListener getContainerStateTransitionListener(); + + ResourcePluginManager getResourcePluginManager(); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java index ac88e8cd336..5772403567e 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java @@ -133,7 +133,7 @@ protected void setScriptExecutable(Path script, String owner) } @Override - public void init() throws IOException { + public void init(Context nmContext) throws IOException { // nothing to do or verify here } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index 64f3d58327e..da1989eaf5c 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -20,6 +20,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; +import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -282,7 +283,7 @@ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() { } @Override - public void init() throws IOException { + public void init(Context nmContext) throws IOException { Configuration conf = super.getConf(); // Send command to executor which will just start up, @@ -306,7 +307,7 @@ public void init() throws IOException { try { resourceHandlerChain = ResourceHandlerModule - .getConfiguredResourceHandlerChain(conf); + .getConfiguredResourceHandlerChain(conf, nmContext); if (LOG.isDebugEnabled()) { LOG.debug("Resource handler chain enabled = " + (resourceHandlerChain != null)); @@ -871,4 +872,9 @@ public void mountCgroups(List<String> cgroupKVs, String hierarchy) e); } } + + @VisibleForTesting + public ResourceHandler getResourceHandler() { + return resourceHandlerChain; + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 44133dfd59d..54a235c8762 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -18,23 +18,7 @@ package org.apache.hadoop.yarn.server.nodemanager; -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.ConcurrentSkipListMap; -import java.util.concurrent.atomic.AtomicBoolean; - -import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; -import org.apache.hadoop.yarn.state.MultiStateTransitionListener; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - +import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -65,12 +49,16 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport; import org.apache.hadoop.yarn.server.api.records.AppCollectorData; import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager; import org.apache.hadoop.yarn.server.nodemanager.collectormanager.NMCollectorService; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import
org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; @@ -78,14 +66,25 @@ import org.apache.hadoop.yarn.server.nodemanager.recovery.NMLeveldbStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; -import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM; import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer; +import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; +import org.apache.hadoop.yarn.state.MultiStateTransitionListener; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import com.google.common.annotations.VisibleForTesting; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.atomic.AtomicBoolean; public class NodeManager extends CompositeService implements EventHandler<NodeManagerEvent> { @@ -332,6 +331,18 @@ public static NodeHealthScriptRunner getNodeHealthScriptRunner(Configuration con nmCheckintervalTime, scriptTimeout, scriptArgs); } + @VisibleForTesting + protected ResourcePluginManager createResourcePluginManager() { + return new ResourcePluginManager(); + } + + @VisibleForTesting + protected ContainerExecutor createContainerExecutor(Configuration conf) { + return ReflectionUtils.newInstance( + conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, + DefaultContainerExecutor.class, ContainerExecutor.class), conf); + } + @Override protected void serviceInit(Configuration conf) throws Exception { rmWorkPreservingRestartEnabled = conf.getBoolean(YarnConfiguration @@ -357,11 +368,22 @@ protected void serviceInit(Configuration conf) throws Exception { this.aclsManager = new ApplicationACLsManager(conf); - ContainerExecutor exec = ReflectionUtils.newInstance( - conf.getClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, - DefaultContainerExecutor.class, ContainerExecutor.class), conf); + this.dirsHandler = new LocalDirsHandlerService(metrics); + + boolean isDistSchedulingEnabled = + conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED, + YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED); + + this.context = createNMContext(containerTokenSecretManager, + nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf); + + ResourcePluginManager pluginManager =
createResourcePluginManager(); + pluginManager.initialize(context); + ((NMContext)context).setResourcePluginManager(pluginManager); + + ContainerExecutor exec = createContainerExecutor(conf); try { - exec.init(); + exec.init(context); } catch (IOException e) { throw new YarnRuntimeException("Failed to initialize container executor", e); } @@ -371,19 +393,11 @@ protected void serviceInit(Configuration conf) throws Exception { // NodeManager level dispatcher this.dispatcher = createNMDispatcher(); - dirsHandler = new LocalDirsHandlerService(metrics); nodeHealthChecker = new NodeHealthCheckerService( getNodeHealthScriptRunner(conf), dirsHandler); addService(nodeHealthChecker); - boolean isDistSchedulingEnabled = - conf.getBoolean(YarnConfiguration.DIST_SCHEDULING_ENABLED, - YarnConfiguration.DEFAULT_DIST_SCHEDULING_ENABLED); - - this.context = createNMContext(containerTokenSecretManager, - nmTokenSecretManager, nmStore, isDistSchedulingEnabled, conf); - ((NMContext)context).setContainerExecutor(exec); @@ -457,6 +471,12 @@ protected void serviceStop() throws Exception { try { super.serviceStop(); DefaultMetricsSystem.shutdown(); + + // Cleanup ResourcePluginManager + ResourcePluginManager rpm = context.getResourcePluginManager(); + if (rpm != null) { + rpm.cleanup(); + } } finally { // YARN-3641: NM's services stop get failed shouldn't block the // release of NMLevelDBStore. @@ -604,6 +624,8 @@ protected void reregisterCollectors() { private ContainerStateTransitionListener containerStateTransitionListener; + private ResourcePluginManager resourcePluginManager; + public NMContext(NMContainerTokenSecretManager containerTokenSecretManager, NMTokenSecretManagerInNM nmTokenSecretManager, LocalDirsHandlerService dirsHandler, ApplicationACLsManager aclsManager, @@ -804,6 +826,15 @@ public void setContainerStateTransitionListener( ContainerStateTransitionListener transitionListener) { this.containerStateTransitionListener = transitionListener; } + + public ResourcePluginManager getResourcePluginManager() { + return resourcePluginManager; + } + + public void setResourcePluginManager( + ResourcePluginManager resourcePluginManager) { + this.resourcePluginManager = resourcePluginManager; + } } /** diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index ee85042979e..91217ddb463 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -33,6 +33,9 @@ import java.util.Random; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -178,14 +181,15 @@ protected void serviceInit(Configuration conf) throws Exception { long memoryMb = totalResource.getMemorySize(); float vMemToPMem = conf.getFloat( - YarnConfiguration.NM_VMEM_PMEM_RATIO, - 
YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO); + YarnConfiguration.NM_VMEM_PMEM_RATIO, + YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO); long virtualMemoryMb = (long)Math.ceil(memoryMb * vMemToPMem); - int virtualCores = totalResource.getVirtualCores(); - LOG.info("Nodemanager resources: memory set to " + memoryMb + "MB."); - LOG.info("Nodemanager resources: vcores set to " + virtualCores + "."); - LOG.info("Nodemanager resources: " + totalResource); + + // Update configured resources via plugins. + updateConfiguredResourcesViaPlugins(totalResource); + + LOG.info("Nodemanager resources are set to: " + totalResource); metrics.addResource(totalResource); @@ -342,12 +346,27 @@ protected ResourceTracker getRMClient() throws IOException { return ServerRMProxy.createRMProxy(conf, ResourceTracker.class); } + private void updateConfiguredResourcesViaPlugins( + Resource configuredResource) throws YarnException { + ResourcePluginManager pluginManager = context.getResourcePluginManager(); + if (pluginManager != null && pluginManager.getNameToPlugins() != null) { + // Update configured resource + for (ResourcePlugin resourcePlugin : pluginManager.getNameToPlugins() + .values()) { + if (resourcePlugin.getNodeResourceHandlerInstance() != null) { + resourcePlugin.getNodeResourceHandlerInstance() + .updateConfiguredResource(configuredResource); + } + } + } + } + @VisibleForTesting protected void registerWithRM() throws YarnException, IOException { RegisterNodeManagerResponse regNMResponse; Set<NodeLabel> nodeLabels = nodeLabelsHandler.getNodeLabelsForRegistration(); - + // Synchronize NM-RM registration with // ContainerManagerImpl#increaseContainersResource and // ContainerManagerImpl#startContainers to avoid race condition @@ -358,6 +377,7 @@ protected void registerWithRM() RegisterNodeManagerRequest.newInstance(nodeId, httpPort, totalResource, nodeManagerVersionId, containerReports, getRunningApplications(), nodeLabels, physicalResource); + if (containerReports != null) { LOG.info("Registering with RM using containers :" + containerReports); } @@ -406,7 +426,7 @@ protected void registerWithRM() if (masterKey != null) { this.context.getContainerTokenSecretManager().setMasterKey(masterKey); } - + masterKey = regNMResponse.getNMTokenMasterKey(); if (masterKey != null) { this.context.getNMTokenSecretManager().setMasterKey(masterKey); @@ -738,7 +758,7 @@ public void removeVeryOldStoppedContainersFromCache() { } } } - + @Override public long getRMIdentifier() { return this.rmIdentifier; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java index ae83b8832a6..ae3552fbebb 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java @@ -98,4 +98,11 @@ boolean isRecovering(); void sendPauseEvent(String description); + + /** + * Get assigned resource mappings to the container.
+ * + * @return Resource Mappings of the container + */ + ResourceMappings getResourceMappings(); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index 91c51c5ea11..a91973f99a1 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -188,6 +188,7 @@ private ReInitializationContext createContextForRollback() { private boolean recoveredAsKilled = false; private Context context; private ResourceSet resourceSet; + private ResourceMappings resourceMappings; public ContainerImpl(Configuration conf, Dispatcher dispatcher, ContainerLaunchContext launchContext, Credentials creds, @@ -246,6 +247,7 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher, context.getContainerStateTransitionListener()); this.context = context; this.resourceSet = new ResourceSet(); + this.resourceMappings = new ResourceMappings(); } private static ContainerRetryContext configureRetryContext( @@ -286,6 +288,7 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher, this.remainingRetryAttempts = rcs.getRemainingRetryAttempts(); this.workDir = rcs.getWorkDir(); this.logDir = rcs.getLogDir(); + this.resourceMappings = rcs.getResourceMappings(); } private static final ContainerDiagnosticsUpdateTransition UPDATE_DIAGNOSTICS_TRANSITION = @@ -2174,4 +2177,14 @@ public boolean isRecovering() { getContainerState() == ContainerState.NEW); return isRecovering; } + + /** + * Get assigned resource mappings to the container. + * + * @return Resource Mappings of the container + */ + @Override + public ResourceMappings getResourceMappings() { + return resourceMappings; + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java new file mode 100644 index 00000000000..d673341b01c --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ResourceMappings.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.container; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.IOUtils; + +/** + * This class is used to store resources assigned to a single container, + * keyed by resource type. + * + * The assigned resources for a type are a list of Serializable values, + * for example: + * "numa": ["numa0"] + * "gpu": ["0", "1", "2", "3"] + * "fpga": ["1", "3"] + * + * This will be used for NM restart container recovery. + */ +public class ResourceMappings { + + private Map<String, AssignedResources> assignedResourcesMap = new HashMap<>(); + + /** + * Get the assigned resources for a given resource type. + * @param resourceType the resource type + * @return list of assigned resources, or an empty list if none + */ + public List<Serializable> getAssignedResources(String resourceType) { + AssignedResources ar = assignedResourcesMap.get(resourceType); + if (null == ar) { + return Collections.emptyList(); + } + return ar.getAssignedResources(); + } + + /** + * Adds the resources for a given resource type. + * + * @param resourceType Resource Type + * @param assigned Assigned resources to add + */ + public void addAssignedResources(String resourceType, + AssignedResources assigned) { + assignedResourcesMap.put(resourceType, assigned); + } + + /** + * Stores resources assigned to a container for a given resource type.
+ */ + public static class AssignedResources implements Serializable { + private static final long serialVersionUID = -1059491941955757926L; + private List<Serializable> resources = Collections.emptyList(); + + public List<Serializable> getAssignedResources() { + return Collections.unmodifiableList(resources); + } + + public void updateAssignedResources(List<Serializable> list) { + this.resources = new ArrayList<>(list); + } + + @SuppressWarnings("unchecked") + public static AssignedResources fromBytes(byte[] bytes) + throws IOException { + ObjectInputStream ois = null; + List<Serializable> resources; + try { + ByteArrayInputStream bis = new ByteArrayInputStream(bytes); + ois = new ObjectInputStream(bis); + resources = (List<Serializable>) ois.readObject(); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } finally { + IOUtils.closeQuietly(ois); + } + AssignedResources ar = new AssignedResources(); + ar.updateAssignedResources(resources); + return ar; + } + + public byte[] toBytes() throws IOException { + ObjectOutputStream oos = null; + byte[] bytes; + try { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + oos = new ObjectOutputStream(bos); + oos.writeObject(resources); + bytes = bos.toByteArray(); + } finally { + IOUtils.closeQuietly(oos); + } + return bytes; + } + } +} \ No newline at end of file diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java index 8402a16339d..db0b2251578 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/privileged/PrivilegedOperation.java @@ -51,6 +51,7 @@ TC_READ_STATS("--tc-read-stats"), ADD_PID_TO_CGROUP(""), //no CLI switch supported yet. RUN_DOCKER_CMD("--run-docker"), + GPU("--module-gpu"), LIST_AS_USER(""); //no CLI switch supported yet.
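// Note: the GPU operation added above is consumed by GpuResourceHandlerImpl#preStart
// later in this patch, which builds a privileged call roughly of the form
//   container-executor ... --module-gpu --container_id <id> --excluded_gpus 0,2
// (the concrete command line is an illustration, not part of this patch) so that
// the native side can blacklist the denied minor numbers in the devices cgroup.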
private final String option; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java index 955d2169fec..72bf30ce871 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerChain.java @@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; +import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; @@ -135,7 +136,8 @@ public ResourceHandlerChain(List<ResourceHandler> resourceHandlers) { return allOperations; } - List<ResourceHandler> getResourceHandlerList() { + @VisibleForTesting + public List<ResourceHandler> getResourceHandlerList() { return Collections.unmodifiableList(resourceHandlers); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java index 3c61cd4b5be..ce850ab3b7c 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java @@ -21,25 +21,28 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources; import com.google.common.annotations.VisibleForTesting; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.util.CgroupsLCEResourcesHandler; import org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; -import java.util.Set; -import java.util.HashSet; -import java.util.Map; -import java.util.HashMap; -import java.util.Arrays; import java.util.ArrayList; +import
java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Set; /** * Provides mechanisms to get various resource handlers - cpu, memory, network, @@ -206,22 +209,41 @@ private static void addHandlerIfNotNull(List<ResourceHandler> handlerList, } private static void initializeConfiguredResourceHandlerChain( - Configuration conf) throws ResourceHandlerException { + Configuration conf, Context nmContext) + throws ResourceHandlerException { ArrayList<ResourceHandler> handlerList = new ArrayList<>(); addHandlerIfNotNull(handlerList, getOutboundBandwidthResourceHandler(conf)); addHandlerIfNotNull(handlerList, getDiskResourceHandler(conf)); addHandlerIfNotNull(handlerList, getMemoryResourceHandler(conf)); addHandlerIfNotNull(handlerList, getCGroupsCpuResourceHandler(conf)); + addHandlersFromConfiguredResourcePlugins(handlerList, conf, nmContext); resourceHandlerChain = new ResourceHandlerChain(handlerList); } + private static void addHandlersFromConfiguredResourcePlugins( + List<ResourceHandler> handlerList, Configuration conf, + Context nmContext) throws ResourceHandlerException { + ResourcePluginManager pluginManager = nmContext.getResourcePluginManager(); + if (pluginManager != null) { + Map<String, ResourcePlugin> pluginMap = pluginManager.getNameToPlugins(); + if (pluginMap != null) { + for (ResourcePlugin plugin : pluginMap.values()) { + addHandlerIfNotNull(handlerList, plugin + .createResourceHandler(nmContext, + getInitializedCGroupsHandler(conf), + PrivilegedOperationExecutor.getInstance(conf))); + } + } + } + } + public static ResourceHandlerChain getConfiguredResourceHandlerChain( - Configuration conf) throws ResourceHandlerException { + Configuration conf, Context nmContext) throws ResourceHandlerException { if (resourceHandlerChain == null) { synchronized (ResourceHandlerModule.class) { if (resourceHandlerChain == null) { - initializeConfiguredResourceHandlerChain(conf); + initializeConfiguredResourceHandlerChain(conf, nmContext); } } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java new file mode 100644 index 00000000000..5bdffc369b2 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceAllocator.java @@ -0,0 +1,245 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.exceptions.ResourceNotFoundException; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.AssignedGpuDevice; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + +import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; + +/** + * Allocates GPU resources according to requirements. + */ +public class GpuResourceAllocator { + final static Log LOG = LogFactory.getLog(GpuResourceAllocator.class); + + private Set<GpuDevice> allowedGpuDevices = new TreeSet<>(); + private Map<GpuDevice, ContainerId> usedDevices = new TreeMap<>(); + private Context nmContext; + + public GpuResourceAllocator(Context ctx) { + this.nmContext = ctx; + } + + /** + * Contains allowed and denied devices. + * Denied devices will be useful for the cgroups devices module to do + * blacklisting. + */ + static class GpuAllocation { + private Set<GpuDevice> allowed = Collections.emptySet(); + private Set<GpuDevice> denied = Collections.emptySet(); + + GpuAllocation(Set<GpuDevice> allowed, Set<GpuDevice> denied) { + if (allowed != null) { + this.allowed = ImmutableSet.copyOf(allowed); + } + if (denied != null) { + this.denied = ImmutableSet.copyOf(denied); + } + } + + public Set<GpuDevice> getAllowedGPUs() { + return allowed; + } + + public Set<GpuDevice> getDeniedGPUs() { + return denied; + } + } + + /** + * Add a GPU to the allowed list. + * @param gpuDevice gpu device + */ + public synchronized void addGpu(GpuDevice gpuDevice) { + allowedGpuDevices.add(gpuDevice); + } + + private String getResourceHandlerExceptionMessage(int numRequestedGpuDevices, + ContainerId containerId) { + return "Failed to find enough GPUs, requestor=" + containerId + + ", #RequestedGPUs=" + numRequestedGpuDevices + ", #availableGpus=" + + getAvailableGpus(); + } + + @VisibleForTesting + public synchronized int getAvailableGpus() { + return allowedGpuDevices.size() - usedDevices.size(); + } + + public synchronized void recoverAssignedGpus(ContainerId containerId) + throws ResourceHandlerException { + Container c = nmContext.getContainers().get(containerId); + if (null == c) { + throw new ResourceHandlerException( + "This shouldn't happen, cannot find container with id=" + + containerId); + } + + for (Serializable gpuDeviceSerializable : c.getResourceMappings() + .getAssignedResources(GPU_URI)) { + if (!(gpuDeviceSerializable instanceof GpuDevice)) { + throw new ResourceHandlerException( + "Trying to recover a device, however it" + + " is not a GpuDevice; this
shouldn't happen"); + } + + GpuDevice gpuDevice = (GpuDevice) gpuDeviceSerializable; + + // Make sure it is in the allowed GPU devices. + if (!allowedGpuDevices.contains(gpuDevice)) { + throw new ResourceHandlerException( + "Tried to recover device = " + gpuDevice + + " however it is not in the allowed device list:" + StringUtils + .join(",", allowedGpuDevices)); + } + + // Make sure it is not occupied by anybody else + if (usedDevices.containsKey(gpuDevice)) { + throw new ResourceHandlerException( + "Tried to recover device = " + gpuDevice + + " however it is already assigned to container=" + usedDevices + .get(gpuDevice) + ", please double check what happened."); + } + + usedDevices.put(gpuDevice, containerId); + } + } + + /** + * Get number of requested GPUs from resource. + * @param requestedResource requested resource + * @return #gpus. + */ + public static int getRequestedGpus(Resource requestedResource) { + try { + return Long.valueOf(requestedResource.getResourceValue( + GPU_URI)).intValue(); + } catch (ResourceNotFoundException e) { + return 0; + } + } + + /** + * Assign GPUs to the requestor. + * @param container container to allocate + * @return allocation results. + * @throws ResourceHandlerException when it fails to assign GPUs. + */ + public synchronized GpuAllocation assignGpus(Container container) + throws ResourceHandlerException { + Resource requestedResource = container.getResource(); + ContainerId containerId = container.getContainerId(); + int numRequestedGpuDevices = getRequestedGpus(requestedResource); + // Assign GPUs to the container if it requested any. + if (numRequestedGpuDevices > 0) { + if (numRequestedGpuDevices > getAvailableGpus()) { + throw new ResourceHandlerException( + getResourceHandlerExceptionMessage(numRequestedGpuDevices, + containerId)); + } + + Set<GpuDevice> assignedGpus = new TreeSet<>(); + + for (GpuDevice gpu : allowedGpuDevices) { + if (!usedDevices.containsKey(gpu)) { + usedDevices.put(gpu, containerId); + assignedGpus.add(gpu); + if (assignedGpus.size() == numRequestedGpuDevices) { + break; + } + } + } + + // Record in state store if we allocated anything + if (!assignedGpus.isEmpty()) { + try { + // Update state store.
+ nmContext.getNMStateStore().storeAssignedResources(container, GPU_URI, + new ArrayList<>(assignedGpus)); + } catch (IOException e) { + cleanupAssignGpus(containerId); + throw new ResourceHandlerException(e); + } + } + + return new GpuAllocation(assignedGpus, + Sets.difference(allowedGpuDevices, assignedGpus)); + } + return new GpuAllocation(null, allowedGpuDevices); + } + + /** + * Clean up all GPUs assigned to the given container. + * @param containerId containerId + */ + public synchronized void cleanupAssignGpus(ContainerId containerId) { + Iterator<Map.Entry<GpuDevice, ContainerId>> iter = + usedDevices.entrySet().iterator(); + while (iter.hasNext()) { + if (iter.next().getValue().equals(containerId)) { + iter.remove(); + } + } + } + + @VisibleForTesting + public synchronized Map<GpuDevice, ContainerId> getDeviceAllocationMappingCopy() { + return new HashMap<>(usedDevices); + } + + public synchronized List<GpuDevice> getAllowedGpusCopy() { + return new ArrayList<>(allowedGpuDevices); + } + + public synchronized List<AssignedGpuDevice> getAssignedGpusCopy() { + List<AssignedGpuDevice> assigns = new ArrayList<>(); + for (Map.Entry<GpuDevice, ContainerId> entry : usedDevices.entrySet()) { + assigns.add(new AssignedGpuDevice(entry.getKey().getIndex(), + entry.getKey().getMinorNumber(), entry.getValue())); + } + return assigns; + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java new file mode 100644 index 00000000000..500382162fb --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java @@ -0,0 +1,160 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class GpuResourceHandlerImpl implements ResourceHandler { + final static Log LOG = LogFactory + .getLog(GpuResourceHandlerImpl.class); + + // These will be used by container-executor to add the necessary CLI options + public static final String EXCLUDED_GPUS_CLI_OPTION = "--excluded_gpus"; + public static final String CONTAINER_ID_CLI_OPTION = "--container_id"; + + private GpuResourceAllocator gpuAllocator; + private CGroupsHandler cGroupsHandler; + private PrivilegedOperationExecutor privilegedOperationExecutor; + + public GpuResourceHandlerImpl(Context nmContext, + CGroupsHandler cGroupsHandler, + PrivilegedOperationExecutor privilegedOperationExecutor) { + this.cGroupsHandler = cGroupsHandler; + this.privilegedOperationExecutor = privilegedOperationExecutor; + gpuAllocator = new GpuResourceAllocator(nmContext); + } + + @Override + public List<PrivilegedOperation> bootstrap(Configuration configuration) + throws ResourceHandlerException { + List<GpuDevice> usableGpus; + try { + usableGpus = GpuDiscoverer.getInstance() + .getGpusUsableByYarn(); + if (usableGpus == null || usableGpus.isEmpty()) { + String message = "GPU is enabled on the NodeManager, but couldn't find " + + "any usable GPU devices; please double-check the configuration."; + LOG.error(message); + throw new ResourceHandlerException(message); + } + } catch (YarnException e) { + LOG.error("Exception when trying to get usable GPU device", e); + throw new ResourceHandlerException(e); + } + + for (GpuDevice gpu : usableGpus) { + gpuAllocator.addGpu(gpu); + } + + // And initialize cgroups + this.cGroupsHandler.initializeCGroupController( + CGroupsHandler.CGroupController.DEVICES); + + return null; + } + + @Override + public synchronized List<PrivilegedOperation> preStart(Container container) + throws ResourceHandlerException { + String containerIdStr = container.getContainerId().toString(); + + // Assign GPUs to the container if it requested any.
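// Flow below: assignGpus() picks free devices from allowedGpuDevices and
// persists the mapping; the complement set (GpuAllocation#getDeniedGPUs)
// is passed to container-executor as --excluded_gpus so the devices
// cgroup blacklists those devices for this container.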
+ GpuResourceAllocator.GpuAllocation allocation = gpuAllocator.assignGpus( + container); + + // Create device cgroups for the container + cGroupsHandler.createCGroup(CGroupsHandler.CGroupController.DEVICES, + containerIdStr); + try { + // Execute c-e to setup GPU isolation before launch the container + PrivilegedOperation privilegedOperation = new PrivilegedOperation( + PrivilegedOperation.OperationType.GPU, Arrays + .asList(CONTAINER_ID_CLI_OPTION, containerIdStr)); + if (!allocation.getDeniedGPUs().isEmpty()) { + List minorNumbers = new ArrayList<>(); + for (GpuDevice deniedGpu : allocation.getDeniedGPUs()) { + minorNumbers.add(deniedGpu.getMinorNumber()); + } + privilegedOperation.appendArgs(Arrays.asList(EXCLUDED_GPUS_CLI_OPTION, + StringUtils.join(",", minorNumbers))); + } + privilegedOperationExecutor.executePrivilegedOperation( + privilegedOperation, true); + } catch (PrivilegedOperationException e) { + cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES, + containerIdStr); + LOG.warn("Could not update cgroup for container", e); + throw new ResourceHandlerException(e); + } + + List ret = new ArrayList<>(); + ret.add(new PrivilegedOperation( + PrivilegedOperation.OperationType.ADD_PID_TO_CGROUP, + PrivilegedOperation.CGROUP_ARG_PREFIX + + cGroupsHandler.getPathForCGroupTasks( + CGroupsHandler.CGroupController.DEVICES, containerIdStr))); + + return ret; + } + + public GpuResourceAllocator getGpuAllocator() { + return gpuAllocator; + } + + @Override + public List reacquireContainer(ContainerId containerId) + throws ResourceHandlerException { + gpuAllocator.recoverAssignedGpus(containerId); + return null; + } + + @Override + public synchronized List postComplete( + ContainerId containerId) throws ResourceHandlerException { + gpuAllocator.cleanupAssignGpus(containerId); + cGroupsHandler.deleteCGroup(CGroupsHandler.CGroupController.DEVICES, + containerId.toString()); + return null; + } + + @Override + public List teardown() throws ResourceHandlerException { + return null; + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java new file mode 100644 index 00000000000..88f77ed12ed --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/NodeResourceUpdaterPlugin.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
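For reference, preStart() above blocks every allowed-but-unassigned GPU by passing its minor number to container-executor via --excluded_gpus. A standalone sketch of that argument assembly, mirroring the loop in preStart(); the container id string is a made-up sample:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.StringJoiner;

    public class ExcludedGpusArgsDemo {
      public static void main(String[] args) {
        // Minor numbers of devices the container was NOT granted.
        List<Integer> deniedMinorNumbers = Arrays.asList(0, 2);

        // Mirror of the c-e argument assembly:
        // --container_id <id> --excluded_gpus <comma-separated minors>
        List<String> cliArgs = new ArrayList<>(
            Arrays.asList("--container_id", "container_1_0001_01_000001"));
        if (!deniedMinorNumbers.isEmpty()) {
          StringJoiner joiner = new StringJoiner(",");
          for (Integer minor : deniedMinorNumbers) {
            joiner.add(minor.toString());
          }
          cliArgs.addAll(Arrays.asList("--excluded_gpus", joiner.toString()));
        }

        // [--container_id, container_1_0001_01_000001, --excluded_gpus, 0,2]
        System.out.println(cliArgs);
      }
    }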
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+
+/**
+ * Plugins to handle resources on a node. This will be used by
+ * {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}.
+ */
+public abstract class NodeResourceUpdaterPlugin {
+  /**
+   * Update configured resource for the given component.
+   * @param res resource passed in by an external module (such as
+   * {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater}).
+   * @throws YarnException when any issue happens.
+   */
+  public abstract void updateConfiguredResource(Resource res)
+      throws YarnException;
+
+  /**
+   * This method will be called when the node's resource is loaded from
+   * dynamic-resources.xml in the ResourceManager.
+   *
+   * @param newResource newResource reported by the RM
+   * @throws YarnException when there is any mismatch between the NM and RM
+   */
+  public void handleUpdatedResourceFromRM(Resource newResource) throws
+      YarnException {
+    // By default do nothing; subclasses should override this method when
+    // special handling is required for a new resource reported by the RM.
+  }
+
+  // TODO: add implementation to update node attribute once YARN-3409 is
+  // merged.
+}
\ No newline at end of file
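Only updateConfiguredResource must be implemented by subclasses. A minimal, hypothetical updater; the example.com/widget resource name and the fixed device count are illustrative, not part of this patch, and the sketch assumes that resource type is also declared in resource-types.xml so that setResourceValue accepts it:

    package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;

    import org.apache.hadoop.yarn.api.records.Resource;
    import org.apache.hadoop.yarn.exceptions.YarnException;

    /** Illustrative only: reports a fixed count of a made-up resource. */
    public class WidgetNodeResourceUpdateHandler
        extends NodeResourceUpdaterPlugin {
      private static final String WIDGET_URI = "example.com/widget";

      @Override
      public void updateConfiguredResource(Resource res) throws YarnException {
        // A real plugin would discover devices here; we hard-code two.
        // Fails if WIDGET_URI is not a registered resource type, hence the
        // resource-types.xml assumption above.
        res.setResourceValue(WIDGET_URI, 2);
      }
    }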
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java
new file mode 100644
index 00000000000..78167c4ef33
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePlugin.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
+
+/**
+ * {@link ResourcePlugin} is an interface that makes it easier for the node
+ * manager to support discovery/management/isolation of new resource types.
+ *
+ * <p>
+ * It has two major parts: {@link ResourcePlugin#createResourceHandler(Context,
+ * CGroupsHandler, PrivilegedOperationExecutor)} and
+ * {@link ResourcePlugin#getNodeResourceHandlerInstance()}; see the javadocs
+ * below for more details.
+ * </p>
+ */
+public interface ResourcePlugin {
+  /**
+   * Initialize the plugin; this will be invoked during NM startup.
+   * @param context NM Context
+   * @throws YarnException when any issue occurs
+   */
+  void initialize(Context context) throws YarnException;
+
+  /**
+   * The plugin needs to return a {@link ResourceHandler} when any special
+   * isolation is required for the resource type. This will be added to the
+   * {@link ResourceHandlerChain} during NodeManager startup. When no special
+   * isolation is needed, return null.
+   *
+   * @param nmContext NodeManager context.
+   * @param cGroupsHandler CGroupsHandler
+   * @param privilegedOperationExecutor Privileged Operation Executor.
+   * @return ResourceHandler
+   */
+  ResourceHandler createResourceHandler(Context nmContext,
+      CGroupsHandler cGroupsHandler,
+      PrivilegedOperationExecutor privilegedOperationExecutor);
+
+  /**
+   * The plugin needs to return a {@link NodeResourceUpdaterPlugin} when a
+   * discovery mechanism is required for the resource type. For example, to
+   * set a resource value during NM registration or send updates during NM-RM
+   * heartbeats, we can implement a {@link NodeResourceUpdaterPlugin} and
+   * update fields of
+   * {@link org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest}
+   * or
+   * {@link org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest}.
+   *
+   * This will be invoked during every node status update and node
+   * registration, so please avoid creating a new instance every time.
+   *
+   * @return NodeResourceUpdaterPlugin, could be null when no discovery is
+   * needed.
+   */
+  NodeResourceUpdaterPlugin getNodeResourceHandlerInstance();
+
+  /**
+   * Do cleanup of the plugin; this will be invoked when the
+   * {@link org.apache.hadoop.yarn.server.nodemanager.NodeManager} stops.
+   * @throws YarnException if any issue occurs
+   */
+  void cleanup() throws YarnException;
+
+  /**
+   * Get resource information from this plugin.
+   *
+   * @return NMResourceInfo, an example is
+   * {@link org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation}
+   *
+   * @throws YarnException when any issue occurs
+   */
+  NMResourceInfo getNMResourceInfo() throws YarnException;
+}
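A skeletal, hypothetical implementation of the interface, for a resource that needs discovery but no cgroups isolation, reusing the updater sketched earlier; all names prefixed with Widget are illustrative only:

    package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;

    import org.apache.hadoop.yarn.exceptions.YarnException;
    import org.apache.hadoop.yarn.server.nodemanager.Context;
    import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
    import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
    import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
    import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;

    /** Illustrative only: a plugin needing no special isolation. */
    public class WidgetResourcePlugin implements ResourcePlugin {
      private WidgetNodeResourceUpdateHandler updater;

      @Override
      public void initialize(Context context) throws YarnException {
        // Created once at NM startup, then reused on every heartbeat.
        updater = new WidgetNodeResourceUpdateHandler();
      }

      @Override
      public ResourceHandler createResourceHandler(Context nmContext,
          CGroupsHandler cGroupsHandler,
          PrivilegedOperationExecutor privilegedOperationExecutor) {
        return null; // no special isolation required for this resource
      }

      @Override
      public NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() {
        return updater;
      }

      @Override
      public void cleanup() throws YarnException {
        // nothing to release
      }

      @Override
      public NMResourceInfo getNMResourceInfo() throws YarnException {
        return new NMResourceInfo(); // no extra info exposed
      }
    }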
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
new file mode 100644
index 00000000000..73d6038afb1
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin;
+
+import com.google.common.collect.ImmutableSet;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuResourcePlugin;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
+/**
+ * Manages {@link ResourcePlugin} configured on this NodeManager.
+ */
+public class ResourcePluginManager {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ResourcePluginManager.class);
+  private static final Set<String> SUPPORTED_RESOURCE_PLUGINS =
+      ImmutableSet.of(GPU_URI);
+
+  private Map<String, ResourcePlugin> configuredPlugins =
+      Collections.emptyMap();
+
+  public synchronized void initialize(Context context)
+      throws YarnException {
+    Configuration conf = context.getConf();
+    String[] plugins = conf.getStrings(YarnConfiguration.NM_RESOURCE_PLUGINS);
+
+    if (plugins != null) {
+      Map<String, ResourcePlugin> pluginMap = new HashMap<>();
+
+      // Initialize each plugin
+      for (String resourceName : plugins) {
+        resourceName = resourceName.trim();
+        if (!SUPPORTED_RESOURCE_PLUGINS.contains(resourceName)) {
+          String msg =
+              "Trying to initialize resource plugin with name=" + resourceName
+                  + ", it is not supported, list of supported plugins:"
+                  + StringUtils.join(",",
+                  SUPPORTED_RESOURCE_PLUGINS);
+          LOG.error(msg);
+          throw new YarnException(msg);
+        }
+
+        if (pluginMap.containsKey(resourceName)) {
+          // Duplicated items, ignore ...
+          continue;
+        }
+
+        ResourcePlugin plugin = null;
+        if (resourceName.equals(GPU_URI)) {
+          plugin = new GpuResourcePlugin();
+        }
+
+        if (plugin == null) {
+          throw new YarnException(
+              "This shouldn't happen, plugin=" + resourceName
+                  + " should be loaded and initialized");
+        }
+        plugin.initialize(context);
+        pluginMap.put(resourceName, plugin);
+      }
+
+      configuredPlugins = Collections.unmodifiableMap(pluginMap);
+    }
+  }
+
+  public synchronized void cleanup() throws YarnException {
+    for (ResourcePlugin plugin : configuredPlugins.values()) {
+      plugin.cleanup();
+    }
+  }
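Plugins are switched on via the comma-separated list in yarn.nodemanager.resource-plugins (YarnConfiguration.NM_RESOURCE_PLUGINS); per the validation above, only yarn.io/gpu is accepted so far. A small sketch of the equivalent programmatic configuration:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.yarn.conf.YarnConfiguration;

    public class ResourcePluginConfigDemo {
      public static void main(String[] args) {
        Configuration conf = new YarnConfiguration();
        // Equivalent to setting yarn.nodemanager.resource-plugins=yarn.io/gpu
        // in yarn-site.xml.
        conf.setStrings(YarnConfiguration.NM_RESOURCE_PLUGINS, "yarn.io/gpu");

        for (String name :
            conf.getStrings(YarnConfiguration.NM_RESOURCE_PLUGINS)) {
          // ResourcePluginManager trims each entry before validating it.
          System.out.println("configured plugin: " + name.trim());
        }
      }
    }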
+
+  /**
+   * Get the mapping from resource name (such as gpu/fpga) to plugin
+   * references.
+   * @return read-only map of resource name to plugins.
+   */
+  public synchronized Map<String, ResourcePlugin> getNameToPlugins() {
+    return configuredPlugins;
+  }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/AssignedGpuDevice.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/AssignedGpuDevice.java
new file mode 100644
index 00000000000..26fd9050742
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/AssignedGpuDevice.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.hadoop.yarn.api.records.ContainerId;
+
+/**
+ * In addition to {@link GpuDevice}, this includes the container id and, when
+ * available, more runtime information about who is using the GPU device.
+ */
+public class AssignedGpuDevice extends GpuDevice {
+  private static final long serialVersionUID = -12983712986315L;
+
+  String containerId;
+
+  public AssignedGpuDevice(int index, int minorNumber,
+      ContainerId containerId) {
+    super(index, minorNumber);
+    this.containerId = containerId.toString();
+  }
+
+  public String getContainerId() {
+    return containerId;
+  }
+
+  public void setContainerId(String containerId) {
+    this.containerId = containerId;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == null || !(obj instanceof AssignedGpuDevice)) {
+      return false;
+    }
+    AssignedGpuDevice other = (AssignedGpuDevice) obj;
+    return index == other.index && minorNumber == other.minorNumber
+        && containerId.equals(other.containerId);
+  }
+
+  @Override
+  public int compareTo(Object obj) {
+    if (obj == null || (!(obj instanceof AssignedGpuDevice))) {
+      return -1;
+    }
+
+    AssignedGpuDevice other = (AssignedGpuDevice) obj;
+
+    int result = Integer.compare(index, other.index);
+    if (0 != result) {
+      return result;
+    }
+    result = Integer.compare(minorNumber, other.minorNumber);
+    if (0 != result) {
+      return result;
+    }
+    return containerId.compareTo(other.containerId);
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 47;
+    return prime * (prime * index + minorNumber) + containerId.hashCode();
+  }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDevice.java
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDevice.java new file mode 100644 index 00000000000..bce1d9fa480 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDevice.java @@ -0,0 +1,78 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import java.io.Serializable; + +/** + * This class is used to represent GPU device while allocation. + */ +public class GpuDevice implements Serializable, Comparable { + protected int index; + protected int minorNumber; + private static final long serialVersionUID = -6812314470754667710L; + + public GpuDevice(int index, int minorNumber) { + this.index = index; + this.minorNumber = minorNumber; + } + + public int getIndex() { + return index; + } + + public int getMinorNumber() { + return minorNumber; + } + + @Override + public boolean equals(Object obj) { + if (obj == null || !(obj instanceof GpuDevice)) { + return false; + } + GpuDevice other = (GpuDevice) obj; + return index == other.index && minorNumber == other.minorNumber; + } + + @Override + public int compareTo(Object obj) { + if (obj == null || (!(obj instanceof GpuDevice))) { + return -1; + } + + GpuDevice other = (GpuDevice) obj; + + int result = Integer.compare(index, other.index); + if (0 != result) { + return result; + } + return Integer.compare(minorNumber, other.minorNumber); + } + + @Override + public int hashCode() { + final int prime = 47; + return prime * index + minorNumber; + } + + @Override + public String toString() { + return "(index=" + index + ",minor_number=" + minorNumber + ")"; + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java new file mode 100644 index 00000000000..6e3cf1315ce --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java @@ -0,0 +1,264 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
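GpuDevice is Comparable, ordered by index and then by minor number, which keeps allocator output deterministic. A quick demonstration of that ordering and of the toString() format:

    import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;

    import java.util.ArrayList;
    import java.util.List;

    public class GpuDeviceOrderingDemo {
      public static void main(String[] args) {
        List<GpuDevice> devices = new ArrayList<>();
        devices.add(new GpuDevice(2, 3));
        devices.add(new GpuDevice(0, 1));
        devices.add(new GpuDevice(0, 0));

        // compareTo orders by index first, then by minor number.
        devices.sort((a, b) -> a.compareTo(b));

        // [(index=0,minor_number=0), (index=0,minor_number=1),
        //  (index=2,minor_number=3)]
        System.out.println(devices);
      }
    }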
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.Shell; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class GpuDiscoverer { + public static final Logger LOG = LoggerFactory.getLogger( + GpuDiscoverer.class); + @VisibleForTesting + protected static final String DEFAULT_BINARY_NAME = "nvidia-smi"; + + // When executable path not set, try to search default dirs + // By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when + // launched by nvidia-docker. + private static final Set DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of( + "/usr/bin", "/bin", "/usr/local/nvidia/bin"); + + // command should not run more than 10 sec. + private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000; + private static final int MAX_REPEATED_ERROR_ALLOWED = 10; + private static GpuDiscoverer instance; + + static { + instance = new GpuDiscoverer(); + } + + private Configuration conf = null; + private String pathOfGpuBinary = null; + private Map environment = new HashMap<>(); + private GpuDeviceInformationParser parser = new GpuDeviceInformationParser(); + + private int numOfErrorExecutionSinceLastSucceed = 0; + GpuDeviceInformation lastDiscoveredGpuInformation = null; + + private void validateConfOrThrowException() throws YarnException { + if (conf == null) { + throw new YarnException("Please initialize (call initialize) before use " + + GpuDiscoverer.class.getSimpleName()); + } + } + + /** + * Get GPU device information from system. + * This need to be called after initialize. + * + * Please note that this only works on *NIX platform, so external caller + * need to make sure this. 
+ * + * @return GpuDeviceInformation + * @throws YarnException when any error happens + */ + public synchronized GpuDeviceInformation getGpuDeviceInformation() + throws YarnException { + validateConfOrThrowException(); + + if (null == pathOfGpuBinary) { + throw new YarnException( + "Failed to find GPU discovery executable, please double check " + + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting."); + } + + if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) { + String msg = + "Failed to execute GPU device information detection script for " + + MAX_REPEATED_ERROR_ALLOWED + + " times, skip following executions."; + LOG.error(msg); + throw new YarnException(msg); + } + + String output; + try { + output = Shell.execCommand(environment, + new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS); + GpuDeviceInformation info = parser.parseXml(output); + numOfErrorExecutionSinceLastSucceed = 0; + lastDiscoveredGpuInformation = info; + return info; + } catch (IOException e) { + numOfErrorExecutionSinceLastSucceed++; + String msg = + "Failed to execute " + pathOfGpuBinary + " exception message:" + e + .getMessage() + ", continue ..."; + if (LOG.isDebugEnabled()) { + LOG.debug(msg); + } + throw new YarnException(e); + } catch (YarnException e) { + numOfErrorExecutionSinceLastSucceed++; + String msg = "Failed to parse xml output" + e.getMessage(); + if (LOG.isDebugEnabled()) { + LOG.warn(msg, e); + } + throw e; + } + } + + /** + * Get list of GPU devices usable by YARN. + * + * @return List of GPU devices + * @throws YarnException when any issue happens + */ + public synchronized List getGpusUsableByYarn() + throws YarnException { + validateConfOrThrowException(); + + String allowedDevicesStr = conf.get( + YarnConfiguration.NM_GPU_ALLOWED_DEVICES, + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES); + + List gpuDevices = new ArrayList<>(); + + if (allowedDevicesStr.equals( + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) { + // Get gpu device information from system. 
+ if (null == lastDiscoveredGpuInformation) { + String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to " + + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES + + ", however automatically discovering " + + "GPU information failed, please check NodeManager log for more" + + " details, as an alternative, admin can specify " + + YarnConfiguration.NM_GPU_ALLOWED_DEVICES + + " manually to enable GPU isolation."; + LOG.error(msg); + throw new YarnException(msg); + } + + if (lastDiscoveredGpuInformation.getGpus() != null) { + for (int i = 0; i < lastDiscoveredGpuInformation.getGpus().size(); + i++) { + List gpuInfos = + lastDiscoveredGpuInformation.getGpus(); + gpuDevices.add(new GpuDevice(i, gpuInfos.get(i).getMinorNumber())); + } + } + } else{ + for (String s : allowedDevicesStr.split(",")) { + if (s.trim().length() > 0) { + String[] kv = s.trim().split(":"); + if (kv.length != 2) { + throw new YarnException( + "Illegal format, it should be index:minor_number format, now it=" + + s); + } + + gpuDevices.add( + new GpuDevice(Integer.parseInt(kv[0]), Integer.parseInt(kv[1]))); + } + } + LOG.info("Allowed GPU devices:" + gpuDevices); + } + + return gpuDevices; + } + + public synchronized void initialize(Configuration conf) throws YarnException { + this.conf = conf; + numOfErrorExecutionSinceLastSucceed = 0; + String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC, + YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC); + if (pathToExecutable.isEmpty()) { + pathToExecutable = DEFAULT_BINARY_NAME; + } + + // Validate file existence + File binaryPath = new File(pathToExecutable); + + if (!binaryPath.exists()) { + // When binary not exist, use default setting. + boolean found = false; + for (String dir : DEFAULT_BINARY_SEARCH_DIRS) { + binaryPath = new File(dir, DEFAULT_BINARY_NAME); + if (binaryPath.exists()) { + found = true; + pathOfGpuBinary = binaryPath.getAbsolutePath(); + break; + } + } + + if (!found) { + LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath() + + ", please double check [" + YarnConfiguration.NM_GPU_PATH_TO_EXEC + + "] setting. 
Now use " + "default binary:" + DEFAULT_BINARY_NAME); + } + } else{ + // If path specified by user is a directory, use + if (binaryPath.isDirectory()) { + binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME); + LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME + + " under the directory, updated path-to-executable:" + binaryPath + .getAbsolutePath()); + } + // Validated + pathOfGpuBinary = binaryPath.getAbsolutePath(); + } + + // Try to discover GPU information once and print + try { + LOG.info("Trying to discover GPU information ..."); + GpuDeviceInformation info = getGpuDeviceInformation(); + LOG.info(info.toString()); + } catch (YarnException e) { + String msg = + "Failed to discover GPU information from system, exception message:" + + e.getMessage() + " continue..."; + LOG.warn(msg); + } + } + + @VisibleForTesting + protected Map getEnvironmentToRunCommand() { + return environment; + } + + @VisibleForTesting + protected String getPathOfGpuBinary() { + return pathOfGpuBinary; + } + + public static GpuDiscoverer getInstance() { + return instance; + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java new file mode 100644 index 00000000000..796eb25b431 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
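When automatic discovery is not wanted, getGpusUsableByYarn() above accepts a manual list of comma-separated index:minor_number pairs through YarnConfiguration.NM_GPU_ALLOWED_DEVICES. A standalone sketch of that parsing logic, with the Hadoop types stripped out:

    import java.util.ArrayList;
    import java.util.List;

    public class AllowedGpusParseDemo {
      public static void main(String[] args) {
        // Manual setting format mirrored from getGpusUsableByYarn():
        // comma-separated "index:minor_number" pairs.
        String allowedDevicesStr = "0:0,1:1,2:3";

        List<int[]> gpuDevices = new ArrayList<>();
        for (String s : allowedDevicesStr.split(",")) {
          if (s.trim().length() > 0) {
            String[] kv = s.trim().split(":");
            if (kv.length != 2) {
              throw new IllegalArgumentException(
                  "Illegal format, expected index:minor_number but got " + s);
            }
            gpuDevices.add(new int[] {
                Integer.parseInt(kv[0]), Integer.parseInt(kv[1]) });
          }
        }

        for (int[] d : gpuDevices) {
          System.out.println("index=" + d[0] + ", minor=" + d[1]);
        }
      }
    }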
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin; +import org.apache.hadoop.yarn.util.resource.ResourceUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.Map; + +import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI; + +public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin { + private static final Logger LOG = + LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class); + + @Override + public void updateConfiguredResource(Resource res) throws YarnException { + LOG.info("Initializing configured GPU resources for the NodeManager."); + + List usableGpus = + GpuDiscoverer.getInstance().getGpusUsableByYarn(); + if (null == usableGpus || usableGpus.isEmpty()) { + String message = "GPU is enabled, but couldn't find any usable GPUs on the " + + "NodeManager."; + LOG.error(message); + // No gpu can be used by YARN. + throw new YarnException(message); + } + + long nUsableGpus = usableGpus.size(); + + Map configuredResourceTypes = + ResourceUtils.getResourceTypes(); + if (!configuredResourceTypes.containsKey(GPU_URI)) { + throw new YarnException("Found " + nUsableGpus + " usable GPUs, however " + + GPU_URI + + " resource-type is not configured inside" + + " resource-types.xml, please configure it to enable GPU feature or" + + " remove " + GPU_URI + " from " + + YarnConfiguration.NM_RESOURCE_PLUGINS); + } + + res.setResourceValue(GPU_URI, nUsableGpus); + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java new file mode 100644 index 00000000000..d294503704e --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
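updateConfiguredResource above refuses to report GPUs unless yarn.io/gpu is registered as a resource type. A small sketch of the same precondition check; it assumes a resource-types.xml on the classpath that lists yarn.io/gpu (via the yarn.resource-types property):

    import org.apache.hadoop.yarn.api.records.ResourceInformation;
    import org.apache.hadoop.yarn.util.resource.ResourceUtils;

    import java.util.Map;

    public class GpuResourceTypeCheckDemo {
      public static void main(String[] args) {
        // Loads the configured resource types (resource-types.xml).
        Map<String, ResourceInformation> types =
            ResourceUtils.getResourceTypes();
        if (!types.containsKey(ResourceInformation.GPU_URI)) {
          System.err.println(
              "yarn.io/gpu is not configured in resource-types.xml");
        } else {
          System.out.println("GPU resource type is registered: "
              + types.get(ResourceInformation.GPU_URI));
        }
      }
    }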
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceAllocator; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; + +import java.util.List; +import java.util.Map; + +public class GpuResourcePlugin implements ResourcePlugin { + private GpuResourceHandlerImpl gpuResourceHandler = null; + private GpuNodeResourceUpdateHandler resourceDiscoverHandler = null; + + @Override + public synchronized void initialize(Context context) throws YarnException { + resourceDiscoverHandler = new GpuNodeResourceUpdateHandler(); + GpuDiscoverer.getInstance().initialize(context.getConf()); + } + + @Override + public synchronized ResourceHandler createResourceHandler( + Context context, CGroupsHandler cGroupsHandler, + PrivilegedOperationExecutor privilegedOperationExecutor) { + if (gpuResourceHandler == null) { + gpuResourceHandler = new GpuResourceHandlerImpl(context, cGroupsHandler, + privilegedOperationExecutor); + } + + return gpuResourceHandler; + } + + @Override + public synchronized NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() { + return resourceDiscoverHandler; + } + + @Override + public void cleanup() throws YarnException { + // Do nothing. 
+ } + + @Override + public NMResourceInfo getNMResourceInfo() throws YarnException { + GpuDeviceInformation gpuDeviceInformation = + GpuDiscoverer.getInstance().getGpuDeviceInformation(); + GpuResourceAllocator gpuResourceAllocator = + gpuResourceHandler.getGpuAllocator(); + List totalGpus = gpuResourceAllocator.getAllowedGpusCopy(); + List assignedGpuDevices = + gpuResourceAllocator.getAssignedGpusCopy(); + + return new NMGpuResourceInfo(gpuDeviceInformation, totalGpus, + assignedGpuDevices); + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java index 08a486e7865..374cc290579 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java @@ -18,39 +18,25 @@ package org.apache.hadoop.yarn.server.nodemanager.recovery; -import static org.fusesource.leveldbjni.JniDBFactory.asString; -import static org.fusesource.leveldbjni.JniDBFactory.bytes; - -import org.apache.hadoop.yarn.api.records.Token; -import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Timer; -import java.util.TimerTask; -import java.util.Set; - +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ListMultimap; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Time; import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest; import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.Token; import org.apache.hadoop.yarn.api.records.impl.pb.ResourcePBImpl; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.proto.YarnProtos.LocalResourceProto; +import org.apache.hadoop.yarn.proto.YarnSecurityTokenProtos.ContainerTokenIdentifierProto; import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.MasterKeyProto; import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.VersionProto; import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.ContainerManagerApplicationProto; @@ -58,9 +44,11 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LocalizedResourceProto; import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto; import 
org.apache.hadoop.yarn.proto.YarnServiceProtos.StartContainerRequestProto; -import org.apache.hadoop.yarn.proto.YarnSecurityTokenProtos.ContainerTokenIdentifierProto; +import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; import org.apache.hadoop.yarn.server.records.Version; import org.apache.hadoop.yarn.server.records.impl.pb.VersionPBImpl; import org.apache.hadoop.yarn.server.utils.BuilderUtils; @@ -72,10 +60,26 @@ import org.iq80.leveldb.DBException; import org.iq80.leveldb.Options; import org.iq80.leveldb.WriteBatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.Timer; +import java.util.TimerTask; + +import static org.fusesource.leveldbjni.JniDBFactory.asString; +import static org.fusesource.leveldbjni.JniDBFactory.bytes; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ArrayListMultimap; -import com.google.common.collect.ListMultimap; public class NMLeveldbStateStoreService extends NMStateStoreService { @@ -147,6 +151,9 @@ private static final String AMRMPROXY_KEY_PREFIX = "AMRMProxy/"; + private static final String CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX = + "/assignedResources_"; + private static final byte[] EMPTY_VALUE = new byte[0]; private DB db; @@ -308,6 +315,13 @@ private RecoveredContainerState loadContainerState(ContainerId containerId, rcs.setWorkDir(asString(entry.getValue())); } else if (suffix.equals(CONTAINER_LOG_DIR_KEY_SUFFIX)) { rcs.setLogDir(asString(entry.getValue())); + } else if (suffix.startsWith(CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX)) { + String resourceType = suffix.substring( + CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX.length()); + ResourceMappings.AssignedResources assignedResources = + ResourceMappings.AssignedResources.fromBytes(entry.getValue()); + rcs.getResourceMappings().addAssignedResources(resourceType, + assignedResources); } else { LOG.warn("the container " + containerId + " will be killed because of the unknown key " + key @@ -1170,6 +1184,41 @@ public void removeLogDeleter(ApplicationId appId) throws IOException { } } + @Override + public void storeAssignedResources(Container container, + String resourceType, List assignedResources) + throws IOException { + if (LOG.isDebugEnabled()) { + LOG.debug( + "storeAssignedResources: containerId=" + container.getContainerId() + + ", assignedResources=" + StringUtils + .join(",", assignedResources)); + + } + + String keyResChng = CONTAINERS_KEY_PREFIX + container.getContainerId().toString() + + CONTAINER_ASSIGNED_RESOURCES_KEY_SUFFIX + resourceType; + try { + WriteBatch batch = db.createWriteBatch(); + try { + ResourceMappings.AssignedResources res = + new ResourceMappings.AssignedResources(); + res.updateAssignedResources(assignedResources); + + // New value will overwrite old values for the same key + batch.put(bytes(keyResChng), res.toBytes()); + db.write(batch); + } finally { + batch.close(); + } + } catch (DBException e) { 
+ throw new IOException(e); + } + + // update container resource mapping. + updateContainerResourceMapping(container, resourceType, assignedResources); + } + @SuppressWarnings("deprecation") private void cleanupDeprecatedFinishedApps() { try { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java index aaf6fb2cdbc..95ec61ae1a3 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.nodemanager.recovery; import java.io.IOException; +import java.io.Serializable; import java.util.List; import org.apache.hadoop.conf.Configuration; @@ -34,6 +35,7 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.records.MasterKey; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; // The state store to use when state isn't being stored public class NMNullStateStoreService extends NMStateStoreService { @@ -266,6 +268,13 @@ public void removeAMRMProxyAppContext(ApplicationAttemptId attempt) throws IOException { } + @Override + public void storeAssignedResources(Container container, + String resourceType, List assignedResources) + throws IOException { + updateContainerResourceMapping(container, resourceType, assignedResources); + } + @Override protected void initStorage(Configuration conf) throws IOException { } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java index 1cdbd277ff3..350f2423834 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.nodemanager.recovery; import java.io.IOException; +import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -43,6 +44,8 @@ import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.LogDeleterProto; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.records.MasterKey; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; @Private @Unstable @@ -90,6 +93,7 @@ public NMStateStoreService(String name) { private RecoveredContainerType 
recoveryType = RecoveredContainerType.RECOVER; private long startTime; + private ResourceMappings resMappings = new ResourceMappings(); public RecoveredContainerStatus getStatus() { return status; @@ -174,6 +178,14 @@ public RecoveredContainerType getRecoveryType() { public void setRecoveryType(RecoveredContainerType recoveryType) { this.recoveryType = recoveryType; } + + public ResourceMappings getResourceMappings() { + return resMappings; + } + + public void setResourceMappings(ResourceMappings mappings) { + this.resMappings = mappings; + } } public static class LocalResourceTrackerState { @@ -718,9 +730,31 @@ public abstract void removeAMRMProxyAppContextEntry( public abstract void removeAMRMProxyAppContext(ApplicationAttemptId attempt) throws IOException; + /** + * Store the assigned resources to a container. + * + * @param container NMContainer + * @param resourceType Resource Type + * @param assignedResources Assigned resources + * @throws IOException if fails + */ + public abstract void storeAssignedResources(Container container, + String resourceType, List assignedResources) + throws IOException; + protected abstract void initStorage(Configuration conf) throws IOException; protected abstract void startStorage() throws IOException; protected abstract void closeStorage() throws IOException; + + protected void updateContainerResourceMapping(Container container, + String resourceType, List assignedResources) { + // Update Container#getResourceMapping. + ResourceMappings.AssignedResources newAssigned = + new ResourceMappings.AssignedResources(); + newAssigned.updateAssignedResources(assignedResources); + container.getResourceMappings().addAssignedResources(resourceType, + newAssigned); + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NMWebServices.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NMWebServices.java index c5379ccf258..9157374928d 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NMWebServices.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/NMWebServices.java @@ -27,6 +27,10 @@ import java.util.List; import java.util.Map.Entry; import java.util.Set; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -496,6 +500,28 @@ public void write(OutputStream os) throws IOException, } } + @GET + @Path("/resources/{resourcename}") + @Produces({ MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + MediaType.APPLICATION_XML + "; " + JettyUtils.UTF_8 }) + public Object getNMResourceInfo( + @PathParam("resourcename") + String resourceName) throws YarnException { + init(); + ResourcePluginManager rpm = this.nmContext.getResourcePluginManager(); + if (rpm != null && rpm.getNameToPlugins() != null) { + ResourcePlugin plugin = rpm.getNameToPlugins().get(resourceName); + if (plugin != null) { + NMResourceInfo nmResourceInfo = plugin.getNMResourceInfo(); + if (nmResourceInfo != null) { + 
return nmResourceInfo; + } + } + } + + return new NMResourceInfo(); + } + private long parseLongParam(String bytes) { if (bytes == null || bytes.isEmpty()) { return Long.MAX_VALUE; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/NMResourceInfo.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/NMResourceInfo.java new file mode 100644 index 00000000000..18ce8ea7a68 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/NMResourceInfo.java @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao; + +import javax.xml.bind.annotation.XmlAccessType; +import javax.xml.bind.annotation.XmlAccessorType; +import javax.xml.bind.annotation.XmlRootElement; + +@XmlRootElement +@XmlAccessorType(XmlAccessType.FIELD) +public class NMResourceInfo { +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java new file mode 100644 index 00000000000..837d5cc99cd --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
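The new endpoint above is served by NMWebServices under the NM web application (default port 8042) at /ws/v1/node/resources/{resourcename}. A rough client sketch; the resource name is URL-encoded here because yarn.io/gpu contains a slash, and whether an encoded slash is accepted in a path parameter depends on the servlet container, so treat this as an assumption rather than a guarantee:

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.net.URLEncoder;
    import java.nio.charset.StandardCharsets;

    public class NMResourceInfoQueryDemo {
      public static void main(String[] args) throws Exception {
        // "yarn.io/gpu" -> "yarn.io%2Fgpu"
        String resource = URLEncoder.encode("yarn.io/gpu", "UTF-8");
        URL url = new URL(
            "http://localhost:8042/ws/v1/node/resources/" + resource);

        try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
          String line;
          while ((line = reader.readLine()) != null) {
            System.out.println(line); // JSON NMGpuResourceInfo payload
          }
        }
      }
    }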
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import javax.xml.bind.annotation.XmlRootElement; +import java.util.List; + +/** + * All GPU Device Information in the system, fetched from nvidia-smi. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +@XmlRootElement(name = "nvidia_smi_log") +public class GpuDeviceInformation { + List gpus; + + String driverVersion = "N/A"; + + // More fields like topology information could be added when needed. + // ... + + @javax.xml.bind.annotation.XmlElement(name = "gpu") + public List getGpus() { + return gpus; + } + + public void setGpus(List gpus) { + this.gpus = gpus; + } + + @javax.xml.bind.annotation.XmlElement(name = "driver_version") + public String getDriverVersion() { + return driverVersion; + } + + public void setDriverVersion(String driverVersion) { + this.driverVersion = driverVersion; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("=== Gpus in the system ===\n").append("\tDriver Version:").append( + getDriverVersion()).append("\n"); + + if (gpus != null) { + for (PerGpuDeviceInformation gpu : gpus) { + sb.append("\t").append(gpu.toString()).append("\n"); + } + } + return sb.toString(); + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java new file mode 100644 index 00000000000..1bd92f63a88 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +import javax.xml.bind.JAXBContext; +import javax.xml.bind.JAXBException; +import javax.xml.bind.Unmarshaller; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParserFactory; +import javax.xml.transform.sax.SAXSource; +import java.io.StringReader; + +/** + * Parse XML and get GPU device information + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class GpuDeviceInformationParser { + private static final Logger LOG = LoggerFactory.getLogger( + GpuDeviceInformationParser.class); + + private Unmarshaller unmarshaller = null; + private XMLReader xmlReader = null; + + private void init() + throws SAXException, ParserConfigurationException, JAXBException { + SAXParserFactory spf = SAXParserFactory.newInstance(); + // Disable external-dtd since by default nvidia-smi output contains + // in header + spf.setFeature( + "http://apache.org/xml/features/nonvalidating/load-external-dtd", + false); + spf.setFeature("http://xml.org/sax/features/validation", false); + + JAXBContext jaxbContext = JAXBContext.newInstance( + GpuDeviceInformation.class); + + this.xmlReader = spf.newSAXParser().getXMLReader(); + this.unmarshaller = jaxbContext.createUnmarshaller(); + } + + public synchronized GpuDeviceInformation parseXml(String xmlContent) + throws YarnException { + if (unmarshaller == null) { + try { + init(); + } catch (SAXException | ParserConfigurationException | JAXBException e) { + LOG.error("Exception while initialize parser", e); + throw new YarnException(e); + } + } + + InputSource inputSource = new InputSource(new StringReader(xmlContent)); + SAXSource source = new SAXSource(xmlReader, inputSource); + try { + return (GpuDeviceInformation) unmarshaller.unmarshal(source); + } catch (JAXBException e) { + LOG.error("Exception while parsing xml", e); + throw new YarnException(e); + } + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/NMGpuResourceInfo.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/NMGpuResourceInfo.java new file mode 100644 index 00000000000..bf1d463df7c --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/NMGpuResourceInfo.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
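For reference, the parser above can be exercised directly. The XML below is a minimal, hand-written stand-in for nvidia-smi -x -q output (real output carries many more elements, which JAXB simply ignores), and the driver and product values are sample data:

    import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
    import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;

    public class GpuXmlParseDemo {
      public static void main(String[] args) throws Exception {
        String xml = "<nvidia_smi_log>"
            + "<driver_version>375.66</driver_version>"
            + "<gpu>"
            + "<product_name>Tesla P100</product_name>"
            + "<minor_number>0</minor_number>"
            + "</gpu>"
            + "</nvidia_smi_log>";

        GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
        GpuDeviceInformation info = parser.parseXml(xml);

        // === Gpus in the system ===
        //     Driver Version:375.66
        //     ProductName=Tesla P100, MinorNumber=0
        System.out.println(info);
      }
    }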
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/NMGpuResourceInfo.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/NMGpuResourceInfo.java new file mode 100644 index 00000000000..bf1d463df7c --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/NMGpuResourceInfo.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu; + +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.AssignedGpuDevice; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; + +import java.util.List; + +/** + * GPU device information returned to the client when + * {@link org.apache.hadoop.yarn.server.nodemanager.webapp.NMWebServices#getNMResourceInfo(String)} + * is invoked. + */ +public class NMGpuResourceInfo extends NMResourceInfo { + GpuDeviceInformation gpuDeviceInformation; + + List<GpuDevice> totalGpuDevices; + List<AssignedGpuDevice> assignedGpuDevices; + + public NMGpuResourceInfo(GpuDeviceInformation gpuDeviceInformation, + List<GpuDevice> totalGpuDevices, + List<AssignedGpuDevice> assignedGpuDevices) { + this.gpuDeviceInformation = gpuDeviceInformation; + this.totalGpuDevices = totalGpuDevices; + this.assignedGpuDevices = assignedGpuDevices; + } + + public GpuDeviceInformation getGpuDeviceInformation() { + return gpuDeviceInformation; + } + + public void setGpuDeviceInformation( + GpuDeviceInformation gpuDeviceInformation) { + this.gpuDeviceInformation = gpuDeviceInformation; + } + + public List<GpuDevice> getTotalGpuDevices() { + return totalGpuDevices; + } + + public void setTotalGpuDevices(List<GpuDevice> totalGpuDevices) { + this.totalGpuDevices = totalGpuDevices; + } + + public List<AssignedGpuDevice> getAssignedGpuDevices() { + return assignedGpuDevices; + } + + public void setAssignedGpuDevices( + List<AssignedGpuDevice> assignedGpuDevices) { + this.assignedGpuDevices = assignedGpuDevices; + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java new file mode 100644 index 00000000000..25c2e3a1f1d --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java @@ -0,0 +1,165 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import javax.xml.bind.annotation.XmlElement; +import javax.xml.bind.annotation.XmlRootElement; +import javax.xml.bind.annotation.adapters.XmlAdapter; + +/** + * Capture single GPU device information such as memory size, temperature and + * utilization. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +@XmlRootElement(name = "gpu") +public class PerGpuDeviceInformation { + + private String productName = "N/A"; + private String uuid = "N/A"; + private int minorNumber = -1; + + private PerGpuUtilizations gpuUtilizations; + private PerGpuMemoryUsage gpuMemoryUsage; + private PerGpuTemperature temperature; + + /** + * Convert formats like "34 C", "75.6 %" to float: the numeric token before + * the first space is parsed (e.g. "34 C" becomes 34.0f). + */ + @InterfaceAudience.Private + @InterfaceStability.Unstable + static class StrToFloatBeforeSpaceAdapter extends + XmlAdapter<String, Float> { + @Override + public String marshal(Float v) throws Exception { + if (v == null) { + return ""; + } + return String.valueOf(v); + } + + @Override + public Float unmarshal(String v) throws Exception { + if (v == null) { + return -1f; + } + + return Float.valueOf(v.split(" ")[0]); + } + } + + /** + * Convert formats like "725 MiB" to long (e.g. "725 MiB" becomes 725L). + */ + @InterfaceAudience.Private + @InterfaceStability.Unstable + static class StrToMemAdapter extends XmlAdapter<String, Long> { + @Override + public String marshal(Long v) throws Exception { + if (v == null) { + return ""; + } + return String.valueOf(v) + " MiB"; + } + + @Override + public Long unmarshal(String v) throws Exception { + if (v == null) { + return -1L; + } + return Long.valueOf(v.split(" ")[0]); + } + } + + @XmlElement(name = "temperature") + public PerGpuTemperature getTemperature() { + return temperature; + } + + public void setTemperature(PerGpuTemperature temperature) { + this.temperature = temperature; + } + + @XmlElement(name = "uuid") + public String getUuid() { + return uuid; + } + + public void setUuid(String uuid) { + this.uuid = uuid; + } + + @XmlElement(name = "product_name") + public String getProductName() { + return productName; + } + + public void setProductName(String productName) { + this.productName = productName; + } + + @XmlElement(name = "minor_number") + public int getMinorNumber() { + return minorNumber; + } + + public void setMinorNumber(int minorNumber) { + this.minorNumber = minorNumber; + } + + @XmlElement(name = "utilization") + public PerGpuUtilizations getGpuUtilizations() { + return gpuUtilizations; + } + + public void setGpuUtilizations(PerGpuUtilizations utilizations) { + this.gpuUtilizations = utilizations; + } + + @XmlElement(name = "fb_memory_usage") + public PerGpuMemoryUsage getGpuMemoryUsage() { + return gpuMemoryUsage; + } + + public void setGpuMemoryUsage(PerGpuMemoryUsage gpuMemoryUsage) { + this.gpuMemoryUsage = gpuMemoryUsage; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("ProductName=").append(productName).append(", MinorNumber=") + .append(minorNumber); + + if (getGpuMemoryUsage() != null) { + sb.append(", TotalMemory=").append( + getGpuMemoryUsage().getTotalMemoryMiB()).append("MiB"); + } + + if (getGpuUtilizations() != null) { + sb.append(", Utilization=").append( + getGpuUtilizations().getOverallGpuUtilization()).append("%"); + } + return
sb.toString(); + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java new file mode 100644 index 00000000000..afc1a9679b7 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import javax.xml.bind.annotation.XmlElement; +import javax.xml.bind.annotation.XmlRootElement; +import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter; + +@InterfaceAudience.Private +@InterfaceStability.Unstable +@XmlRootElement(name = "fb_memory_usage") +public class PerGpuMemoryUsage { + long usedMemoryMiB = -1L; + long availMemoryMiB = -1L; + + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class) + @XmlElement(name = "used") + public Long getUsedMemoryMiB() { + return usedMemoryMiB; + } + + public void setUsedMemoryMiB(Long usedMemoryMiB) { + this.usedMemoryMiB = usedMemoryMiB; + } + + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class) + @XmlElement(name = "free") + public Long getAvailMemoryMiB() { + return availMemoryMiB; + } + + public void setAvailMemoryMiB(Long availMemoryMiB) { + this.availMemoryMiB = availMemoryMiB; + } + + public long getTotalMemoryMiB() { + return usedMemoryMiB + availMemoryMiB; + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java new file mode 100644 index 00000000000..ccd60cbf5e5 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import javax.xml.bind.annotation.XmlElement; +import javax.xml.bind.annotation.XmlRootElement; +import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter; + +/** + * Temperature of the GPU, in Celsius. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +@XmlRootElement(name = "temperature") +public class PerGpuTemperature { + private float currentGpuTemp = Float.MIN_VALUE; + private float maxGpuTemp = Float.MIN_VALUE; + private float slowThresholdGpuTemp = Float.MIN_VALUE; + + /** + * Get current GPU temperature in Celsius + * @return temperature + */ + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class) + @XmlElement(name = "gpu_temp") + public Float getCurrentGpuTemp() { + return currentGpuTemp; + } + + public void setCurrentGpuTemp(Float currentGpuTemp) { + this.currentGpuTemp = currentGpuTemp; + } + + /** + * Get max possible GPU temperature in Celsius + * @return temperature + */ + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class) + @XmlElement(name = "gpu_temp_max_threshold") + public Float getMaxGpuTemp() { + return maxGpuTemp; + } + + public void setMaxGpuTemp(Float maxGpuTemp) { + this.maxGpuTemp = maxGpuTemp; + } + + /** + * Get the GPU temperature in Celsius beyond which the GPU may run slower + * @return temperature + */ + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class) + @XmlElement(name = "gpu_temp_slow_threshold") + public Float getSlowThresholdGpuTemp() { + return slowThresholdGpuTemp; + } + + public void setSlowThresholdGpuTemp(Float slowThresholdGpuTemp) { + this.slowThresholdGpuTemp = slowThresholdGpuTemp; + } +}
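+
+// Illustrative fragment this DAO binds to (values hypothetical; the adapter
+// keeps the number before the space, so "34 C" unmarshals to 34.0f):
+//
+//   <temperature>
+//     <gpu_temp>34 C</gpu_temp>
+//     <gpu_temp_max_threshold>97 C</gpu_temp_max_threshold>
+//     <gpu_temp_slow_threshold>92 C</gpu_temp_slow_threshold>
+//   </temperature>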
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java new file mode 100644 index 00000000000..4ef218ba7ea --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java @@ -0,0 +1,50 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import javax.xml.bind.annotation.XmlElement; +import javax.xml.bind.annotation.XmlRootElement; +import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter; + +/** + * GPU utilizations. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +@XmlRootElement(name = "utilization") +public class PerGpuUtilizations { + private float overallGpuUtilization; + + /** + * Overall GPU utilization in percent + * @return utilization + */ + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class) + @XmlElement(name = "gpu_util") + public Float getOverallGpuUtilization() { + return overallGpuUtilization; + } + + public void setOverallGpuUtilization(Float overallGpuUtilization) { + this.overallGpuUtilization = overallGpuUtilization; + } +}
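+
+// Example: a fragment like <utilization><gpu_util>75.6 %</gpu_util></utilization>
+// (value hypothetical) unmarshals to overallGpuUtilization == 75.6f via
+// StrToFloatBeforeSpaceAdapter.
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java new file mode 100644 index 00000000000..13b3ee91bdc --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java @@ -0,0 +1,164 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 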
+ */ + +package org.apache.hadoop.yarn.server.nodemanager; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.net.ServerSocketUtil; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.event.Dispatcher; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.factories.RecordFactory; +import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; +import org.apache.hadoop.yarn.server.api.ResourceTracker; +import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest; +import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse; +import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest; +import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse; +import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest; +import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse; +import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl; +import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl; +import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; +import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; +import org.junit.Assert; +import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.net.InetAddress; +import java.net.UnknownHostException; + +public class NodeManagerTestBase { + // temp fix until metrics system can auto-detect itself running in unit test: + static { + DefaultMetricsSystem.setMiniClusterMode(true); + } + + protected static final Logger LOG = + LoggerFactory.getLogger(TestNodeStatusUpdater.class); + protected static final File basedir = + new File("target", TestNodeStatusUpdater.class.getName()); + protected static final File nmLocalDir = new File(basedir, "nm0"); + protected static final File tmpDir = new File(basedir, "tmpDir"); + protected static final File remoteLogsDir = new File(basedir, "remotelogs"); + protected static final File logsDir = new File(basedir, "logs"); + protected static final RecordFactory recordFactory = RecordFactoryProvider + .getRecordFactory(null); + protected Configuration conf; + + protected YarnConfiguration createNMConfig() throws IOException { + return createNMConfig(ServerSocketUtil.getPort(49170, 10)); + } + + protected YarnConfiguration createNMConfig(int port) throws IOException { + YarnConfiguration conf = new YarnConfiguration(); + String localhostAddress = null; + try { + localhostAddress = InetAddress.getByName("localhost") + .getCanonicalHostName(); + } catch (UnknownHostException e) { + Assert.fail("Unable to get localhost address: " + e.getMessage()); + } + conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB + conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port); + conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":" + + ServerSocketUtil.getPort(49160, 10)); + conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath()); + conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, + remoteLogsDir.getAbsolutePath()); + conf.set(YarnConfiguration.NM_LOCAL_DIRS, 
nmLocalDir.getAbsolutePath()); + conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1); + return conf; + } + + public static class BaseResourceTrackerForTest implements ResourceTracker { + @Override + public RegisterNodeManagerResponse registerNodeManager( + RegisterNodeManagerRequest request) throws YarnException, IOException { + return new RegisterNodeManagerResponsePBImpl(); + } + + @Override + public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) + throws YarnException, IOException { + return new NodeHeartbeatResponsePBImpl(); + } + + @Override + public UnRegisterNodeManagerResponse unRegisterNodeManager( + UnRegisterNodeManagerRequest request) + throws YarnException, IOException { + return new UnRegisterNodeManagerResponsePBImpl(); + } + } + + protected static class BaseNodeStatusUpdaterForTest extends NodeStatusUpdaterImpl { + public ResourceTracker resourceTracker; + protected Context context; + + public BaseNodeStatusUpdaterForTest(Context context, Dispatcher dispatcher, + NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics, + ResourceTracker resourceTracker) { + super(context, dispatcher, healthChecker, metrics); + this.context = context; + this.resourceTracker = resourceTracker; + } + + @Override + protected ResourceTracker getRMClient() { + return resourceTracker; + } + + @Override + protected void stopRMProxy() { + return; + } + } + + public class MyContainerManager extends ContainerManagerImpl { + public boolean signaled = false; + + public MyContainerManager(Context context, ContainerExecutor exec, + DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater, + NodeManagerMetrics metrics, + LocalDirsHandlerService dirsHandler) { + super(context, exec, deletionContext, nodeStatusUpdater, + metrics, dirsHandler); + } + + @Override + public void handle(ContainerManagerEvent event) { + if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) { + signaled = true; + } + } + } + + @Before + public void setUp() throws IOException { + nmLocalDir.mkdirs(); + tmpDir.mkdirs(); + logsDir.mkdirs(); + remoteLogsDir.mkdirs(); + conf = createNMConfig(); + } +}
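+
+// Usage sketch (hypothetical subclass; TestNodeStatusUpdater below follows
+// the same pattern):
+//
+//   public class TestMyNMFeature extends NodeManagerTestBase {
+//     @Test
+//     public void testSomething() throws Exception {
+//       YarnConfiguration conf = createNMConfig();
+//       // wire a NodeManager with BaseNodeStatusUpdaterForTest and a
+//       // custom ResourceTracker, then assert on heartbeat behavior
+//     }
+//   }
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java index 2e9eff529cd..9b180c7eff6 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java @@ -178,7 +178,7 @@ public void testDirPermissions() throws Exception { FileContext lfs = FileContext.getLocalFSFileContext(conf); DefaultContainerExecutor executor = new DefaultContainerExecutor(lfs); executor.setConf(conf); - executor.init(); + executor.init(null); try { executor.createUserLocalDirs(localDirs, user); @@ -317,7 +317,7 @@ public Object answer(InvocationOnMock invocationOnMock) Path workDir = localDir; Path pidFile = new Path(workDir, "pid.txt"); - mockExec.init(); + mockExec.init(null); mockExec.activateContainer(cId, pidFile); int ret = mockExec.launchContainer(new ContainerStartContext.Builder() .setContainer(container) diff --git 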
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java index d4db6b0e20e..dcec4c3b0b4 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java @@ -628,7 +628,7 @@ public void testPostExecuteAfterReacquisition() throws Exception { LinuxContainerExecutor lce = new LinuxContainerExecutor(); lce.setConf(conf); try { - lce.init(); + lce.init(null); } catch (IOException e) { // expected if LCE isn't setup right, but not necessary for this test } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java index 7fbc108a693..8bf9d2e5810 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java @@ -427,7 +427,7 @@ public Object answer(InvocationOnMock invocationOnMock) @Test public void testInit() throws Exception { - mockExec.init(); + mockExec.init(mock(Context.class)); assertEquals(Arrays.asList("--checksetup"), readMockParams()); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java index 92797116075..b31215b0f3d 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java @@ -37,7 +37,7 @@ public static final class InvalidContainerExecutor extends DefaultContainerExecutor { @Override - public void init() throws IOException { + public void init(Context nmContext) throws IOException { throw new IOException("dummy executor init called"); } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java index 11c3c356f32..8435340164a 100644 --- 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java @@ -20,16 +20,14 @@ import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.EOFException; import java.io.File; import java.io.IOException; import java.net.InetAddress; import java.net.InetSocketAddress; -import java.net.UnknownHostException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; @@ -81,8 +79,6 @@ import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; -import org.apache.hadoop.yarn.factories.RecordFactory; -import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.ResourceTracker; @@ -118,41 +114,14 @@ import org.junit.Test; @SuppressWarnings("rawtypes") -public class TestNodeStatusUpdater { - - // temp fix until metrics system can auto-detect itself running in unit test: - static { - DefaultMetricsSystem.setMiniClusterMode(true); - } - - static final Logger LOG = - LoggerFactory.getLogger(TestNodeStatusUpdater.class); - static final File basedir = - new File("target", TestNodeStatusUpdater.class.getName()); - static final File nmLocalDir = new File(basedir, "nm0"); - static final File tmpDir = new File(basedir, "tmpDir"); - static final File remoteLogsDir = new File(basedir, "remotelogs"); - static final File logsDir = new File(basedir, "logs"); - private static final RecordFactory recordFactory = RecordFactoryProvider - .getRecordFactory(null); - +public class TestNodeStatusUpdater extends NodeManagerTestBase { volatile int heartBeatID = 0; volatile Throwable nmStartError = null; private final List registeredNodes = new ArrayList(); private boolean triggered = false; - private Configuration conf; private NodeManager nm; private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false); - @Before - public void setUp() throws IOException { - nmLocalDir.mkdirs(); - tmpDir.mkdirs(); - logsDir.mkdirs(); - remoteLogsDir.mkdirs(); - conf = createNMConfig(); - } - @After public void tearDown() { this.registeredNodes.clear(); @@ -334,29 +303,7 @@ public UnRegisterNodeManagerResponse unRegisterNodeManager( } } - private class MyContainerManager extends ContainerManagerImpl { - public boolean signaled = false; - - public MyContainerManager(Context context, ContainerExecutor exec, - DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater, - NodeManagerMetrics metrics, - LocalDirsHandlerService dirsHandler) { - super(context, exec, deletionContext, nodeStatusUpdater, - metrics, dirsHandler); - } - - @Override - public void handle(ContainerManagerEvent event) { - if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) { - signaled = true; - } - } - } - - private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl { - public 
ResourceTracker resourceTracker; - private Context context; - + private class MyNodeStatusUpdater extends BaseNodeStatusUpdaterForTest { public MyNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) { this(context, dispatcher, healthChecker, metrics, false); @@ -365,19 +312,8 @@ public MyNodeStatusUpdater(Context context, Dispatcher dispatcher, public MyNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics, boolean signalContainer) { - super(context, dispatcher, healthChecker, metrics); - this.context = context; - resourceTracker = new MyResourceTracker(this.context, signalContainer); - } - - @Override - protected ResourceTracker getRMClient() { - return resourceTracker; - } - - @Override - protected void stopRMProxy() { - return; + super(context, dispatcher, healthChecker, metrics, + new MyResourceTracker(context, signalContainer)); } } @@ -1820,7 +1756,6 @@ public void run() { Assert.assertTrue("Test failed with exception(s)" + exceptions, exceptions.isEmpty()); } - // Add new containers info into NM context each time node heart beats. private class MyNMContext extends NMContext { @@ -1924,31 +1859,6 @@ private void verifyNodeStartFailure(String errMessage) throws Exception { this.registeredNodes.size()); } - private YarnConfiguration createNMConfig(int port) throws IOException { - YarnConfiguration conf = new YarnConfiguration(); - String localhostAddress = null; - try { - localhostAddress = InetAddress.getByName("localhost") - .getCanonicalHostName(); - } catch (UnknownHostException e) { - Assert.fail("Unable to get localhost address: " + e.getMessage()); - } - conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB - conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port); - conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":" - + ServerSocketUtil.getPort(49160, 10)); - conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath()); - conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, - remoteLogsDir.getAbsolutePath()); - conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath()); - conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1); - return conf; - } - - private YarnConfiguration createNMConfig() throws IOException { - return createNMConfig(ServerSocketUtil.getPort(49170, 10)); - } - private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) { return new NodeManager() { @Override diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java index 0838f1e523a..3c574966be5 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java @@ -18,26 +18,6 @@ package org.apache.hadoop.yarn.server.nodemanager.amrmproxy; -import java.io.IOException; -import java.security.PrivilegedExceptionAction; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import 
java.util.TreeSet; -import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.ExecutorCompletionService; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; - -import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.UserGroupInformation; @@ -64,6 +44,7 @@ import org.apache.hadoop.yarn.server.api.records.AppCollectorData; import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; +import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; @@ -72,17 +53,36 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; -import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM; import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher; +import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.util.Records; import org.junit.After; import org.junit.Assert; import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.security.PrivilegedExceptionAction; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; /** * Base class for all the AMRMProxyService test cases. 
It provides utility @@ -773,5 +773,9 @@ public ContainerExecutor getContainerExecutor() { getContainerStateTransitionListener() { return null; } + + public ResourcePluginManager getResourcePluginManager() { + return null; + } } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index ae550417484..1e919af3c19 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -31,6 +31,7 @@ import java.io.File; import java.io.IOException; import java.io.PrintWriter; +import java.io.Serializable; import java.nio.ByteBuffer; import java.security.PrivilegedExceptionAction; import java.util.ArrayList; @@ -91,6 +92,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService; @@ -110,6 +112,7 @@ import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.util.timeline.TimelineUtils; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -457,7 +460,7 @@ public void testContainerResizeRecovery() throws Exception { NMStateStoreService stateStore = new NMMemoryStateStoreService(); stateStore.init(conf); stateStore.start(); - Context context = createContext(conf, stateStore); + context = createContext(conf, stateStore); ContainerManagerImpl cm = createContainerManager(context, delSrvc); ((NMContext) context).setContainerManager(cm); cm.init(conf); @@ -467,55 +470,12 @@ public void testContainerResizeRecovery() throws Exception { ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1); ContainerId cid = ContainerId.newContainerId(attemptId, 1); - Map containerEnv = new HashMap<>(); - setFlowContext(containerEnv, "app_name1", appId); - Map serviceData = Collections.emptyMap(); - Credentials containerCreds = new Credentials(); - DataOutputBuffer dob = new DataOutputBuffer(); - containerCreds.writeTokenStorageToStream(dob); - ByteBuffer containerTokens = ByteBuffer.wrap(dob.getData(), 0, - dob.getLength()); - Map acls = Collections.emptyMap(); - File tmpDir = new File("target", - this.getClass().getSimpleName() + "-tmpDir"); - File scriptFile = Shell.appendScriptExtension(tmpDir, "scriptFile"); - PrintWriter fileWriter = new PrintWriter(scriptFile); - if (Shell.WINDOWS) { - 
fileWriter.println("@ping -n 100 127.0.0.1 >nul"); - } else { - fileWriter.write("\numask 0"); - fileWriter.write("\nexec sleep 100"); - } - fileWriter.close(); - FileContext localFS = FileContext.getLocalFSFileContext(); - URL resource_alpha = - URL.fromPath(localFS - .makeQualified(new Path(scriptFile.getAbsolutePath()))); - LocalResource rsrc_alpha = RecordFactoryProvider - .getRecordFactory(null).newRecordInstance(LocalResource.class); - rsrc_alpha.setResource(resource_alpha); - rsrc_alpha.setSize(-1); - rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION); - rsrc_alpha.setType(LocalResourceType.FILE); - rsrc_alpha.setTimestamp(scriptFile.lastModified()); - String destinationFile = "dest_file"; - Map localResources = new HashMap<>(); - localResources.put(destinationFile, rsrc_alpha); - List commands = - Arrays.asList(Shell.getRunScriptCommand(scriptFile)); - ContainerLaunchContext clc = ContainerLaunchContext.newInstance( - localResources, containerEnv, commands, serviceData, - containerTokens, acls); - StartContainersResponse startResponse = startContainer( - context, cm, cid, clc, null); - assertTrue(startResponse.getFailedRequests().isEmpty()); - assertEquals(1, context.getApplications().size()); + + commonLaunchContainer(appId, cid, cm); + Application app = context.getApplications().get(appId); assertNotNull(app); - // make sure the container reaches RUNNING state - waitForNMContainerState(cm, cid, - org.apache.hadoop.yarn.server.nodemanager - .containermanager.container.ContainerState.RUNNING); + Resource targetResource = Resource.newInstance(2048, 2); ContainerUpdateResponse updateResponse = updateContainers(context, cm, cid, targetResource); @@ -538,6 +498,61 @@ public void testContainerResizeRecovery() throws Exception { assertEquals(targetResource, containerStatus.getCapability()); } + @Test + public void testResourceMappingRecoveryForContainer() throws Exception { + conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true); + conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true); + NMStateStoreService stateStore = new NMMemoryStateStoreService(); + stateStore.init(conf); + stateStore.start(); + context = createContext(conf, stateStore); + ContainerManagerImpl cm = createContainerManager(context, delSrvc); + ((NMContext) context).setContainerManager(cm); + cm.init(conf); + cm.start(); + + // add an application by starting a container + ApplicationId appId = ApplicationId.newInstance(0, 1); + ApplicationAttemptId attemptId = + ApplicationAttemptId.newInstance(appId, 1); + ContainerId cid = ContainerId.newContainerId(attemptId, 1); + + commonLaunchContainer(appId, cid, cm); + + Container nmContainer = context.getContainers().get(cid); + + Application app = context.getApplications().get(appId); + assertNotNull(app); + + // store resource mapping of the container + List gpuResources = Arrays.asList("1", "2", "3"); + stateStore.storeAssignedResources(nmContainer, "gpu", gpuResources); + List numaResources = Arrays.asList("numa1"); + stateStore.storeAssignedResources(nmContainer, "numa", numaResources); + List fpgaResources = Arrays.asList("fpga1", "fpga2"); + stateStore.storeAssignedResources(nmContainer, "fpga", fpgaResources); + + cm.stop(); + context = createContext(conf, stateStore); + cm = createContainerManager(context); + ((NMContext) context).setContainerManager(cm); + cm.init(conf); + cm.start(); + assertEquals(1, context.getApplications().size()); + app = context.getApplications().get(appId); + assertNotNull(app); + + 
Assert.assertNotNull(nmContainer); + ResourceMappings resourceMappings = nmContainer.getResourceMappings(); + List assignedResource = resourceMappings + .getAssignedResources("gpu"); + Assert.assertTrue(assignedResource.equals(gpuResources)); + Assert.assertTrue( + resourceMappings.getAssignedResources("numa").equals(numaResources)); + Assert.assertTrue( + resourceMappings.getAssignedResources("fpga").equals(fpgaResources)); + } + @Test public void testContainerCleanupOnShutdown() throws Exception { ApplicationId appId = ApplicationId.newInstance(0, 1); @@ -610,6 +625,57 @@ public void testContainerCleanupOnShutdown() throws Exception { verify(cm, never()).handle(isA(CMgrCompletedAppsEvent.class)); } + private void commonLaunchContainer(ApplicationId appId, ContainerId cid, + ContainerManagerImpl cm) throws Exception { + Map containerEnv = new HashMap<>(); + setFlowContext(containerEnv, "app_name1", appId); + Map serviceData = Collections.emptyMap(); + Credentials containerCreds = new Credentials(); + DataOutputBuffer dob = new DataOutputBuffer(); + containerCreds.writeTokenStorageToStream(dob); + ByteBuffer containerTokens = ByteBuffer.wrap(dob.getData(), 0, + dob.getLength()); + Map acls = Collections.emptyMap(); + File tmpDir = new File("target", + this.getClass().getSimpleName() + "-tmpDir"); + File scriptFile = Shell.appendScriptExtension(tmpDir, "scriptFile"); + PrintWriter fileWriter = new PrintWriter(scriptFile); + if (Shell.WINDOWS) { + fileWriter.println("@ping -n 100 127.0.0.1 >nul"); + } else { + fileWriter.write("\numask 0"); + fileWriter.write("\nexec sleep 100"); + } + fileWriter.close(); + FileContext localFS = FileContext.getLocalFSFileContext(); + URL resource_alpha = + URL.fromPath(localFS + .makeQualified(new Path(scriptFile.getAbsolutePath()))); + LocalResource rsrc_alpha = RecordFactoryProvider + .getRecordFactory(null).newRecordInstance(LocalResource.class); + rsrc_alpha.setResource(resource_alpha); + rsrc_alpha.setSize(-1); + rsrc_alpha.setVisibility(LocalResourceVisibility.APPLICATION); + rsrc_alpha.setType(LocalResourceType.FILE); + rsrc_alpha.setTimestamp(scriptFile.lastModified()); + String destinationFile = "dest_file"; + Map localResources = new HashMap<>(); + localResources.put(destinationFile, rsrc_alpha); + List commands = + Arrays.asList(Shell.getRunScriptCommand(scriptFile)); + ContainerLaunchContext clc = ContainerLaunchContext.newInstance( + localResources, containerEnv, commands, serviceData, + containerTokens, acls); + StartContainersResponse startResponse = startContainer( + context, cm, cid, clc, null); + assertTrue(startResponse.getFailedRequests().isEmpty()); + assertEquals(1, context.getApplications().size()); + // make sure the container reaches RUNNING state + waitForNMContainerState(cm, cid, + org.apache.hadoop.yarn.server.nodemanager + .containermanager.container.ContainerState.RUNNING); + } + private ContainerManagerImpl createContainerManager(Context context, DeletionService delSrvc) { return new ContainerManagerImpl(context, exec, delSrvc, diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java index e5414a587f1..0563694f004 100644 --- 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java @@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.Context; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -30,6 +31,8 @@ import java.util.List; +import static org.mockito.Mockito.mock; + public class TestResourceHandlerModule { private static final Logger LOG = LoggerFactory.getLogger(TestResourceHandlerModule.class); @@ -62,7 +65,7 @@ public void testOutboundBandwidthHandler() { //Ensure that outbound bandwidth resource handler is present in the chain ResourceHandlerChain resourceHandlerChain = ResourceHandlerModule - .getConfiguredResourceHandlerChain(networkEnabledConf); + .getConfiguredResourceHandlerChain(networkEnabledConf, mock(Context.class)); List resourceHandlers = resourceHandlerChain .getResourceHandlerList(); //Exactly one resource handler in chain @@ -88,7 +91,8 @@ public void testDiskResourceHandler() throws Exception { Assert.assertNotNull(handler); ResourceHandlerChain resourceHandlerChain = - ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf); + ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf, + mock(Context.class)); List resourceHandlers = resourceHandlerChain.getResourceHandlerList(); // Exactly one resource handler in chain diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java new file mode 100644 index 00000000000..dd78ebfb5b9 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -0,0 +1,474 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants; +import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; +import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; +import org.apache.hadoop.yarn.util.resource.TestResourceUtils; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyList; +import static org.mockito.Matchers.anyString; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class TestGpuResourceHandler { + private CGroupsHandler mockCGroupsHandler; + private PrivilegedOperationExecutor mockPrivilegedExecutor; + private GpuResourceHandlerImpl gpuResourceHandler; + private NMStateStoreService mockNMStateStore; + private ConcurrentHashMap<ContainerId, Container> runningContainersMap; + + @Before + public void setup() { + TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI); + + mockCGroupsHandler = mock(CGroupsHandler.class); + mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class); + mockNMStateStore = mock(NMStateStoreService.class); + + Context nmctx = mock(Context.class); + when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore); + runningContainersMap = new ConcurrentHashMap<>(); + when(nmctx.getContainers()).thenReturn(runningContainersMap); + + gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler, 
mockPrivilegedExecutor); + } + + @Test + public void testBootStrap() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0"); + + GpuDiscoverer.getInstance().initialize(conf); + + gpuResourceHandler.bootstrap(conf); + verify(mockCGroupsHandler, times(1)).initializeCGroupController( + CGroupsHandler.CGroupController.DEVICES); + } + + private static ContainerId getContainerId(int id) { + return ContainerId.newContainerId(ApplicationAttemptId + .newInstance(ApplicationId.newInstance(1234L, 1), 1), id); + } + + private static Container mockContainerWithGpuRequest(int id, int numGpuRequest, + boolean dockerContainerEnabled) { + Container c = mock(Container.class); + when(c.getContainerId()).thenReturn(getContainerId(id)); + + Resource res = Resource.newInstance(1024, 1); + ResourceMappings resMapping = new ResourceMappings(); + + res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest); + when(c.getResource()).thenReturn(res); + when(c.getResourceMappings()).thenReturn(resMapping); + + ContainerLaunchContext clc = mock(ContainerLaunchContext.class); + Map<String, String> env = new HashMap<>(); + if (dockerContainerEnabled) { + env.put(ContainerRuntimeConstants.ENV_CONTAINER_TYPE, "docker"); + } + when(clc.getEnvironment()).thenReturn(env); + when(c.getLaunchContext()).thenReturn(clc); + return c; + } + + private static Container mockContainerWithGpuRequest(int id, + int numGpuRequest) { + return mockContainerWithGpuRequest(id, numGpuRequest, false); + } + + private void verifyDeniedDevices(ContainerId containerId, + List<GpuDevice> deniedDevices) + throws ResourceHandlerException, PrivilegedOperationException { + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, containerId.toString()); + + if (null != deniedDevices && !deniedDevices.isEmpty()) { + List<Integer> deniedDevicesMinorNumber = new ArrayList<>(); + for (GpuDevice deniedDevice : deniedDevices) { + deniedDevicesMinorNumber.add(deniedDevice.getMinorNumber()); + } + verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation( + new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays + .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION, + containerId.toString(), + GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION, + StringUtils.join(",", deniedDevicesMinorNumber))), true); + } + } + + private void commonTestAllocation(boolean dockerContainerEnabled) + throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); + GpuDiscoverer.getInstance().initialize(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Start container 1, requesting 3 GPUs */ + gpuResourceHandler.preStart( + mockContainerWithGpuRequest(1, 3, dockerContainerEnabled)); + + // Only device=4 will be blocked. + if (dockerContainerEnabled) { + verifyDeniedDevices(getContainerId(1), Collections.emptyList()); + } else { + verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3, 4))); + } + + /* Start container 2, requesting 2 GPUs.
Expected to fail */
+    boolean failedToAllocate = false;
+    try {
+      gpuResourceHandler.preStart(
+          mockContainerWithGpuRequest(2, 2, dockerContainerEnabled));
+    } catch (ResourceHandlerException e) {
+      failedToAllocate = true;
+    }
+    Assert.assertTrue(failedToAllocate);
+
+    /* Start container 3, asks 1 GPU, expected to succeed */
+    gpuResourceHandler.preStart(
+        mockContainerWithGpuRequest(3, 1, dockerContainerEnabled));
+
+    // devices = 0/1/3 will be blocked
+    if (dockerContainerEnabled) {
+      verifyDeniedDevices(getContainerId(3), Collections.emptyList());
+    } else {
+      verifyDeniedDevices(getContainerId(3), Arrays
+          .asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
+              new GpuDevice(2, 3)));
+    }
+
+    /* Start container 4, asks 0 GPUs, expected to succeed */
+    gpuResourceHandler.preStart(
+        mockContainerWithGpuRequest(4, 0, dockerContainerEnabled));
+
+    if (dockerContainerEnabled) {
+      verifyDeniedDevices(getContainerId(4), Collections.emptyList());
+    } else {
+      // All devices will be blocked
+      verifyDeniedDevices(getContainerId(4), Arrays
+          .asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
+              new GpuDevice(3, 4)));
+    }
+
+    /* Release container-1, expect cgroups deleted */
+    gpuResourceHandler.postComplete(getContainerId(1));
+
+    verify(mockCGroupsHandler, times(1)).createCGroup(
+        CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+    Assert.assertEquals(3,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Release container-3, expect cgroups deleted */
+    gpuResourceHandler.postComplete(getContainerId(3));
+
+    verify(mockCGroupsHandler, times(1)).createCGroup(
+        CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString());
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+  }
+
+  @Test
+  public void testAllocationWhenDockerContainerEnabled() throws Exception {
+    // When docker container is enabled, no devices should be written to
+    // devices.deny.
+    commonTestAllocation(true);
+  }
+
+  @Test
+  public void testAllocation() throws Exception {
+    commonTestAllocation(false);
+  }
+
+  @SuppressWarnings("unchecked")
+  @Test
+  public void testAssignedGpuWillBeCleanedupWhenStoreOpFails()
+      throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
+    GpuDiscoverer.getInstance().initialize(conf);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    doThrow(new IOException("Exception ...")).when(mockNMStateStore)
+        .storeAssignedResources(
+            any(Container.class), anyString(), anyList());
+
+    boolean exception = false;
+    /* Start container 1, asks 3 GPUs */
+    try {
+      gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3));
+    } catch (ResourceHandlerException e) {
+      exception = true;
+    }
+
+    Assert.assertTrue("preStart should throw exception", exception);
+
+    // After preStart, we still have 4 available GPUs since the store op fails.
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+  }
+
+  @Test
+  public void testAllocationWithoutAllowedGpus() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
+    GpuDiscoverer.getInstance().initialize(conf);
+
+    try {
+      gpuResourceHandler.bootstrap(conf);
+      Assert.fail("Should fail because no GPU available");
+    } catch (ResourceHandlerException e) {
+      // Expected because of no resource available
+    }
+
+    /* Start container 1, asks 0 GPUs */
+    gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0));
+    verifyDeniedDevices(getContainerId(1), Collections.emptyList());
+
+    /* Start container 2, asks 1 GPU. Expected to fail */
+    boolean failedToAllocate = false;
+    try {
+      gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1));
+    } catch (ResourceHandlerException e) {
+      failedToAllocate = true;
+    }
+    Assert.assertTrue(failedToAllocate);
+
+    /* Release container 1, expect cgroups deleted */
+    gpuResourceHandler.postComplete(getContainerId(1));
+
+    verify(mockCGroupsHandler, times(1)).createCGroup(
+        CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString());
+    Assert.assertEquals(0,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+  }
+
+  @Test
+  public void testAllocationStored() throws Exception {
+    Configuration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
+    GpuDiscoverer.getInstance().initialize(conf);
+
+    gpuResourceHandler.bootstrap(conf);
+    Assert.assertEquals(4,
+        gpuResourceHandler.getGpuAllocator().getAvailableGpus());
+
+    /* Start container 1, asks 3 GPUs */
+    Container container = mockContainerWithGpuRequest(1, 3);
+    gpuResourceHandler.preStart(container);
+
+    verify(mockNMStateStore).storeAssignedResources(container,
+        ResourceInformation.GPU_URI, Arrays
+            .asList(new GpuDevice(0, 0), new GpuDevice(1, 1),
+                new GpuDevice(2, 3)));
+
+    // Only device=4 will be blocked.
+    verifyDeniedDevices(getContainerId(1), Arrays.asList(new GpuDevice(3, 4)));
+
+    /* Start container 2, asks 0 GPUs, expected to succeed */
+    container = mockContainerWithGpuRequest(2, 0);
+    gpuResourceHandler.preStart(container);
+
+    verifyDeniedDevices(getContainerId(2), Arrays
+        .asList(new GpuDevice(0, 0), new GpuDevice(1, 1), new GpuDevice(2, 3),
+            new GpuDevice(3, 4)));
+    Assert.assertEquals(0, container.getResourceMappings()
+        .getAssignedResources(ResourceInformation.GPU_URI).size());
+
+    // Store assigned resources will not be invoked.
+ verify(mockNMStateStore, never()).storeAssignedResources( + eq(container), eq(ResourceInformation.GPU_URI), anyList()); + } + + @Test + public void testAllocationStoredWithNULLStateStore() throws Exception { + NMNullStateStoreService mockNMNULLStateStore = mock(NMNullStateStoreService.class); + + Context nmnctx = mock(Context.class); + when(nmnctx.getNMStateStore()).thenReturn(mockNMNULLStateStore); + + GpuResourceHandlerImpl gpuNULLStateResourceHandler = + new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler, + mockPrivilegedExecutor); + + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); + GpuDiscoverer.getInstance().initialize(conf); + + gpuNULLStateResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuNULLStateResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Start container 1, asks 3 containers */ + Container container = mockContainerWithGpuRequest(1, 3); + gpuNULLStateResourceHandler.preStart(container); + + verify(nmnctx.getNMStateStore()).storeAssignedResources(container, + ResourceInformation.GPU_URI, Arrays + .asList(new GpuDevice(0, 0), new GpuDevice(1, 1), + new GpuDevice(2, 3))); + } + + @Test + public void testRecoverResourceAllocation() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4"); + GpuDiscoverer.getInstance().initialize(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + Container nmContainer = mock(Container.class); + ResourceMappings rmap = new ResourceMappings(); + ResourceMappings.AssignedResources ar = + new ResourceMappings.AssignedResources(); + ar.updateAssignedResources( + Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3))); + rmap.addAssignedResources(ResourceInformation.GPU_URI, ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(1), nmContainer); + + // TEST CASE + // Reacquire container restore state of GPU Resource Allocator. + gpuResourceHandler.reacquireContainer(getContainerId(1)); + + Map deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue( + deviceAllocationMapping.keySet().contains(new GpuDevice(1, 1))); + Assert.assertTrue( + deviceAllocationMapping.keySet().contains(new GpuDevice(2, 3))); + Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)), + getContainerId(1)); + + // TEST CASE + // Try to reacquire a container but requested device is not in allowed list. + nmContainer = mock(Container.class); + rmap = new ResourceMappings(); + ar = new ResourceMappings.AssignedResources(); + // id=5 is not in allowed list. + ar.updateAssignedResources( + Arrays.asList(new GpuDevice(3, 4), new GpuDevice(4, 5))); + rmap.addAssignedResources(ResourceInformation.GPU_URI, ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(2), nmContainer); + + boolean caughtException = false; + try { + gpuResourceHandler.reacquireContainer(getContainerId(1)); + } catch (ResourceHandlerException e) { + caughtException = true; + } + Assert.assertTrue( + "Should fail since requested device Id is not in allowed list", + caughtException); + + // Make sure internal state not changed. 
+ deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue(deviceAllocationMapping.keySet() + .containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3)))); + Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)), + getContainerId(1)); + + // TEST CASE + // Try to reacquire a container but requested device is already assigned. + nmContainer = mock(Container.class); + rmap = new ResourceMappings(); + ar = new ResourceMappings.AssignedResources(); + // id=3 is already assigned + ar.updateAssignedResources( + Arrays.asList(new GpuDevice(3, 4), new GpuDevice(2, 3))); + rmap.addAssignedResources("gpu", ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(2), nmContainer); + + caughtException = false; + try { + gpuResourceHandler.reacquireContainer(getContainerId(1)); + } catch (ResourceHandlerException e) { + caughtException = true; + } + Assert.assertTrue( + "Should fail since requested device Id is not in allowed list", + caughtException); + + // Make sure internal state not changed. + deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMappingCopy(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue(deviceAllocationMapping.keySet() + .containsAll(Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 3)))); + Assert.assertEquals(deviceAllocationMapping.get(new GpuDevice(1, 1)), + getContainerId(1)); + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java index e21eea0cdc7..2cca2774207 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java @@ -73,7 +73,7 @@ private static class MockExecutor extends ContainerExecutor { @Override - public void init() throws IOException { + public void init(Context nmContext) throws IOException { } @Override public void startLocalizer(LocalizerStartContext ctx) diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java new file mode 100644 index 00000000000..bcadf76e4bd --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/TestResourcePluginManager.java @@ -0,0 +1,261 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.service.ServiceOperations; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.event.Dispatcher; +import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.DeletionService; +import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor; +import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; +import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; +import org.apache.hadoop.yarn.server.nodemanager.NodeManager; +import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase; +import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; +import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.mockito.Matchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class TestResourcePluginManager extends NodeManagerTestBase { + private NodeManager nm; + + ResourcePluginManager stubResourcePluginmanager() { + // Stub ResourcePluginManager + final ResourcePluginManager rpm = mock(ResourcePluginManager.class); + Map plugins = 
new HashMap<>(); + + // First resource plugin + ResourcePlugin resourcePlugin = mock(ResourcePlugin.class); + NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin = mock( + NodeResourceUpdaterPlugin.class); + when(resourcePlugin.getNodeResourceHandlerInstance()).thenReturn( + nodeResourceUpdaterPlugin); + plugins.put("resource1", resourcePlugin); + + // Second resource plugin + resourcePlugin = mock(ResourcePlugin.class); + when(resourcePlugin.createResourceHandler(any(Context.class), any( + CGroupsHandler.class), any(PrivilegedOperationExecutor.class))) + .thenReturn(new CustomizedResourceHandler()); + plugins.put("resource2", resourcePlugin); + when(rpm.getNameToPlugins()).thenReturn(plugins); + return rpm; + } + + @After + public void tearDown() { + if (nm != null) { + try { + ServiceOperations.stop(nm); + } catch (Throwable t) { + // ignore + } + } + } + + private class CustomizedResourceHandler implements ResourceHandler { + + @Override + public List bootstrap(Configuration configuration) + throws ResourceHandlerException { + return null; + } + + @Override + public List preStart(Container container) + throws ResourceHandlerException { + return null; + } + + @Override + public List reacquireContainer(ContainerId containerId) + throws ResourceHandlerException { + return null; + } + + @Override + public List postComplete(ContainerId containerId) + throws ResourceHandlerException { + return null; + } + + @Override + public List teardown() + throws ResourceHandlerException { + return null; + } + } + + private class MyMockNM extends NodeManager { + private final ResourcePluginManager rpm; + + public MyMockNM(ResourcePluginManager rpm) { + this.rpm = rpm; + } + + @Override + protected NodeStatusUpdater createNodeStatusUpdater(Context context, + Dispatcher dispatcher, NodeHealthCheckerService healthChecker) { + ((NodeManager.NMContext)context).setResourcePluginManager(rpm); + return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker, + metrics, new BaseResourceTrackerForTest()); + } + + @Override + protected ContainerManagerImpl createContainerManager(Context context, + ContainerExecutor exec, DeletionService del, + NodeStatusUpdater nodeStatusUpdater, + ApplicationACLsManager aclsManager, + LocalDirsHandlerService diskhandler) { + return new MyContainerManager(context, exec, del, nodeStatusUpdater, + metrics, diskhandler); + } + + @Override + protected ResourcePluginManager createResourcePluginManager() { + return rpm; + } + } + + public class MyLCE extends LinuxContainerExecutor { + private PrivilegedOperationExecutor poe = mock(PrivilegedOperationExecutor.class); + + @Override + protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() { + return poe; + } + } + + /* + * Make sure ResourcePluginManager is initialized during NM start up. + */ + @Test(timeout = 30000) + public void testResourcePluginManagerInitialization() throws Exception { + final ResourcePluginManager rpm = stubResourcePluginmanager(); + nm = new MyMockNM(rpm); + + YarnConfiguration conf = createNMConfig(); + nm.init(conf); + verify(rpm, times(1)).initialize( + any(Context.class)); + } + + /* + * Make sure ResourcePluginManager is invoked during NM update. 
+ */ + @Test(timeout = 30000) + public void testNodeStatusUpdaterWithResourcePluginsEnabled() throws Exception { + final ResourcePluginManager rpm = stubResourcePluginmanager(); + + nm = new MyMockNM(rpm); + + YarnConfiguration conf = createNMConfig(); + nm.init(conf); + nm.start(); + + NodeResourceUpdaterPlugin nodeResourceUpdaterPlugin = + rpm.getNameToPlugins().get("resource1") + .getNodeResourceHandlerInstance(); + + verify(nodeResourceUpdaterPlugin, times(1)).updateConfiguredResource( + any(Resource.class)); + } + + /* + * Make sure ResourcePluginManager is used to initialize ResourceHandlerChain + */ + @Test(timeout = 30000) + public void testLinuxContainerExecutorWithResourcePluginsEnabled() throws Exception { + final ResourcePluginManager rpm = stubResourcePluginmanager(); + final LinuxContainerExecutor lce = new MyLCE(); + + nm = new NodeManager() { + @Override + protected NodeStatusUpdater createNodeStatusUpdater(Context context, + Dispatcher dispatcher, NodeHealthCheckerService healthChecker) { + ((NMContext)context).setResourcePluginManager(rpm); + return new BaseNodeStatusUpdaterForTest(context, dispatcher, healthChecker, + metrics, new BaseResourceTrackerForTest()); + } + + @Override + protected ContainerManagerImpl createContainerManager(Context context, + ContainerExecutor exec, DeletionService del, + NodeStatusUpdater nodeStatusUpdater, + ApplicationACLsManager aclsManager, + LocalDirsHandlerService diskhandler) { + return new MyContainerManager(context, exec, del, nodeStatusUpdater, + metrics, diskhandler); + } + + @Override + protected ContainerExecutor createContainerExecutor(Configuration conf) { + ((NMContext)this.getNMContext()).setResourcePluginManager(rpm); + lce.setConf(conf); + return lce; + } + }; + + YarnConfiguration conf = createNMConfig(); + + nm.init(conf); + nm.start(); + + ResourceHandler handler = lce.getResourceHandler(); + Assert.assertNotNull(handler); + Assert.assertTrue(handler instanceof ResourceHandlerChain); + + boolean newHandlerAdded = false; + for (ResourceHandler h : ((ResourceHandlerChain) handler) + .getResourceHandlerList()) { + if (h instanceof CustomizedResourceHandler) { + newHandlerAdded = true; + break; + } + } + Assert.assertTrue("New ResourceHandler should be added", newHandlerAdded); + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java new file mode 100644 index 00000000000..4abb633a69a --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java @@ -0,0 +1,141 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.List; + +public class TestGpuDiscoverer { + private String getTestParentFolder() { + File f = new File("target/temp/" + TestGpuDiscoverer.class.getName()); + return f.getAbsolutePath(); + } + + private void touchFile(File f) throws IOException { + new FileOutputStream(f).close(); + } + + @Before + public void before() throws IOException { + String folder = getTestParentFolder(); + File f = new File(folder); + FileUtils.deleteDirectory(f); + f.mkdirs(); + } + + @Test + public void testLinuxGpuResourceDiscoverPluginConfig() throws Exception { + // Only run this on demand. + Assume.assumeTrue(Boolean.valueOf( + System.getProperty("RunLinuxGpuResourceDiscoverPluginConfigTest"))); + + // test case 1, check default setting. + Configuration conf = new Configuration(false); + GpuDiscoverer plugin = new GpuDiscoverer(); + plugin.initialize(conf); + Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME, + plugin.getPathOfGpuBinary()); + Assert.assertNotNull(plugin.getEnvironmentToRunCommand().get("PATH")); + Assert.assertTrue( + plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia")); + + // test case 2, check mandatory set path. + File fakeBinary = new File(getTestParentFolder(), + GpuDiscoverer.DEFAULT_BINARY_NAME); + touchFile(fakeBinary); + conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, getTestParentFolder()); + plugin = new GpuDiscoverer(); + plugin.initialize(conf); + Assert.assertEquals(fakeBinary.getAbsolutePath(), + plugin.getPathOfGpuBinary()); + Assert.assertNull(plugin.getEnvironmentToRunCommand().get("PATH")); + + // test case 3, check mandatory set path, but binary doesn't exist so default + // path will be used. 
+    fakeBinary.delete();
+    plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    Assert.assertEquals(GpuDiscoverer.DEFAULT_BINARY_NAME,
+        plugin.getPathOfGpuBinary());
+    Assert.assertTrue(
+        plugin.getEnvironmentToRunCommand().get("PATH").contains("nvidia"));
+  }
+
+  @Test
+  public void testGpuDiscover() throws YarnException {
+    // This test requires a real GPU and nvidia-smi, so only run it on
+    // demand: pass -DrunGpuDiscoverUnitTest=true to enable it.
+    Assume.assumeTrue(
+        Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
+    Configuration conf = new Configuration(false);
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+    GpuDeviceInformation info = plugin.getGpuDeviceInformation();
+
+    Assert.assertTrue(info.getGpus().size() > 0);
+    Assert.assertEquals(plugin.getGpusUsableByYarn().size(),
+        info.getGpus().size());
+  }
+
+  @Test
+  public void getNumberOfUsableGpusFromConfig() throws YarnException {
+    Configuration conf = new Configuration(false);
+
+    // Illegal format
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3");
+    GpuDiscoverer plugin = new GpuDiscoverer();
+    try {
+      plugin.initialize(conf);
+      plugin.getGpusUsableByYarn();
+      Assert.fail("Illegal format, should fail.");
+    } catch (YarnException e) {
+      // Expected
+    }
+
+    // Valid format
+    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:2,3:4");
+    plugin = new GpuDiscoverer();
+    plugin.initialize(conf);
+
+    List<GpuDevice> usableGpuDevices = plugin.getGpusUsableByYarn();
+    Assert.assertEquals(4, usableGpuDevices.size());
+
+    Assert.assertEquals(0, usableGpuDevices.get(0).getIndex());
+    Assert.assertEquals(1, usableGpuDevices.get(1).getIndex());
+    Assert.assertEquals(2, usableGpuDevices.get(2).getIndex());
+    Assert.assertEquals(3, usableGpuDevices.get(3).getIndex());
+
+    Assert.assertEquals(0, usableGpuDevices.get(0).getMinorNumber());
+    Assert.assertEquals(1, usableGpuDevices.get(1).getMinorNumber());
+    Assert.assertEquals(2, usableGpuDevices.get(2).getMinorNumber());
+    Assert.assertEquals(4, usableGpuDevices.get(3).getMinorNumber());
+  }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
index 0e46234a91f..4364709b56f 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.nodemanager.recovery;
 import java.io.IOException;
+import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -42,6 +43,8 @@ import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.api.records.MasterKey;
 import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
@@ -124,6
+127,7 @@ public synchronized void removeApplication(ApplicationId appId) rcsCopy.setRemainingRetryAttempts(rcs.getRemainingRetryAttempts()); rcsCopy.setWorkDir(rcs.getWorkDir()); rcsCopy.setLogDir(rcs.getLogDir()); + rcsCopy.setResourceMappings(rcs.getResourceMappings()); result.add(rcsCopy); } return result; @@ -511,6 +515,20 @@ public synchronized void removeAMRMProxyAppContext( amrmProxyState.getAppContexts().remove(attempt); } + @Override + public void storeAssignedResources(Container container, + String resourceType, List assignedResources) + throws IOException { + ResourceMappings.AssignedResources ar = + new ResourceMappings.AssignedResources(); + ar.updateAssignedResources(assignedResources); + containerStates.get(container.getContainerId()).getResourceMappings() + .addAssignedResources(resourceType, ar); + + // update container resource mapping. + updateContainerResourceMapping(container, resourceType, assignedResources); + } + private static class TrackerState { Map inProgressMap = new HashMap(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java index e2c28aabc5c..f93f3e6cb73 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java @@ -32,9 +32,11 @@ import static org.mockito.Mockito.timeout; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; import java.io.File; import java.io.IOException; +import java.io.Serializable; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; @@ -71,6 +73,8 @@ import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.nodemanager.amrmproxy.AMRMProxyTokenSecretManager; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.LocalResourceTrackerState; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredApplicationsState; @@ -1011,46 +1015,12 @@ public void testUnexpectedKeyDoesntThrowException() throws IOException { .loadContainersState(); assertTrue(recoveredContainers.isEmpty()); - // create a container request ApplicationId appId = ApplicationId.newInstance(1234, 3); ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 4); ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5); - LocalResource lrsrc = LocalResource.newInstance( - URL.newInstance("hdfs", "somehost", 12345, "/some/path/to/rsrc"), - LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, 123L, - 1234567890L); - Map localResources = - new HashMap(); - 
localResources.put("rsrc", lrsrc); - Map env = new HashMap(); - env.put("somevar", "someval"); - List containerCmds = new ArrayList(); - containerCmds.add("somecmd"); - containerCmds.add("somearg"); - Map serviceData = new HashMap(); - serviceData.put("someservice", - ByteBuffer.wrap(new byte[] { 0x1, 0x2, 0x3 })); - ByteBuffer containerTokens = ByteBuffer - .wrap(new byte[] { 0x7, 0x8, 0x9, 0xa }); - Map acls = - new HashMap(); - acls.put(ApplicationAccessType.VIEW_APP, "viewuser"); - acls.put(ApplicationAccessType.MODIFY_APP, "moduser"); - ContainerLaunchContext clc = ContainerLaunchContext.newInstance( - localResources, env, containerCmds, - serviceData, containerTokens, acls); - Resource containerRsrc = Resource.newInstance(1357, 3); - ContainerTokenIdentifier containerTokenId = new ContainerTokenIdentifier( - containerId, "host", "user", containerRsrc, 9876543210L, 42, 2468, - Priority.newInstance(7), 13579); - Token containerToken = Token.newInstance(containerTokenId.getBytes(), - ContainerTokenIdentifier.KIND.toString(), "password".getBytes(), - "tokenservice"); - StartContainerRequest containerReq = StartContainerRequest.newInstance(clc, - containerToken); - - stateStore.storeContainer(containerId, 0, 0, containerReq); + StartContainerRequest startContainerRequest = storeMockContainer( + containerId); // add a invalid key byte[] invalidKey = ("ContainerManager/containers/" @@ -1063,7 +1033,7 @@ public void testUnexpectedKeyDoesntThrowException() throws IOException { assertEquals(RecoveredContainerStatus.REQUESTED, rcs.getStatus()); assertEquals(ContainerExitStatus.INVALID, rcs.getExitCode()); assertEquals(false, rcs.getKilled()); - assertEquals(containerReq, rcs.getStartRequest()); + assertEquals(startContainerRequest, rcs.getStartRequest()); assertTrue(rcs.getDiagnostics().isEmpty()); assertEquals(RecoveredContainerType.KILL, rcs.getRecoveryType()); // assert unknown keys are cleaned up finally @@ -1171,6 +1141,97 @@ public void testAMRMProxyStorage() throws IOException { } } + @Test + public void testStateStoreForResourceMapping() throws IOException { + // test empty when no state + List recoveredContainers = stateStore + .loadContainersState(); + assertTrue(recoveredContainers.isEmpty()); + + ApplicationId appId = ApplicationId.newInstance(1234, 3); + ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, + 4); + ContainerId containerId = ContainerId.newContainerId(appAttemptId, 5); + storeMockContainer(containerId); + + Container container = mock(Container.class); + when(container.getContainerId()).thenReturn(containerId); + ResourceMappings resourceMappings = new ResourceMappings(); + when(container.getResourceMappings()).thenReturn(resourceMappings); + + // Store ResourceMapping + stateStore.storeAssignedResources(container, "gpu", + Arrays.asList("1", "2", "3")); + // This will overwrite above + List gpuRes1 = Arrays.asList("1", "2", "4"); + stateStore.storeAssignedResources(container, "gpu", gpuRes1); + List fpgaRes = Arrays.asList("3", "4", "5", "6"); + stateStore.storeAssignedResources(container, "fpga", fpgaRes); + List numaRes = Arrays.asList("numa1"); + stateStore.storeAssignedResources(container, "numa", numaRes); + + // add a invalid key + restartStateStore(); + recoveredContainers = stateStore.loadContainersState(); + assertEquals(1, recoveredContainers.size()); + RecoveredContainerState rcs = recoveredContainers.get(0); + List res = rcs.getResourceMappings() + .getAssignedResources("gpu"); + Assert.assertTrue(res.equals(gpuRes1)); + 
Assert.assertTrue( + resourceMappings.getAssignedResources("gpu").equals(gpuRes1)); + + res = rcs.getResourceMappings().getAssignedResources("fpga"); + Assert.assertTrue(res.equals(fpgaRes)); + Assert.assertTrue( + resourceMappings.getAssignedResources("fpga").equals(fpgaRes)); + + res = rcs.getResourceMappings().getAssignedResources("numa"); + Assert.assertTrue(res.equals(numaRes)); + Assert.assertTrue( + resourceMappings.getAssignedResources("numa").equals(numaRes)); + } + + private StartContainerRequest storeMockContainer(ContainerId containerId) + throws IOException { + // create a container request + LocalResource lrsrc = LocalResource.newInstance( + URL.newInstance("hdfs", "somehost", 12345, "/some/path/to/rsrc"), + LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, 123L, + 1234567890L); + Map localResources = + new HashMap(); + localResources.put("rsrc", lrsrc); + Map env = new HashMap(); + env.put("somevar", "someval"); + List containerCmds = new ArrayList(); + containerCmds.add("somecmd"); + containerCmds.add("somearg"); + Map serviceData = new HashMap(); + serviceData.put("someservice", + ByteBuffer.wrap(new byte[] { 0x1, 0x2, 0x3 })); + ByteBuffer containerTokens = ByteBuffer + .wrap(new byte[] { 0x7, 0x8, 0x9, 0xa }); + Map acls = + new HashMap(); + acls.put(ApplicationAccessType.VIEW_APP, "viewuser"); + acls.put(ApplicationAccessType.MODIFY_APP, "moduser"); + ContainerLaunchContext clc = ContainerLaunchContext.newInstance( + localResources, env, containerCmds, + serviceData, containerTokens, acls); + Resource containerRsrc = Resource.newInstance(1357, 3); + ContainerTokenIdentifier containerTokenId = new ContainerTokenIdentifier( + containerId, "host", "user", containerRsrc, 9876543210L, 42, 2468, + Priority.newInstance(7), 13579); + Token containerToken = Token.newInstance(containerTokenId.getBytes(), + ContainerTokenIdentifier.KIND.toString(), "password".getBytes(), + "tokenservice"); + StartContainerRequest containerReq = StartContainerRequest.newInstance(clc, + containerToken); + stateStore.storeContainer(containerId, 0, 0, containerReq); + return containerReq; + } + private static class NMTokenSecretManagerForTest extends BaseNMTokenSecretManager { public MasterKey generateKey() { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java index 8ad6d7c12a3..3d669e22404 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java @@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceSet; import org.apache.hadoop.yarn.server.utils.BuilderUtils; @@ -244,4 +245,9 @@ public long getContainerStartTime() { 
public void sendPauseEvent(String description) { } + + @Override + public ResourceMappings getResourceMappings() { + return null; + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java index ed7b9ea0aa4..39e403da5bf 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java @@ -18,31 +18,20 @@ package org.apache.hadoop.yarn.server.nodemanager.webapp; -import static org.apache.hadoop.yarn.webapp.WebServicesTestUtils.assertResponseStatusCode; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.io.StringReader; -import java.net.HttpURLConnection; -import java.net.URI; -import java.net.URL; -import java.util.List; -import javax.servlet.http.HttpServletResponse; -import javax.ws.rs.core.MediaType; -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; - -import org.apache.hadoop.http.JettyUtils; -import org.apache.hadoop.yarn.webapp.GuiceServletConfig; +import com.google.inject.Guice; +import com.google.inject.servlet.ServletModule; +import com.sun.jersey.api.client.ClientResponse; +import com.sun.jersey.api.client.ClientResponse.Status; +import com.sun.jersey.api.client.GenericType; +import com.sun.jersey.api.client.UniformInterfaceException; +import com.sun.jersey.api.client.WebResource; +import com.sun.jersey.guice.spi.container.servlet.GuiceContainer; +import com.sun.jersey.test.framework.WebAppDescriptor; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.http.JettyUtils; import org.apache.hadoop.util.VersionInfo; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; @@ -50,6 +39,7 @@ import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; +import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.logaggregation.ContainerLogAggregationType; import org.apache.hadoop.yarn.logaggregation.ContainerLogFileInfo; import org.apache.hadoop.yarn.logaggregation.TestContainerLogsUtils; @@ -61,13 +51,22 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; +import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.AssignedGpuDevice; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.server.webapp.YarnWebServiceParams; import org.apache.hadoop.yarn.server.webapp.dao.ContainerLogsInfo; import org.apache.hadoop.yarn.util.YarnVersionInfo; import org.apache.hadoop.yarn.webapp.GenericExceptionHandler; +import org.apache.hadoop.yarn.webapp.GuiceServletConfig; import org.apache.hadoop.yarn.webapp.JerseyTestBase; import org.apache.hadoop.yarn.webapp.WebApp; import org.apache.hadoop.yarn.webapp.WebServicesTestUtils; @@ -83,22 +82,36 @@ import org.w3c.dom.NodeList; import org.xml.sax.InputSource; -import com.google.inject.Guice; -import com.google.inject.servlet.ServletModule; -import com.sun.jersey.api.client.ClientResponse; -import com.sun.jersey.api.client.ClientResponse.Status; -import com.sun.jersey.api.client.GenericType; -import com.sun.jersey.api.client.UniformInterfaceException; -import com.sun.jersey.api.client.WebResource; -import com.sun.jersey.guice.spi.container.servlet.GuiceContainer; -import com.sun.jersey.test.framework.WebAppDescriptor; +import javax.servlet.http.HttpServletResponse; +import javax.ws.rs.core.MediaType; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringReader; +import java.net.HttpURLConnection; +import java.net.URI; +import java.net.URL; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.hadoop.yarn.webapp.WebServicesTestUtils.assertResponseStatusCode; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; /** * Test the nodemanager node info web services api's */ public class TestNMWebServices extends JerseyTestBase { - private static Context nmContext; + private static NodeManager.NMContext nmContext; private static ResourceView resourceView; private static ApplicationACLsManager aclsManager; private static LocalDirsHandlerService dirsHandler; @@ -418,6 +431,112 @@ public void testNMRedirect() { assertFalse(redirectURL.contains(YarnWebServiceParams.NM_ID)); } + @Test + public void testGetNMResourceInfo() + throws YarnException, InterruptedException, JSONException { + ResourcePluginManager rpm = mock(ResourcePluginManager.class); + Map namesToPlugins = new HashMap<>(); + ResourcePlugin mockPlugin1 = mock(ResourcePlugin.class); + NMResourceInfo nmResourceInfo1 = new NMResourceInfo() { + public long a = 1000L; + }; + when(mockPlugin1.getNMResourceInfo()).thenReturn(nmResourceInfo1); + namesToPlugins.put("resource-1", mockPlugin1); + namesToPlugins.put("yarn.io/resource-1", 
mockPlugin1); + ResourcePlugin mockPlugin2 = mock(ResourcePlugin.class); + namesToPlugins.put("resource-2", mockPlugin2); + when(rpm.getNameToPlugins()).thenReturn(namesToPlugins); + + nmContext.setResourcePluginManager(rpm); + + WebResource r = resource(); + ClientResponse response = r.path("ws").path("v1").path("node").path( + "resources").path("resource-2").accept(MediaType.APPLICATION_JSON).get( + ClientResponse.class); + assertEquals(MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + response.getType().toString()); + + // Access resource-2 should fail (empty NMResourceInfo returned). + JSONObject json = response.getEntity(JSONObject.class); + Assert.assertEquals(0, json.length()); + + // Access resource-3 should fail (unknown plugin) + response = r.path("ws").path("v1").path("node").path( + "resources").path("resource-3").accept(MediaType.APPLICATION_JSON).get( + ClientResponse.class); + assertEquals(MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + response.getType().toString()); + json = response.getEntity(JSONObject.class); + Assert.assertEquals(0, json.length()); + + // Access resource-1 should success + response = r.path("ws").path("v1").path("node").path( + "resources").path("resource-1").accept(MediaType.APPLICATION_JSON).get( + ClientResponse.class); + assertEquals(MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + response.getType().toString()); + json = response.getEntity(JSONObject.class); + Assert.assertEquals(1000, json.get("a")); + + // Access resource-1 should success (encoded yarn.io/Fresource-1). + response = r.path("ws").path("v1").path("node").path("resources").path( + "yarn.io%2Fresource-1").accept(MediaType.APPLICATION_JSON).get( + ClientResponse.class); + assertEquals(MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + response.getType().toString()); + json = response.getEntity(JSONObject.class); + Assert.assertEquals(1000, json.get("a")); + } + + private ContainerId createContainerId(int id) { + ApplicationId appId = ApplicationId.newInstance(0, 0); + ApplicationAttemptId appAttemptId = + ApplicationAttemptId.newInstance(appId, 1); + ContainerId containerId = ContainerId.newContainerId(appAttemptId, id); + return containerId; + } + + @Test + public void testGetYarnGpuResourceInfo() + throws YarnException, InterruptedException, JSONException { + ResourcePluginManager rpm = mock(ResourcePluginManager.class); + Map namesToPlugins = new HashMap<>(); + ResourcePlugin mockPlugin1 = mock(ResourcePlugin.class); + GpuDeviceInformation gpuDeviceInformation = new GpuDeviceInformation(); + gpuDeviceInformation.setDriverVersion("1.2.3"); + gpuDeviceInformation.setGpus(Arrays.asList(new PerGpuDeviceInformation())); + NMResourceInfo nmResourceInfo1 = new NMGpuResourceInfo(gpuDeviceInformation, + Arrays.asList(new GpuDevice(1, 1), new GpuDevice(2, 2), + new GpuDevice(3, 3)), Arrays + .asList(new AssignedGpuDevice(2, 2, createContainerId(1)), + new AssignedGpuDevice(3, 3, createContainerId(2)))); + when(mockPlugin1.getNMResourceInfo()).thenReturn(nmResourceInfo1); + namesToPlugins.put("resource-1", mockPlugin1); + namesToPlugins.put("yarn.io/resource-1", mockPlugin1); + ResourcePlugin mockPlugin2 = mock(ResourcePlugin.class); + namesToPlugins.put("resource-2", mockPlugin2); + when(rpm.getNameToPlugins()).thenReturn(namesToPlugins); + + nmContext.setResourcePluginManager(rpm); + + WebResource r = resource(); + ClientResponse response; + JSONObject json; + + // Access resource-1 should success + response = r.path("ws").path("v1").path("node").path( + 
"resources").path("resource-1").accept(MediaType.APPLICATION_JSON).get( + ClientResponse.class); + assertEquals(MediaType.APPLICATION_JSON + "; " + JettyUtils.UTF_8, + response.getType().toString()); + json = response.getEntity(JSONObject.class); + Assert.assertEquals("1.2.3", + json.getJSONObject("gpuDeviceInformation").get("driverVersion")); + Assert.assertEquals(3, json.getJSONArray("totalGpuDevices").length()); + Assert.assertEquals(2, json.getJSONArray("assignedGpuDevices").length()); + Assert.assertEquals(2, json.getJSONArray("assignedGpuDevices").length()); + } + private void testContainerLogs(WebResource r, ContainerId containerId) throws IOException { final String containerIdStr = containerId.toString(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java new file mode 100644 index 00000000000..dc96746cf5d --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/TestGpuDeviceInformationParser.java @@ -0,0 +1,50 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+
+public class TestGpuDeviceInformationParser {
+  @Test
+  public void testParse() throws IOException, YarnException {
+    File f = new File("src/test/resources/nvidia-smi-sample-xml-output");
+    String s = FileUtils.readFileToString(f, "UTF-8");
+
+    GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
+
+    GpuDeviceInformation info = parser.parseXml(s);
+    Assert.assertEquals("375.66", info.getDriverVersion());
+    Assert.assertEquals(2, info.getGpus().size());
+    PerGpuDeviceInformation gpu1 = info.getGpus().get(1);
+    Assert.assertEquals("Tesla P100-PCIE-12GB", gpu1.getProductName());
+    Assert.assertEquals(12193, gpu1.getGpuMemoryUsage().getTotalMemoryMiB());
+    Assert.assertEquals(10.3f,
+        gpu1.getGpuUtilizations().getOverallGpuUtilization(), 1e-6);
+    Assert.assertEquals(34f, gpu1.getTemperature().getCurrentGpuTemp(), 1e-6);
+    Assert.assertEquals(85f, gpu1.getTemperature().getMaxGpuTemp(), 1e-6);
+    Assert.assertEquals(82f, gpu1.getTemperature().getSlowThresholdGpuTemp(),
+        1e-6);
+  }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output
new file mode 100644
index 00000000000..5ccb72265b5
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/resources/nvidia-smi-sample-xml-output
@@ -0,0 +1,547 @@
[547-line sample of nvidia-smi XML output for two Tesla P100-PCIE-12GB GPUs;
the XML markup did not survive extraction, so only the asserted values are
retained here: timestamp Wed Sep 6 21:52:51 2017; driver version 375.66;
minor numbers 0 and 1; 12193 MiB FB memory per GPU; GPU utilization 0 % and
10.3 %; temperatures 31 C / 34 C current, 85 C max, 82 C slow threshold;
supported clocks stepping from 1328 MHz down to 544 MHz. These are the values
asserted by TestGpuDeviceInformationParser above.]
\ No newline at end of file
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java
index 6c0a8541223..3c117bc4b07 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java
@@ -400,14 +400,32 @@ public RefreshQueuesResponse refreshQueues(RefreshQueuesRequest request)
     }
   }
 
+  protected Configuration loadNewConfiguration()
+      throws IOException, YarnException {
+    // Retrieve yarn-site.xml in order to refresh scheduling monitor properties.
+    Configuration conf = getConfiguration(new Configuration(false),
+        YarnConfiguration.YARN_SITE_CONFIGURATION_FILE,
+        YarnConfiguration.RESOURCE_TYPES_CONFIGURATION_FILE);
+    // getConfiguration() calls Configuration#addResource, which in turn calls
+    // Configuration#reloadConfiguration. That reload is lazy: properties are
+    // only re-read when they are next accessed, not immediately after
+    // getConfiguration() returns. Calling Configuration#size() forces
+    // Configuration#getProps to run, which reloads all properties right away.
+    conf.size();
+    return conf;
+  }
+
   @Private
   public void refreshQueues() throws IOException, YarnException {
-    rm.getRMContext().getScheduler().reinitialize(getConfig(),
+    Configuration conf = loadNewConfiguration();
+    rm.getRMContext().getScheduler().reinitialize(conf,
         this.rm.getRMContext());
     // refresh the reservation system
     ReservationSystem rSystem = rm.getRMContext().getReservationSystem();
     if (rSystem != null) {
-      rSystem.reinitialize(getConfig(), rm.getRMContext());
+      rSystem.reinitialize(conf, rm.getRMContext());
     }
   }
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java
index e3e157e451b..3a48bcb2f4a 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java
@@ -58,7 +58,6 @@
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
 import org.apache.hadoop.yarn.exceptions.YarnException;
-import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
 import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
 import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
 import org.apache.hadoop.yarn.server.resourcemanager.RMAppManagerEvent;
@@ -799,7 +798,7 @@ public void killAllAppsInQueue(String queueName)
       writeLock.unlock();
     }
   }
-  
+
   /**
    * Process resource update on a node.
    */
@@ -902,12 +901,12 @@ public void setClusterMaxPriority(Configuration conf)
     LOG.info("Updated the cluste max priority to maxClusterLevelAppPriority = "
         + maxClusterLevelAppPriority);
   }
-  
+
   /**
    * Sanity check increase/decrease request, and return
    * SchedulerContainerResourceChangeRequest according to given
    * UpdateContainerRequest.
-   * 
+   *

    * - Returns non-null value means validation succeeded
    * - Throw exception when any other error happens
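Reviewer note: the lazy-reload behavior that the `loadNewConfiguration` comment above depends on is easy to reproduce in isolation. A minimal sketch, assuming only stock `org.apache.hadoop.conf.Configuration`; the class name and resource file names here are illustrative, not part of this patch:

```java
import org.apache.hadoop.conf.Configuration;

public class EagerReloadSketch {
  public static void main(String[] args) {
    // Skip the default resources; load only what we ask for.
    Configuration conf = new Configuration(false);
    // addResource() marks the properties dirty but re-reads nothing yet.
    conf.addResource("yarn-site.xml");
    conf.addResource("resource-types.xml");
    // Any accessor that goes through getProps() -- size(), get(),
    // iterator() -- triggers the actual (re)load; size() is the cheapest.
    System.out.println("Eagerly loaded " + conf.size() + " properties");
  }
}
```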
@@ -1344,9 +1343,7 @@ public void asyncContainerRelease(RMContainer container) {
   }
 
   /*
-   * Get a Resource object with for the minimum allocation possible. If resource
-   * profiles are enabled then the 'minimum' resource profile will be used. If
-   * they are not enabled, use the minimums specified in the config files.
+   * Get a Resource object with the minimum allocation possible.
    *
    * @return a Resource object with the minimum allocation for the scheduler
    */
@@ -1357,9 +1354,7 @@ public Resource getMinimumAllocation() {
   }
 
   /**
-   * Get a Resource object with for the maximum allocation possible. If resource
-   * profiles are enabled then the 'maximum' resource profile will be used. If
-   * they are not enabled, use the maximums specified in the config files.
+   * Get a Resource object with the maximum allocation possible.
    *
    * @return a Resource object with the maximum allocation for the scheduler
    */
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
index 28b8c6b7eb1..27859b778fc 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java
@@ -143,6 +143,7 @@
 import org.apache.hadoop.yarn.server.utils.Lock;
 import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
 import org.apache.hadoop.yarn.util.resource.Resources;
 
 import com.google.common.annotations.VisibleForTesting;
@@ -442,12 +443,15 @@ public void reinitialize(Configuration newConf, RMContext rmContext)
       validateConf(this.conf);
       try {
         LOG.info("Re-initializing queues...");
-        refreshMaximumAllocation(this.conf.getMaximumAllocation());
+        refreshMaximumAllocation(
+            ResourceUtils.fetchMaximumAllocationFromConfig(this.conf));
         reinitializeQueues(this.conf);
       } catch (Throwable t) {
         this.conf = oldConf;
-        refreshMaximumAllocation(this.conf.getMaximumAllocation());
-        throw new IOException("Failed to re-init queues : "+ t.getMessage(), t);
+        refreshMaximumAllocation(
+            ResourceUtils.fetchMaximumAllocationFromConfig(this.conf));
+        throw new IOException("Failed to re-init queues : " + t.getMessage(),
+            t);
       }
 
       // update lazy preemption
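For context: `ResourceUtils.fetchMaximumAllocationFromConfig` replaces the removed `CapacitySchedulerConfiguration#getMaximumAllocation` (see the next file) so that configured maximums are honored for every registered resource type, not just memory and vcores. A hedged sketch of the idea only; the real ResourceUtils implementation may differ in detail:

```java
// Sketch: build the cluster-wide maximum from configuration, covering all
// registered resource types. Not the actual ResourceUtils code.
static Resource fetchMaximumAllocationSketch(Configuration conf) {
  Resource max = Resource.newInstance(
      conf.getLong(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
          YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB),
      conf.getInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
          YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES));
  for (ResourceInformation ri : ResourceUtils.getResourceTypesArray()) {
    String name = ri.getName();
    if (name.equals(ResourceInformation.MEMORY_URI)
        || name.equals(ResourceInformation.VCORES_URI)) {
      continue; // covered by the classic RM_SCHEDULER_* properties above
    }
    // Extended types get their cap from
    // yarn.resource-types.<name>.maximum-allocation.
    max.setResourceValue(name, conf.getLong(YarnConfiguration.RESOURCE_TYPES
        + "." + name + ".maximum-allocation", ri.getMaximumAllocation()));
  }
  return max;
}
```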
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
index 631f1f886e1..22e0d89690c 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
@@ -49,6 +49,7 @@
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.policy.SchedulableEntity;
 import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
 import org.apache.hadoop.yarn.util.resource.Resources;
 
 import java.util.ArrayList;
@@ -787,16 +788,6 @@ public Resource getMinimumAllocation() {
     return Resources.createResource(minimumMemory, minimumCores);
   }
 
-  public Resource getMaximumAllocation() {
-    int maximumMemory = getInt(
-        YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
-        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB);
-    int maximumCores = getInt(
-        YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
-        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES);
-    return Resources.createResource(maximumMemory, maximumCores);
-  }
-
   @Private
   public Priority getQueuePriority(String queue) {
     String queuePolicyPrefix = getQueuePrefix(queue);
@@ -820,6 +811,8 @@ public void setQueuePriority(String queue, int priority) {
    * @return setting specified per queue else falls back to the cluster setting
    */
   public Resource getMaximumAllocationPerQueue(String queue) {
+    // For now, only the memory and vcores maximum allocations can be
+    // specified per queue.
     String queuePrefix = getQueuePrefix(queue);
     long maxAllocationMbPerQueue = getInt(queuePrefix + MAXIMUM_ALLOCATION_MB,
         (int)UNDEFINED);
@@ -831,7 +824,7 @@ public Resource getMaximumAllocationPerQueue(String queue) {
       LOG.debug("max alloc vcores per queue for " + queue + " is "
           + maxAllocationVcoresPerQueue);
     }
-    Resource clusterMax = getMaximumAllocation();
+    Resource clusterMax = ResourceUtils.fetchMaximumAllocationFromConfig(this);
     if (maxAllocationMbPerQueue == (int)UNDEFINED) {
       LOG.info("max alloc mb per queue for " + queue + " is undefined");
       maxAllocationMbPerQueue = clusterMax.getMemorySize();
@@ -840,8 +833,11 @@ public Resource getMaximumAllocationPerQueue(String queue) {
        LOG.info("max alloc vcore per queue for " + queue + " is undefined");
       maxAllocationVcoresPerQueue = clusterMax.getVirtualCores();
     }
-    Resource result = Resources.createResource(maxAllocationMbPerQueue,
-        maxAllocationVcoresPerQueue);
+    // Copy from clusterMax, then overwrite the per-queue maximum memory and
+    // vcore allocation; all other resource types keep the cluster maximum.
+    Resource result = Resources.clone(clusterMax);
+    result.setMemorySize(maxAllocationMbPerQueue);
+    result.setVirtualCores(maxAllocationVcoresPerQueue);
     if (maxAllocationMbPerQueue > clusterMax.getMemorySize()
         || maxAllocationVcoresPerQueue > clusterMax.getVirtualCores()) {
       throw new IllegalArgumentException(
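The clone-and-overwrite above is the behavioral fix in this hunk: the old `Resources.createResource(mb, vcores)` construction silently dropped every extended resource type from the per-queue maximum. A small illustration of the difference, assuming the `TestUtils.createResource` helper added later in this patch and purely illustrative values:

```java
// Cluster-wide maximum: 8GB / 4 vcores / 4 GPUs (illustrative).
Resource clusterMax = TestUtils.createResource(8192, 4,
    ImmutableMap.of(ResourceInformation.GPU_URI, 4));

// Old behavior: building from scratch loses the GPU component (gpu=0).
Resource oldStyle = Resources.createResource(6144, 2);

// New behavior: clone, then overwrite only memory/vcores; gpu=4 survives.
Resource newStyle = Resources.clone(clusterMax);
newStyle.setMemorySize(6144);
newStyle.setVirtualCores(2);
```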
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
index 4ef26e639ac..d4fe1c91a19 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java
@@ -527,8 +527,8 @@ public void reinitialize(
       // since we have already told running AM's the size
       Resource oldMax = getMaximumAllocation();
       Resource newMax = newlyParsedLeafQueue.getMaximumAllocation();
-      if (newMax.getMemorySize() < oldMax.getMemorySize()
-          || newMax.getVirtualCores() < oldMax.getVirtualCores()) {
+
+      if (!Resources.fitsIn(oldMax, newMax)) {
         throw new IOException("Trying to reinitialize " + getQueuePath()
             + " the maximum allocation size can not be decreased!"
             + " Current setting: " + oldMax + ", trying to set it to: "
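`Resources.fitsIn(smaller, bigger)` is true only when every component of `smaller` is less than or equal to the corresponding component of `bigger`, so this guard now also rejects a decrease in any extended resource type, not just memory or vcores. A hedged sketch of the semantics (helper and values illustrative):

```java
// oldMax advertises 2 GPUs, newMax only 1; memory/vcores are unchanged.
Resource oldMax = TestUtils.createResource(4096, 4,
    ImmutableMap.of(ResourceInformation.GPU_URI, 2));
Resource newMax = TestUtils.createResource(4096, 4,
    ImmutableMap.of(ResourceInformation.GPU_URI, 1));

// false: the GPU component shrank, so reinitialize() must throw.
boolean allowed = Resources.fitsIn(oldMax, newMax);
```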
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
index 45d94919938..b1897547e01 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java
@@ -47,14 +47,14 @@
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 import org.apache.hadoop.yarn.util.Records;
 import org.apache.hadoop.yarn.util.YarnVersionInfo;
+import org.apache.hadoop.yarn.util.resource.Resources;
 import org.eclipse.jetty.util.log.Log;
 
 public class MockNM {
 
   private int responseId;
   private NodeId nodeId;
-  private long memory;
-  private int vCores;
+  private Resource capability;
   private ResourceTrackerService resourceTracker;
   private int httpPort = 2;
   private MasterKey currentContainerTokenMasterKey;
@@ -75,13 +75,25 @@ public MockNM(String nodeIdStr, int memory, ResourceTrackerService resourceTrack
 
   public MockNM(String nodeIdStr, int memory, int vcores,
       ResourceTrackerService resourceTracker) {
-    this(nodeIdStr, memory, vcores, resourceTracker, YarnVersionInfo.getVersion());
+    this(nodeIdStr, memory, vcores, resourceTracker,
+        YarnVersionInfo.getVersion());
   }
 
   public MockNM(String nodeIdStr, int memory, int vcores,
       ResourceTrackerService resourceTracker, String version) {
-    this.memory = memory;
-    this.vCores = vcores;
+    this(nodeIdStr, Resource.newInstance(memory, vcores), resourceTracker,
+        version);
+  }
+
+  public MockNM(String nodeIdStr, Resource capability,
+      ResourceTrackerService resourceTracker) {
+    this(nodeIdStr, capability, resourceTracker,
+        YarnVersionInfo.getVersion());
+  }
+
+  public MockNM(String nodeIdStr, Resource capability,
+      ResourceTrackerService resourceTracker, String version) {
+    this.capability = capability;
     this.resourceTracker = resourceTracker;
     this.version = version;
     String[] splits = nodeIdStr.split(":");
@@ -146,8 +158,7 @@ public RegisterNodeManagerResponse registerNode(
         RegisterNodeManagerRequest.class);
     req.setNodeId(nodeId);
     req.setHttpPort(httpPort);
-    Resource resource = BuilderUtils.newResource(memory, vCores);
-    req.setResource(resource);
+    req.setResource(capability);
     req.setContainerStatuses(containerReports);
     req.setNMVersion(version);
     req.setRunningApplications(runningApplications);
@@ -158,8 +169,7 @@ public RegisterNodeManagerResponse registerNode(
     this.currentNMTokenMasterKey = registrationResponse.getNMTokenMasterKey();
     Resource newResource = registrationResponse.getResource();
     if (newResource != null) {
-      memory = (int) newResource.getMemorySize();
-      vCores = newResource.getVirtualCores();
+      capability = Resources.clone(newResource);
     }
     containerStats.clear();
     if (containerReports != null) {
@@ -185,7 +195,7 @@ public NodeHeartbeatResponse nodeHeartbeat(ApplicationAttemptId attemptId,
       long containerId, ContainerState containerState) throws Exception {
     ContainerStatus containerStatus = BuilderUtils.newContainerStatus(
         BuilderUtils.newContainerId(attemptId, containerId), containerState,
-        "Success", 0, BuilderUtils.newResource(memory, vCores));
+        "Success", 0, capability);
     ArrayList containerStatusList =
         new ArrayList(1);
     containerStatusList.add(containerStatus);
@@ -266,19 +276,22 @@ public NodeHeartbeatResponse nodeHeartbeat(List updatedStats,
 
     Resource newResource = heartbeatResponse.getResource();
     if (newResource != null) {
-      memory = newResource.getMemorySize();
-      vCores = newResource.getVirtualCores();
+      capability = Resources.clone(newResource);
     }
 
     return heartbeatResponse;
   }
 
   public long getMemory() {
-    return memory;
+    return capability.getMemorySize();
   }
 
   public int getvCores() {
-    return vCores;
+    return capability.getVirtualCores();
+  }
+
+  public Resource getCapability() {
+    return capability;
   }
 
   public String getVersion() {
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java
index 1da93d15352..40ab17875e7 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java
@@ -849,6 +849,15 @@ public MockNM registerNode(String nodeIdStr, int memory, int vCores,
     return nm;
   }
 
+  public MockNM registerNode(String nodeIdStr, Resource nodeCapability)
+      throws Exception {
+    MockNM nm = new MockNM(nodeIdStr, nodeCapability,
+        getResourceTrackerService());
+    nm.registerNode();
+    drainEventsImplicitly();
+    return nm;
+  }
+
   public void sendNodeStarted(MockNM nm) throws Exception {
     RMNodeImpl node = (RMNodeImpl) getRMContext().getRMNodes().get(
         nm.getNodeId());
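With the `Resource`-based overload above, tests can register a node that advertises extended resource types in one call; something along these lines, where the node address and values are illustrative:

```java
// Register a 10GB / 4-vcore node that also advertises 4 GPUs.
MockNM nm1 = rm.registerNode("h1:1234", TestUtils.createResource(10 * 1024,
    4, ImmutableMap.of(ResourceInformation.GPU_URI, 4)));
nm1.nodeHeartbeat(true);
```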
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/RMHATestBase.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/RMHATestBase.java
index 4ac4fc306b5..2b36ed08995 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/RMHATestBase.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/RMHATestBase.java
@@ -105,9 +105,34 @@ protected MockAM launchAM(RMApp app, MockRM rm, MockNM nm)
     return am;
   }
 
+  private MockRM initMockRMWithOldConf(Configuration confForRM1) {
+    return new MockRM(confForRM1, null, false, false) {
+      @Override
+      protected AdminService createAdminService() {
+        return new AdminService(this) {
+          @Override
+          protected void startServer() {
+            // override to not start rpc handler
+          }
+
+          @Override
+          protected void stopServer() {
+            // don't do anything
+          }
+
+          @Override
+          protected Configuration loadNewConfiguration()
+              throws IOException, YarnException {
+            return confForRM1;
+          }
+        };
+      }
+    };
+  }
+
   protected void startRMs() throws IOException {
-    rm1 = new MockRM(confForRM1, null, false, false);
-    rm2 = new MockRM(confForRM2, null, false, false);
+    rm1 = initMockRMWithOldConf(confForRM1);
+    rm2 = initMockRMWithOldConf(confForRM2);
+
     startRMs(rm1, confForRM1, rm2, confForRM2);
   }
 
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
index b7f69fc5462..caf16fb2137 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
@@ -156,6 +156,7 @@
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
 import org.apache.hadoop.yarn.util.resource.Resources;
 import org.apache.log4j.Level;
 import org.apache.log4j.LogManager;
@@ -2943,7 +2944,7 @@ public void testRefreshQueuesMaxAllocationRefresh() throws Exception {
         conf.getMaximumAllocationPerQueue(A1).getMemorySize());
     assertEquals("max allocation",
         YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
-        conf.getMaximumAllocation().getMemorySize());
+        ResourceUtils.fetchMaximumAllocationFromConfig(conf).getMemorySize());
 
     CSQueue rootQueue = cs.getRootQueue();
     CSQueue queueA = findQueue(rootQueue, A);
@@ -3044,10 +3045,10 @@ public void testRefreshQueuesMaxAllocationRefreshLarger() throws Exception {
         conf.getMaximumAllocationPerQueue(A1).getVirtualCores());
     assertEquals("cluster max allocation MB",
         YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
-        conf.getMaximumAllocation().getMemorySize());
+        ResourceUtils.fetchMaximumAllocationFromConfig(conf).getMemorySize());
     assertEquals("cluster max allocation vcores",
         YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
-        conf.getMaximumAllocation().getVirtualCores());
+        ResourceUtils.fetchMaximumAllocationFromConfig(conf).getVirtualCores());
 
     CSQueue rootQueue = cs.getRootQueue();
     CSQueue queueA = findQueue(rootQueue, A);
@@ -3066,10 +3067,10 @@ public void testRefreshQueuesMaxAllocationRefreshLarger() throws Exception {
         conf.getMaximumAllocationPerQueue(A1).getVirtualCores());
     assertEquals("max allocation MB cluster",
         YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
-        conf.getMaximumAllocation().getMemorySize());
+        ResourceUtils.fetchMaximumAllocationFromConfig(conf).getMemorySize());
     assertEquals("max allocation vcores cluster",
         YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
-        conf.getMaximumAllocation().getVirtualCores());
+        ResourceUtils.fetchMaximumAllocationFromConfig(conf).getVirtualCores());
     assertEquals("queue max allocation MB", 6144,
         ((LeafQueue) queueA1).getMaximumAllocation().getMemorySize());
     assertEquals("queue max allocation vcores", 3,
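These assertions exercise the per-type `maximum-allocation` knob introduced alongside resource types. For reference, the same limit the new test below sets would be expressed programmatically like this; the resource name and value are illustrative, and the property key mirrors the one used in the tests:

```java
YarnConfiguration conf = new YarnConfiguration();
// Register the extended type, then cap it at 3333 units per allocation.
conf.set(YarnConfiguration.RESOURCE_TYPES, "res1");
conf.setInt(YarnConfiguration.RESOURCE_TYPES + ".res1.maximum-allocation",
    3333);
```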
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerWithMultiResourceTypes.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerWithMultiResourceTypes.java
new file mode 100644
index 00000000000..1a30e1da359
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerWithMultiResourceTypes.java
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
+import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.api.records.ResourceRequest;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
+import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
+import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
+import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
+import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
+import org.apache.hadoop.yarn.server.utils.BuilderUtils;
+import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Test Capacity Scheduler with multiple resource types.
+ */
+public class TestCapacitySchedulerWithMultiResourceTypes {
+  private static String RESOURCE_1 = "res1";
+  private final int GB = 1024;
+
+  @Test
+  public void testMaximumAllocationRefreshWithMultipleResourceTypes()
+      throws Exception {
+
+    // Initialize resource map
+    Map<String, ResourceInformation> riMap = new HashMap<>();
+
+    // Initialize mandatory resources
+    ResourceInformation memory = ResourceInformation.newInstance(
+        ResourceInformation.MEMORY_MB.getName(),
+        ResourceInformation.MEMORY_MB.getUnits(),
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB,
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB);
+    ResourceInformation vcores = ResourceInformation.newInstance(
+        ResourceInformation.VCORES.getName(),
+        ResourceInformation.VCORES.getUnits(),
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES,
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES);
+    riMap.put(ResourceInformation.MEMORY_URI, memory);
+    riMap.put(ResourceInformation.VCORES_URI, vcores);
+    riMap.put(RESOURCE_1, ResourceInformation.newInstance(RESOURCE_1, "", 0,
+        ResourceTypes.COUNTABLE, 0, 3333L));
+
+    ResourceUtils.initializeResourcesFromResourceInformationMap(riMap);
+
+    CapacitySchedulerConfiguration csconf =
+        new CapacitySchedulerConfiguration();
+    csconf.setMaximumApplicationMasterResourcePerQueuePercent("root",
+        100.0f);
+    csconf.setMaximumAMResourcePercentPerPartition("root", "", 100.0f);
+    csconf.setMaximumApplicationMasterResourcePerQueuePercent("root.default",
+        100.0f);
+    csconf.setMaximumAMResourcePercentPerPartition("root.default", "",
+        100.0f);
+    csconf.setResourceComparator(DominantResourceCalculator.class);
+    csconf.set(YarnConfiguration.RESOURCE_TYPES, RESOURCE_1);
+    csconf.setInt(YarnConfiguration.RESOURCE_TYPES + "." + RESOURCE_1
+        + ".maximum-allocation", 3333);
+
+    YarnConfiguration conf = new YarnConfiguration(csconf);
+    // Don't reset resource types since we have already configured resource
+    // types
+    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
+        ResourceScheduler.class);
+
+    MockRM rm = new MockRM(conf);
+    rm.start();
+
+    CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();
+    Assert.assertEquals(3333L,
+        cs.getMaximumResourceCapability().getResourceValue(RESOURCE_1));
+    Assert.assertEquals(3333L,
+        cs.getMaximumAllocation().getResourceValue(RESOURCE_1));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
+        cs.getMaximumResourceCapability()
+            .getResourceValue(ResourceInformation.MEMORY_URI));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
+        cs.getMaximumAllocation()
+            .getResourceValue(ResourceInformation.MEMORY_URI));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
+        cs.getMaximumResourceCapability()
+            .getResourceValue(ResourceInformation.VCORES_URI));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
+        cs.getMaximumAllocation()
+            .getResourceValue(ResourceInformation.VCORES_URI));
+
+    // Set RES_1 to 3332 (less than 3333) and refresh CS; a failure is
+    // expected.
+    csconf.set(YarnConfiguration.RESOURCE_TYPES, RESOURCE_1);
+    csconf.setInt(YarnConfiguration.RESOURCE_TYPES + "." + RESOURCE_1
+        + ".maximum-allocation", 3332);
+
+    boolean exception = false;
+    try {
+      cs.reinitialize(csconf, rm.getRMContext());
+    } catch (IOException e) {
+      exception = true;
+    }
+
+    Assert.assertTrue("Should have exception in CS", exception);
+
+    // Maximum allocation won't be updated
+    Assert.assertEquals(3333L,
+        cs.getMaximumResourceCapability().getResourceValue(RESOURCE_1));
+    Assert.assertEquals(3333L,
+        cs.getMaximumAllocation().getResourceValue(RESOURCE_1));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
+        cs.getMaximumResourceCapability()
+            .getResourceValue(ResourceInformation.MEMORY_URI));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
+        cs.getMaximumAllocation()
+            .getResourceValue(ResourceInformation.MEMORY_URI));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
+        cs.getMaximumResourceCapability()
+            .getResourceValue(ResourceInformation.VCORES_URI));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
+        cs.getMaximumAllocation()
+            .getResourceValue(ResourceInformation.VCORES_URI));
+
+    // Set RES_1 to 3334 and refresh CS; this should succeed.
+    csconf.set(YarnConfiguration.RESOURCE_TYPES, RESOURCE_1);
+    csconf.setInt(YarnConfiguration.RESOURCE_TYPES + "." + RESOURCE_1
+        + ".maximum-allocation", 3334);
+    cs.reinitialize(csconf, rm.getRMContext());
+
+    // Maximum allocation will be updated
+    Assert.assertEquals(3334,
+        cs.getMaximumResourceCapability().getResourceValue(RESOURCE_1));
+
+    // Since we haven't updated the real configuration of ResourceUtils,
+    // cs.getMaximumAllocation won't be updated.
+    Assert.assertEquals(3333,
+        cs.getMaximumAllocation().getResourceValue(RESOURCE_1));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
+        cs.getMaximumResourceCapability()
+            .getResourceValue(ResourceInformation.MEMORY_URI));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB,
+        cs.getMaximumAllocation()
+            .getResourceValue(ResourceInformation.MEMORY_URI));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
+        cs.getMaximumResourceCapability()
+            .getResourceValue(ResourceInformation.VCORES_URI));
+    Assert.assertEquals(
+        YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES,
+        cs.getMaximumAllocation()
+            .getResourceValue(ResourceInformation.VCORES_URI));
+
+    rm.close();
+  }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java
index 4bc5127e9da..8036a409df8 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java
@@ -18,16 +18,7 @@
 package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
 
-import static org.mockito.Matchers.any;
-import static org.mockito.Mockito.doReturn;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.spy;
-import static org.mockito.Mockito.when;
-
-import java.io.IOException;
-import java.util.Map;
-import java.util.Set;
-
+import com.google.common.collect.Sets;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -38,6 +29,7 @@
 import org.apache.hadoop.yarn.api.records.NodeId;
 import org.apache.hadoop.yarn.api.records.Priority;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
 import org.apache.hadoop.yarn.event.Dispatcher;
 import org.apache.hadoop.yarn.event.Event;
@@ -53,21 +45,28 @@
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAllocationExpirer;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
-import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
 import org.apache.hadoop.yarn.server.resourcemanager.security.AMRMTokenSecretManager;
 import org.apache.hadoop.yarn.server.resourcemanager.security.ClientToAMTokenSecretManagerInRM;
 import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM;
 import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager;
+import org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.Resources;
 import org.mockito.invocation.InvocationOnMock;
 import org.mockito.stubbing.Answer;
 
-import com.google.common.collect.Sets;
-import org.apache.hadoop.yarn.event.Event;
+import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.doReturn;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.when;
 
 public class TestUtils {
   private static final Log LOG = LogFactory.getLog(TestUtils.class);
@@ -457,4 +456,21 @@ public FiCaSchedulerApp getApplicationAttempt(
     cs.submitResourceCommitRequest(clusterResource, csAssignment);
   }
+
+  /**
+   * An easy way to create resources other than memory and vcores for tests.
+   * @param memory memory
+   * @param vcores vcores
+   * @param nameToValues resource types other than memory and vcores.
+   * @return created resource
+   */
+  public static Resource createResource(long memory, int vcores,
+      Map<String, Integer> nameToValues) {
+    Resource res = Resource.newInstance(memory, vcores);
+    for (Map.Entry<String, Integer> entry : nameToValues.entrySet()) {
+      res.setResourceInformation(entry.getKey(), ResourceInformation
+          .newInstance(entry.getKey(), "", entry.getValue()));
+    }
+    return res;
+  }
 }
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md
new file mode 100644
index 00000000000..f6000e7c35e
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/UsingGpus.md
@@ -0,0 +1,230 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+# Using GPU On YARN
+
+# Prerequisites
+
+- As of now, only Nvidia GPUs are supported by YARN.
+- YARN NodeManagers have to be pre-installed with Nvidia drivers.
+- When Docker is used as the container runtime, nvidia-docker 1.0 needs to be
+  installed (this is the nvidia-docker version currently supported by YARN).
+
+# Configs
+
+## GPU scheduling
+
+In `resource-types.xml`, add the following property:
+
+```
+<configuration>
+  <property>
+     <name>yarn.resource-types</name>
+     <value>yarn.io/gpu</value>
+  </property>
+</configuration>
+```
+
+In `yarn-site.xml`
+
+`DominantResourceCalculator` MUST be configured to enable GPU
+scheduling/isolation.
+
+For `Capacity Scheduler`, use the following property to configure
+`DominantResourceCalculator` (in `capacity-scheduler.xml`):
+
+| Property | Default value |
+| --- | --- |
+| yarn.scheduler.capacity.resource-calculator | org.apache.hadoop.yarn.util.resource.DominantResourceCalculator |
+
+
+## GPU Isolation
+
+### In `yarn-site.xml`
+
+```
+<property>
+  <name>yarn.nodemanager.resource-plugins</name>
+  <value>yarn.io/gpu</value>
+</property>
+```
+
+This enables the GPU isolation module on the NodeManager side.
+
+By default, YARN will automatically detect and configure GPUs when the above
+config is set. The following configs need to be set in `yarn-site.xml` only
+if the admin has specialized requirements.
+
+**1) Allowed GPU Devices**
+
+| Property | Default value |
+| --- | --- |
+| yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices | auto |
+
+  Specifies the GPU devices which can be managed by the YARN NodeManager
+  (comma separated). The number of GPU devices will be reported to the RM to
+  make scheduling decisions. Set to auto (default) to let YARN automatically
+  discover GPU resources from the system.
+
+  Manually specify GPU devices if auto-detection fails or if the admin only
+  wants a subset of the GPU devices to be managed by YARN. A GPU device is
+  identified by its minor device number and index. A common approach to get
+  the minor device number of a GPU is to use `nvidia-smi -q` and search for
+  the `Minor Number` output.
+
+  When minor numbers are specified manually, the admin needs to include the
+  indices of the GPUs as well, in the format
+  `index:minor_number[,index:minor_number...]`. An example of manual
+  specification is `0:0,1:1,2:2,3:4`, which lets the YARN NodeManager manage
+  the GPU devices with indices `0/1/2/3` and minor numbers `0/1/2/4`.
+
+**2) Executable to discover GPUs**
+
+| Property | value |
+| --- | --- |
+| yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables | /absolute/path/to/nvidia-smi |
+
+When `yarn.nodemanager.resource.gpu.allowed-gpu-devices=auto` is specified,
+the YARN NodeManager needs to run a GPU discovery binary (currently only
+`nvidia-smi` is supported) to get GPU-related information. When the value is
+empty (default), the YARN NodeManager will try to locate the discovery
+executable itself. An example of the config value is:
+`/usr/local/bin/nvidia-smi`
+
+**3) Docker Plugin Related Configs**
+
+The following configs can be customized when the user needs to run GPU
+applications inside a Docker container. They're not required if the admin
+follows the default installation/configuration of `nvidia-docker`.
+
+| Property | Default value |
+| --- | --- |
+| yarn.nodemanager.resource-plugins.gpu.docker-plugin | nvidia-docker-v1 |
+
+Specifies the Docker command plugin for GPU. By default it uses Nvidia
+Docker V1.0.
+
+| Property | Default value |
+| --- | --- |
+| yarn.nodemanager.resource-plugins.gpu.docker-plugin.nvidia-docker-v1.endpoint | http://localhost:3476/v1.0/docker/cli |
+
+Specifies the endpoint of `nvidia-docker-plugin`. See
+https://github.com/NVIDIA/nvidia-docker/wiki for more details.
+
+**4) CGroups mount**
+
+GPU isolation uses the CGroups
+[devices controller](https://www.kernel.org/doc/Documentation/cgroup-v1/devices.txt)
+to do per-GPU device isolation. The following config should be added to
+`yarn-site.xml` to automatically mount CGroups sub-devices; otherwise the
+admin has to manually create a devices subfolder in order to use this
+feature.
+
+| Property | Default value |
+| --- | --- |
+| yarn.nodemanager.linux-container-executor.cgroups.mount | true |
+
+
+### In `container-executor.cfg`
+
+In general, the following config needs to be added to
+`container-executor.cfg`:
+
+```
+[gpu]
+module.enabled=true
+```
+
+When the user needs to run GPU applications in a non-Docker environment:
+
+```
+[cgroups]
+# This should be same as yarn.nodemanager.linux-container-executor.cgroups.mount-path inside yarn-site.xml
+root=/sys/fs/cgroup
+# This should be same as yarn.nodemanager.linux-container-executor.cgroups.hierarchy inside yarn-site.xml
+yarn-hierarchy=yarn
+```
+
+When the user needs to run GPU applications in a Docker environment:
+
+**1) Add GPU related devices to the docker section:**
+
+Values are separated by commas; you can get them by running `ls /dev/nvidia*`:
+
+```
+[docker]
+docker.allowed.devices=/dev/nvidiactl,/dev/nvidia-uvm,/dev/nvidia-uvm-tools,/dev/nvidia1,/dev/nvidia0
+```
+
+**2) Add `nvidia-docker` to the volume-driver whitelist:**
+
+```
+[docker]
+...
+docker.allowed.volume-drivers
+```
+
+**3) Add `nvidia_driver_<version>` to the read-only mounts whitelist:**
+
+```
+[docker]
+...
+docker.allowed.ro-mounts=nvidia_driver_375.66
+```
+
+# Use it
+
+## Distributed-shell + GPU
+
+Distributed shell currently supports specifying additional resource types
+other than memory and vcores.
+
+### Distributed-shell + GPU without Docker
+
+Run distributed shell without using a Docker container (requests 2 tasks,
+each with 3GB memory, 1 vcore and 2 GPU device resources):
+
+```
+yarn jar <path/to/hadoop-yarn-applications-distributedshell.jar> \
+  -jar <path/to/hadoop-yarn-applications-distributedshell.jar> \
+  -shell_command /usr/local/nvidia/bin/nvidia-smi \
+  -container_resources memory-mb=3072,vcores=1,yarn.io/gpu=2 \
+  -num_containers 2
+```
+
+You should be able to see output like the following for the launched
+container task:
+
+```
+Tue Dec  5 22:21:47 2017
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 375.66                 Driver Version: 375.66                    |
+|-------------------------------+----------------------+----------------------+
+| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+|===============================+======================+======================|
+|   0  Tesla P100-PCIE...  Off  | 0000:04:00.0     Off |                    0 |
+| N/A   30C    P0    24W / 250W |      0MiB / 12193MiB |      0%      Default |
++-------------------------------+----------------------+----------------------+
+|   1  Tesla P100-PCIE...  Off  | 0000:82:00.0     Off |                    0 |
+| N/A   34C    P0    25W / 250W |      0MiB / 12193MiB |      0%      Default |
++-------------------------------+----------------------+----------------------+
+
++-----------------------------------------------------------------------------+
+| Processes:                                                       GPU Memory |
+|  GPU       PID  Type  Process name                               Usage      |
+|=============================================================================|
+|  No running processes found                                                 |
++-----------------------------------------------------------------------------+
+```
+
+### Distributed-shell + GPU with Docker
+
+You can also run distributed shell with a Docker container.
+`YARN_CONTAINER_RUNTIME_TYPE`/`YARN_CONTAINER_RUNTIME_DOCKER_IMAGE` must be
+specified to use a Docker container:
+
+```
+yarn jar <path/to/hadoop-yarn-applications-distributedshell.jar> \
+  -jar <path/to/hadoop-yarn-applications-distributedshell.jar> \
+  -shell_env YARN_CONTAINER_RUNTIME_TYPE=docker \
+  -shell_env YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=<docker-image-name> \
+  -shell_command nvidia-smi \
+  -container_resources memory-mb=3072,vcores=1,yarn.io/gpu=2 \
+  -num_containers 2
+```
\ No newline at end of file
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/adapters/yarn-nm-gpu.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/adapters/yarn-nm-gpu.js
new file mode 100644
index 00000000000..bf6307a664c
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/adapters/yarn-nm-gpu.js
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +import AbstractAdapter from './abstract'; + +export default AbstractAdapter.extend({ + + address: "localBaseAddress", + restNameSpace: "node", + serverName: "NM", + + urlForFindRecord(id/*, modelName, snapshot*/) { + var url = this._buildURL(); + url = url.replace("{nodeAddress}", id) + "/resources/yarn.io%2Fgpu"; + return url; + } + +}); \ No newline at end of file diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/components/donut-chart.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/components/donut-chart.js index b1e6ecf5076..03b633682f7 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/components/donut-chart.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/components/donut-chart.js @@ -20,6 +20,7 @@ import Ember from 'ember'; import BaseChartComponent from 'yarn-ui/components/base-chart-component'; import ColorUtils from 'yarn-ui/utils/color-utils'; import Converter from 'yarn-ui/utils/converter'; +import {Entities} from 'yarn-ui/constants'; export default BaseChartComponent.extend({ /* @@ -41,8 +42,10 @@ export default BaseChartComponent.extend({ } if (!middleValue) { - if (this.get("type") === "memory") { + if (this.get(Entities.Type) === Entities.Memory) { middleValue = Converter.memoryToSimpliedUnit(total); + } else if (this.get(Entities.Type) === Entities.Resource) { + middleValue = Converter.resourceToSimplifiedUnit(total, this.get(Entities.Unit)); } else { middleValue = total; } @@ -151,7 +154,10 @@ export default BaseChartComponent.extend({ var value = d.value; if (this.get("type") === "memory") { value = Converter.memoryToSimpliedUnit(value); + } else if (this.get("type") === "resource") { + value = Converter.resourceToSimplifiedUnit(value, this.get(Entities.Unit)); } + return d.label + ' = ' + value + suffix; }.bind(this)); } @@ -185,10 +191,18 @@ export default BaseChartComponent.extend({ } this.renderDonutChart(this.get("data"), this.get("title"), this.get("showLabels"), - this.get("middleLabel"), this.get("middleValue")); + this.get("middleLabel"), this.get("middleValue"), this.get("suffix")); }, didInsertElement: function() { + // When parentIdPrefix is specified, use parentidPrefix + name as new parent + // id + if (this.get("parentIdPrefix")) { + var newParentId = this.get("parentIdPrefix") + this.get("id"); + this.set("parentId", newParentId); + console.log(newParentId); + } + this.initChart(); this.draw(); }, diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/components/gpu-donut-chart.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/components/gpu-donut-chart.js new file mode 100644 index 00000000000..fa5ca8ac1dc --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/components/gpu-donut-chart.js @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import DonutChart from 'yarn-ui/components/donut-chart'; +import ColorUtils from 'yarn-ui/utils/color-utils'; + +export default DonutChart.extend({ + draw: function() { + // Construct data + var data = []; + if (this.get("gpu-render-type") === "gpu-memory") { + data.push({ + label: "Used", + value: parseFloat(this.get("gpuInfo").gpuMemoryUsage.usedMemoryMiB), + }); + data.push({ + label: "Available", + value: parseFloat(this.get("gpuInfo").gpuMemoryUsage.availMemoryMiB) + }); + } else if (this.get("gpu-render-type") === "gpu-utilization") { + var utilization = parseFloat(this.get("gpuInfo").gpuUtilizations.overallGpuUtilization); + data.push({ + label: "Utilized", + value: utilization, + }); + data.push({ + label: "Available", + value: 100 - utilization + }); + } + + var colorTargets = this.get("colorTargets"); + if (colorTargets) { + var colorTargetReverse = Boolean(this.get("colorTargetReverse")); + var targets = colorTargets.split(" "); + this.colors = ColorUtils.getColors(data.length, targets, colorTargetReverse); + } + + this.renderDonutChart(data, this.get("title"), this.get("showLabels"), + this.get("middleLabel"), this.get("middleValue"), this.get("suffix")); + }, + + didInsertElement: function() { + // ParentId includes minorNumber + var newParentId = this.get("parentId") + this.get("gpuInfo").minorNumber; + this.set("parentId", newParentId); + + this.initChart(); + this.draw(); + }, +}); \ No newline at end of file diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/constants.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/constants.js index d2937a0441f..29ad4bc2d93 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/constants.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/constants.js @@ -22,3 +22,16 @@ export default { PARAM_SEPARATOR: '!', }; + +const BASE_UNIT = 1024 + +export const Type = 'type'; +export const Memory = 'memory'; +export const Resource = 'resource'; +export const Unit = 'unit'; +export const Entities = { + Type: 'type', + Memory:'memory', + Resource: 'resource', + Unit: 'unit' +} \ No newline at end of file diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/controllers/yarn-nodes/table.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/controllers/yarn-nodes/table.js index 3fae5961f87..f4bd5788433 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/controllers/yarn-nodes/table.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/controllers/yarn-nodes/table.js @@ -60,7 +60,7 @@ export default Ember.Controller.extend({ getCellContent: function(row) { var node_id = row.get("id"), node_addr = row.get("nodeHTTPAddress"), - href = `#/yarn-node/${node_id}/${node_addr}`; + href = `#/yarn-node/${node_id}/${node_addr}/info`; switch(row.get("nodeState")) { case "SHUTDOWN": case "LOST": diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/models/cluster-metric.js 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/models/cluster-metric.js index dcc0c2997d8..d9a5eefd769 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/models/cluster-metric.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/models/cluster-metric.js @@ -43,6 +43,8 @@ export default DS.Model.extend({ decommissionedNodes: DS.attr('number'), rebootedNodes: DS.attr('number'), activeNodes: DS.attr('number'), + totalUsedResourcesAcrossPartition: DS.attr('object'), + totalClusterResourcesAcrossPartition: DS.attr('object'), getFinishedAppsDataForDonutChart: function() { var arr = []; @@ -135,4 +137,71 @@ export default DS.Model.extend({ return arr; }.property("allocatedVirtualCores", "reservedVirtualCores", "availableVirtualCores"), + + getResourceTypes: function() { + var types = []; + if (this.get("totalClusterResourcesAcrossPartition")) { + + console.log(types); + } + }.property("totalClusterResourcesAcrossPartition"), + + /* + * Returned format + * [ + * { + * name: + * unit: + * [ + * { + * label:

Rack: " + this.get("rack") + '

' + "

Host: " + this.get("nodeHostName") + '

'; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/router.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/router.js index 901314289f2..1a01b863756 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/router.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/router.js @@ -37,7 +37,10 @@ Router.map(function() { this.route('apps'); }); this.route('yarn-nodes-heatmap'); - this.route('yarn-node', { path: '/yarn-node/:node_id/:node_addr' }); + this.route('yarn-node', { path: '/yarn-node/:node_id/:node_addr' }, function() { + this.route("info"); + this.route("yarn-nm-gpu"); + }); this.route('yarn-node-apps', { path: '/yarn-node-apps/:node_id/:node_addr' }); this.route('yarn-node-app', { path: '/yarn-node-app/:node_id/:node_addr/:app_id' }); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/cluster-overview.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/cluster-overview.js index d03ea0daa2d..254ece43479 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/cluster-overview.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/cluster-overview.js @@ -31,7 +31,7 @@ export default AbstractRoute.extend({ queues: this.store.query("yarn-queue.yarn-queue", {}).then((model) => { let type = model.get('firstObject').get('type'); return this.store.query("yarn-queue." + type + "-queue", {}); - }), + }) }); }, diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/yarn-node.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/yarn-node.js index 3d548460d4f..7ce615c83fe 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/yarn-node.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/yarn-node.js @@ -25,6 +25,7 @@ export default AbstractRoute.extend({ // Fetches data from both NM and RM. RM is queried to get node usage info. return Ember.RSVP.hash({ nodeInfo: { id: param.node_id, addr: param.node_addr }, + nmGpuInfo: this.store.findRecord('yarn-nm-gpu', param.node_addr, {reload:true}), node: this.store.findRecord('yarn-node', param.node_addr, {reload: true}), rmNode: this.store.findRecord('yarn-rm-node', param.node_id, {reload: true}) }); @@ -33,5 +34,6 @@ export default AbstractRoute.extend({ unloadAll() { this.store.unloadAll('yarn-node'); this.store.unloadAll('yarn-rm-node'); + this.store.unloadAll('yarn-nm-gpu'); } }); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/yarn-node/yarn-nm-gpu.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/yarn-node/yarn-nm-gpu.js new file mode 100644 index 00000000000..38ae5d15f4c --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/routes/yarn-node/yarn-nm-gpu.js @@ -0,0 +1,22 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import Ember from 'ember'; + +export default Ember.Route.extend({ +}); \ No newline at end of file diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-nm-gpu.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-nm-gpu.js new file mode 100644 index 00000000000..3567c683013 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-nm-gpu.js @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import DS from 'ember-data'; + +export default DS.JSONAPISerializer.extend({ + internalNormalizeSingleResponse(store, primaryModelClass, payload, id) { + if (payload.nodeInfo) { + payload = payload.nodeInfo; + } + + var fixedPayload = { + id: id, + type: primaryModelClass.modelName, + attributes: { + info: payload + } + }; + return fixedPayload; + }, + + normalizeSingleResponse(store, primaryModelClass, payload, id/*, requestType*/) { + // payload is of the form {"nodeInfo":{}} + var p = this.internalNormalizeSingleResponse(store, + primaryModelClass, payload, id); + return { data: p }; + }, +}); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-queue/capacity-queue.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-queue/capacity-queue.js index c7350ef03bc..7626598e092 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-queue/capacity-queue.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-queue/capacity-queue.js @@ -72,6 +72,7 @@ export default DS.JSONAPISerializer.extend({ preemptionDisabled: payload.preemptionDisabled, numPendingApplications: payload.numPendingApplications, numActiveApplications: payload.numActiveApplications, + resources: payload.resources, type: "capacity", }, // Relationships diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-rm-node.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-rm-node.js index 1c6d1be859a..a3a1d59168f 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-rm-node.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/serializers/yarn-rm-node.js @@ 
-41,7 +41,9 @@ export default DS.JSONAPISerializer.extend({ usedVirtualCores: payload.usedVirtualCores, availableVirtualCores: payload.availableVirtualCores, version: payload.version, - nodeLabels: payload.nodeLabels + nodeLabels: payload.nodeLabels, + usedResource: payload.used, + availableResource: payload.avail } }; return fixedPayload; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/cluster-overview.hbs hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/cluster-overview.hbs index e549ce568a2..ff4682a5b88 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/cluster-overview.hbs +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/cluster-overview.hbs @@ -90,41 +90,71 @@
- -
-
-
- Resource - Memory + + {{#if model.clusterMetrics.firstObject.getAllResourceTypesDonutChart}} + {{#each + model.clusterMetrics.firstObject.getAllResourceTypesDonutChart as |perTypeUsage|}} +
+
+
+ {{perTypeUsage.name}} - Usages +
+
+ {{donut-chart + data=perTypeUsage.data + showLabels=true + parentIdPrefix="resource-type-" + id=perTypeUsage.id + ratio=0.6 + unit=perTypeUsage.unit + type="resource" + maxHeight=350 + colorTargets="good" + colorTargetReverse=true}} +
+
-
- {{donut-chart data=model.clusterMetrics.firstObject.getMemoryDataForDonutChart - showLabels=true - parentId="mem-donut-chart" - ratio=0.6 - maxHeight=350 - colorTargets="good" - colorTargetReverse=true - type="memory"}} + {{/each}} + {{else}} +
+
+
+ Resource - Memory +
+
+ {{donut-chart + data=model.clusterMetrics.firstObject.getMemoryDataForDonutChart + showLabels=true + parentId="mem-donut-chart" + ratio=0.6 + maxHeight=350 + colorTargets="good" + colorTargetReverse=true + type="memory"}} +
-
-
-
-
- Resource - VCores -
-
- {{donut-chart data=model.clusterMetrics.firstObject.getVCoreDataForDonutChart - showLabels=true - parentId="vcore-donut-chart" - ratio=0.6 - maxHeight=350 - colorTargets="good" - colorTargetReverse=true}} +
+
+
+ Resource - VCores +
+
+ {{donut-chart + data=model.clusterMetrics.firstObject.getVCoreDataForDonutChart + showLabels=true + parentId="vcore-donut-chart" + ratio=0.6 + maxHeight=350 + colorTargets="good" + colorTargetReverse=true}} +
-
+ {{/if}} +
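The {{#each}} block above iterates model.clusterMetrics.firstObject.getAllResourceTypesDonutChart, a computed property that is not part of this hunk. As a rough, hypothetical sketch (names and values below are assumptions inferred from the template bindings perTypeUsage.name, perTypeUsage.id, perTypeUsage.unit and perTypeUsage.data, not code from the patch), the property would yield one chart descriptor per resource type:

// Hypothetical helper illustrating the descriptor shape the template consumes;
// each entry drives one donut-chart instance.
function buildResourceTypeCharts(usedByType, availByType) {
  var charts = [];
  var id = 0;
  for (var type in usedByType) {
    if (usedByType.hasOwnProperty(type)) {
      charts.push({
        id: id++,          // appended to parentIdPrefix="resource-type-"
        name: type,        // panel heading renders as "<name> - Usages"
        unit: "",          // unit string forwarded to the chart
        data: [
          { label: "Used", value: usedByType[type] },
          { label: "Available", value: availByType[type] }
        ]
      });
    }
  }
  return charts;
}

// e.g. buildResourceTypeCharts({ "yarn.io/gpu": 2 }, { "yarn.io/gpu": 6 })
// produces a single descriptor rendered as the "yarn.io/gpu - Usages" panel.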
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/components/node-menu-panel.hbs hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/components/node-menu-panel.hbs index d2486c9ff6f..966e408d2cf 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/components/node-menu-panel.hbs +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/components/node-menu-panel.hbs @@ -24,8 +24,8 @@
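The node-menu-panel change above (whose markup was lost in extraction here) accompanies the nmGpuInfo argument added at every call site below. A hypothetical sketch of component logic that could back such a menu, assuming the totalGpuDevices field that yarn-node/yarn-nm-gpu.hbs later reads; this is not the actual node-menu-panel code:

import Ember from 'ember';

// Sketch: expose a flag the menu template could use to show or hide a
// "GPU Information" entry for the node.
export default Ember.Component.extend({
  nmGpuInfo: null, // bound via {{node-menu-panel ... nmGpuInfo=model.nmGpuInfo}}

  hasGpuDevices: Ember.computed('nmGpuInfo.info.totalGpuDevices', function () {
    var devices = this.get('nmGpuInfo.info.totalGpuDevices');
    return !!(devices && devices.length > 0);
  })
});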
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/components/yarn-nm-gpu-info.hbs hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/components/yarn-nm-gpu-info.hbs new file mode 100644 index 00000000000..4118b1e7c81 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/components/yarn-nm-gpu-info.hbs @@ -0,0 +1,69 @@ +{{! + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +}} + +
+
Gpu Information - (Minor + Number {{gpu.minorNumber}}) +
+ + + + + + + + + + + + + + + + + + + +
Product Name{{gpu.productName}}
UUID{{gpu.uuid}}
Current Temperature{{gpu.temperature.currentGpuTemp}}
Max Temperature{{gpu.temperature.maxGpuTemp}}
+ +
+ {{gpu-donut-chart gpuInfo=gpu + showLabels=true + parentId="mem-donut-chart" + middleLabel = "Gpu Memory" + ratio=0.6 + type="memory" + gpu-render-type = "gpu-memory" + colorTargets="good" + colorTargetReverse=true + maxHeight=350}} +
+ +
+ {{gpu-donut-chart gpuInfo=gpu + showLabels=true + parentId="utilization-donut-chart" + middleLabel = "Gpu Utilization" + ratio=0.6 + gpu-render-type = "gpu-utilization" + colorTargets="good" + colorTargetReverse=true + suffix="%" + maxHeight=350}} +
+
\ No newline at end of file diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node-apps.hbs hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node-apps.hbs index 52f0c86c8e8..919e54df545 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node-apps.hbs +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node-apps.hbs @@ -20,7 +20,7 @@
- {{node-menu-panel path="yarn-node-apps" nodeAddr=model.nodeInfo.addr nodeId=model.nodeInfo.id}} + {{node-menu-panel path="yarn-node-apps" nodeAddr=model.nodeInfo.addr nodeId=model.nodeInfo.id nmGpuInfo=model.nmGpuInfo}} {{#if model.apps}}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node-containers.hbs hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node-containers.hbs index f520c46b9b4..1f312722072 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node-containers.hbs +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node-containers.hbs @@ -20,7 +20,7 @@
- {{node-menu-panel path="yarn-node-containers" nodeAddr=model.nodeInfo.addr nodeId=model.nodeInfo.id}} + {{node-menu-panel path="yarn-node-containers" nodeAddr=model.nodeInfo.addr nodeId=model.nodeInfo.id nmGpuInfo=model.nmGpuInfo}} {{#if model.containers}}
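The nmGpuInfo record threaded through these node pages is produced by the yarn-nm-gpu serializer earlier in this patch. A sketch of the transformation it performs; the payload fields are illustrative samples shaped after the attributes the templates read (totalGpuDevices, gpuDeviceInformation), and using the node address as the record id is an assumption:

// Example NM payload, wrapped in {"nodeInfo": {...}} as the serializer's
// comment states (all values are made up for illustration):
var payload = {
  nodeInfo: {
    totalGpuDevices: [0, 1],
    gpuDeviceInformation: {
      driverVersion: "384.81",
      gpus: [ /* one record per GPU, rendered by yarn-nm-gpu-info */ ]
    }
  }
};

// normalizeSingleResponse unwraps nodeInfo and emits a JSON:API document:
var normalized = {
  data: {
    id: "localhost:8042",        // assumed: the node_addr passed to findRecord
    type: "yarn-nm-gpu",         // primaryModelClass.modelName
    attributes: { info: payload.nodeInfo }
  }
};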
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node.hbs hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node.hbs deleted file mode 100644 index 1e8549bd87f..00000000000 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node.hbs +++ /dev/null @@ -1,125 +0,0 @@ -{{!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---}} - -{{breadcrumb-bar breadcrumbs=breadcrumbs}} - -
-
- - {{node-menu-panel path="yarn-node" nodeId=model.rmNode.id nodeAddr=model.node.id}} - -
- -
-
-
-
Node Information: {{model.rmNode.id}}
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - {{#if model.node.nmStartupTime}} - - - - - {{/if}} - - - - - - - - - -
Total Vmem allocated for Containers{{divide num=model.node.totalVmemAllocatedContainersMB den=1024}} GB
Vmem enforcement enabled{{model.node.vmemCheckEnabled}}
Total Pmem allocated for Containers{{divide num=model.node.totalPmemAllocatedContainersMB den=1024}} GB
Pmem enforcement enabled{{model.node.pmemCheckEnabled}}
Total VCores allocated for Containers{{model.node.totalVCoresAllocatedContainers}}
Node Healthy Status{{model.node.nodeHealthy}}
Last Node Health Report Time{{model.node.lastNodeUpdateTime}}
Node Health Report{{model.node.healthReport}}
Node Manager Start Time{{model.node.nmStartupTime}}
Node Manager Version{{model.node.nodeManagerBuildVersion}}
Hadoop Version{{model.node.hadoopBuildVersion}}
-
-
-
- -
-
-
-
- Resource - Memory -
-
- {{donut-chart data=model.rmNode.getMemoryDataForDonutChart - showLabels=true - parentId="mem-donut-chart" - ratio=0.6 - type="memory" - colorTargets="good" - colorTargetReverse=true - maxHeight=350}} -
-
-
- -
-
-
- Resource - VCores -
-
- {{donut-chart data=model.rmNode.getVCoreDataForDonutChart - showLabels=true - parentId="vcore-donut-chart" - ratio=0.6 - colorTargets="good" - colorTargetReverse=true - maxHeight=350}} -
-
-
-
-
-
-
-{{outlet}} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node/info.hbs hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node/info.hbs new file mode 100644 index 00000000000..ad411c096a1 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node/info.hbs @@ -0,0 +1,154 @@ +{{!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--}} + +{{breadcrumb-bar breadcrumbs=breadcrumbs}} + +
+
+ + {{node-menu-panel path="yarn-node" nodeId=model.rmNode.id + nodeAddr=model.node.id nmGpuInfo=model.nmGpuInfo}} + +
+ +
+
+
+
Node + Information: {{model.rmNode.id}}
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {{#if model.node.nmStartupTime}} + + + + + {{/if}} + + + + + + + + + +
Total Vmem allocated for Containers{{divide num=model.node.totalVmemAllocatedContainersMB + den=1024}} GB +
Vmem enforcement enabled{{model.node.vmemCheckEnabled}}
Total Pmem allocated for Containers{{divide num=model.node.totalPmemAllocatedContainersMB + den=1024}} GB +
Pmem enforcement enabled{{model.node.pmemCheckEnabled}}
Total VCores allocated for Containers{{model.node.totalVCoresAllocatedContainers}}
Node Healthy Status{{model.node.nodeHealthy}}
Last Node Health Report Time{{model.node.lastNodeUpdateTime}}
Node Health Report{{model.node.healthReport}}
Node Manager Start Time{{model.node.nmStartupTime}}
Node Manager Version{{model.node.nodeManagerBuildVersion}}
Hadoop Version{{model.node.hadoopBuildVersion}}
+
+
+
+ +
+
+
+
+ Resource - Memory +
+
+ {{donut-chart data=model.rmNode.getMemoryDataForDonutChart + showLabels=true + parentId="mem-donut-chart" + ratio=0.6 + type="memory" + colorTargets="good" + colorTargetReverse=true + maxHeight=350}} +
+
+
+ +
+
+
+ Resource - VCores +
+
+ {{donut-chart data=model.rmNode.getVCoreDataForDonutChart + showLabels=true + parentId="vcore-donut-chart" + ratio=0.6 + colorTargets="good" + colorTargetReverse=true + maxHeight=350}} +
+
+
+
+ + {{#if model.nmGpuInfo}} +
+
+
+
+
+ Resources - yarn.io/gpu
+
+
+ {{donut-chart data=model.rmNode.getGpuDataForDonutChart + showLabels=true + parentId="gpu-donut-chart" + ratio=0.6 + colorTargets="good" + colorTargetReverse=true + maxHeight=350}}
+
+
+
+ {{/if}}
+
+
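The GPU panel above binds model.rmNode.getGpuDataForDonutChart, which is not defined in this hunk. A minimal sketch of how such a property could be derived from the usedResource and availableResource attributes the yarn-rm-node serializer now captures; the nested resourceInformation layout is an assumption about the RM payload, not code from the patch:

// Hypothetical derivation (not the actual yarn-rm-node model code):
function getGpuDataForDonutChart(usedResource, availableResource) {
  // Pull the yarn.io/gpu entry out of a resource object, defaulting to 0.
  function gpuValue(resource) {
    var infos = (resource && resource.resourceInformations &&
                 resource.resourceInformations.resourceInformation) || [];
    for (var i = 0; i < infos.length; i++) {
      if (infos[i].name === "yarn.io/gpu") {
        return infos[i].value;
      }
    }
    return 0;
  }
  return [
    { label: "Used", value: gpuValue(usedResource) },
    { label: "Available", value: gpuValue(availableResource) }
  ];
}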
+{{outlet}} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node/yarn-nm-gpu.hbs hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node/yarn-nm-gpu.hbs new file mode 100644 index 00000000000..0464cc8db50 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/templates/yarn-node/yarn-nm-gpu.hbs @@ -0,0 +1,57 @@ +{{!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--}} + +{{breadcrumb-bar breadcrumbs=breadcrumbs}} +
+
+ + {{node-menu-panel path="yarn-node" nodeId=model.rmNode.id + nodeAddr=model.node.id nmGpuInfo=model.nmGpuInfo}} + {{#if model.nmGpuInfo.info.totalGpuDevices}} + +
+
+
Gpu Information
+ + + + + + + + + + + + + + +
VendorNVIDIA
Driver Version{{model.nmGpuInfo.info.gpuDeviceInformation.driverVersion}}
Total Number Of Gpus{{model.nmGpuInfo.info.totalGpuDevices.length}}
+
+ + {{#each model.nmGpuInfo.info.gpuDeviceInformation.gpus as |gpu|}} + {{yarn-nm-gpu-info gpu=gpu}} + {{/each}} +
+ {{else}} +

No GPUs are found on this node.

+ {{/if}} +
+
\ No newline at end of file diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/utils/converter.js hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/utils/converter.js index b9da222cceb..74cc9161d31 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/utils/converter.js +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/src/main/webapp/app/utils/converter.js @@ -130,6 +130,57 @@ export default { } return value.toFixed(1) + " " + unit; }, + resourceToSimplifiedUnit: function (value, unit) { + // First convert unit to base unit (""). + var normalizedValue = value; + if (unit === "Ki") { + normalizedValue = normalizedValue * 1024; + } else if (unit === "Mi") { + normalizedValue = normalizedValue * 1024 * 1024; + } else if (unit === "Gi") { + normalizedValue = normalizedValue * 1024 * 1024 * 1024; + } else if (unit === "Ti") { + normalizedValue = normalizedValue * 1024 * 1024 * 1024 * 1024; + } else if (unit === "Pi") { + normalizedValue = normalizedValue * 1024 * 1024 * 1024 * 1024 * 1024; + } else if (unit === "K" || unit === "k") { + normalizedValue = normalizedValue * 1000; + } else if (unit === "M" || unit === "m") { + normalizedValue = normalizedValue * 1000 * 1000; + } else if (unit === "G" || unit === "g") { + normalizedValue = normalizedValue * 1000 * 1000 * 1000; + } else if (unit === "T" || unit === "t") { + normalizedValue = normalizedValue * 1000 * 1000 * 1000 * 1000; + } else if (unit === "P" || unit === "p") { + normalizedValue = normalizedValue * 1000 * 1000 * 1000 * 1000 * 1000; + } + + // From the base unit (""), convert to the most human-readable unit, + // i.e. the first unit for which the value falls below 0.9 * 1024; + // e.g. resourceToSimplifiedUnit(2048, "Mi") returns "2.0 Gi". + var finalUnit = ""; + if (normalizedValue / 1024 >= 0.9) { + normalizedValue = normalizedValue / 1024; + finalUnit = "Ki"; + } + if (normalizedValue / 1024 >= 0.9) { + normalizedValue = normalizedValue / 1024; + finalUnit = "Mi"; + } + if (normalizedValue / 1024 >= 0.9) { + normalizedValue = normalizedValue / 1024; + finalUnit = "Gi"; + } + if (normalizedValue / 1024 >= 0.9) { + normalizedValue = normalizedValue / 1024; + finalUnit = "Ti"; + } + if (normalizedValue / 1024 >= 0.9) { + normalizedValue = normalizedValue / 1024; + finalUnit = "Pi"; + } + + return normalizedValue.toFixed(1) + " " + finalUnit; + }, msToElapsedTimeUnit: function(millisecs, short) { var seconds = Math.floor(millisecs / 1000); var days = Math.floor(seconds / (3600 * 24));