diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
index 86f2554..1c3e253 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
@@ -39,6 +39,8 @@
 
   long getContainerStartTime();
 
+  long getContainerLaunchTime();
+
   Resource getResource();
 
   ContainerTokenIdentifier getContainerTokenIdentifier();
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index c09c7f1..2faa4d2 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -883,6 +883,11 @@ public long getContainerStartTime() {
   }
 
   @Override
+  public long getContainerLaunchTime() {
+    return this.containerLaunchStartTime;
+  }
+
+  @Override
   public Resource getResource() {
     return Resources.clone(
         this.containerTokenIdentifier.getResource());
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java
index c690225..578cb96 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java
@@ -46,9 +46,16 @@ public class DefaultOOMHandler implements Runnable {
   protected static final Log LOG = LogFactory
       .getLog(DefaultOOMHandler.class);
-  private Context context;
-  private boolean virtual;
-  private CGroupsHandler cgroups;
+  protected final Context context;
+  protected final boolean virtual;
+  protected final CGroupsHandler cgroups;
+
+  // Reverse order by start time
+  protected static final Comparator<Container> CONTAINER_START_TIME_COMPARATOR =
+      (Container o1, Container o2) -> {
+        long order = o1.getContainerLaunchTime() - o2.getContainerLaunchTime();
+        return order > 0 ? -1 : order < 0 ? 1 : 0;
+      };
 
   /**
    * Create an OOM handler.
@@ -59,12 +66,12 @@ public DefaultOOMHandler(Context context, boolean testVirtual) {
     this.context = context;
     this.virtual = testVirtual;
-    this.cgroups = ResourceHandlerModule.getCGroupsHandler();
+    this.cgroups = getCGroupsHandler();
   }
 
   @VisibleForTesting
-  void setCGroupsHandler(CGroupsHandler handler) {
-    cgroups = handler;
+  protected CGroupsHandler getCGroupsHandler() {
+    return ResourceHandlerModule.getCGroupsHandler();
   }
 
   /**
@@ -121,7 +128,7 @@ private boolean killContainerIfOOM(Container container, String fileName) {
    *
    * @param container Container to clean up
    */
-  private void sigKill(Container container) {
+  protected void sigKill(Container container) {
     boolean finished = false;
     try {
       while (!finished) {
@@ -168,21 +175,12 @@ private void sigKill(Container container) {
   /**
    * It is called when the node is under an OOM condition. All processes in
    * all sub-cgroups are suspended. We need to act fast, so that we do not
-   * affect the overall system utilization.
-   * In general we try to find a newly run container that exceeded its limits.
-   * The justification is cost, since probably this is the one that has
-   * accumulated the least amount of uncommitted data so far.
-   * We continue the process until the OOM is resolved.
+   * affect the overall system utilization. We continue the process until
+   * the OOM is resolved.
    */
   @Override
   public void run() {
     try {
-      // Reverse order by start time
-      Comparator<Container> comparator = (Container o1, Container o2) -> {
-        long order = o1.getContainerStartTime() - o2.getContainerStartTime();
-        return order > 0 ? -1 : order < 0 ? 1 : 0;
-      };
-
       // We kill containers until the kernel reports the OOM situation resolved
       // Note: If the kernel has a delay this may kill more than necessary
       while (true) {
@@ -194,61 +192,77 @@ public void run() {
           break;
         }
 
-        // The first pass kills a recent container
-        // that uses more than its request
-        ArrayList<Container> containers = new ArrayList<>();
-        containers.addAll(context.getContainers().values());
-        // Note: Sorting may take a long time with 10K+ containers
-        // but it is acceptable now with low number of containers per node
-        containers.sort(comparator);
-
-        // Kill the latest container that exceeded its request
-        boolean found = false;
-        for (Container container : containers) {
-          if (!virtual) {
-            if (killContainerIfOOM(container,
-                CGROUP_PARAM_MEMORY_USAGE_BYTES)) {
-              found = true;
-              break;
-            }
-          } else {
-            if (killContainerIfOOM(container,
-                CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) {
-              found = true;
-              break;
-            }
-          }
-        }
-        if (found) {
-          continue;
-        }
+        boolean containerKilled = killContainer();
 
-        // We have not found any containers that ran out of their limit,
-        // so we will kill the latest one. This can happen, if all use
-        // close to their request and one of them requests a big block
-        // triggering the OOM freeze.
-        // Currently there is no other way to identify the outstanding one.
-        if (containers.size() > 0) {
-          Container container = containers.get(0);
-          sigKill(container);
-          String message = String.format(
-              "Newest container %s killed by elastic cgroups OOM handler using",
-              container.getContainerId());
-          LOG.warn(message);
-          continue;
+        if (!containerKilled) {
+          // This can happen, if SIGKILL did not clean up
+          // non-PGID or containers or containers launched by other users
+          // or if a process was put to the root YARN cgroup.
+          throw new YarnRuntimeException(
+              "Could not find any containers but CGroups " +
+                  "reserved for containers ran out of memory. " +
+                  "I am giving up");
         }
-
-        // This can happen, if SIGKILL did not clean up
-        // non-PGID or containers or containers launched by other users
-        // or if a process was put to the root YARN cgroup.
-        throw new YarnRuntimeException(
-            "Could not find any containers but CGroups " +
-            "reserved for containers ran out of memory. " +
-            "I am giving up");
       }
     } catch (ResourceHandlerException ex) {
       LOG.warn("Could not fecth OOM status. "
          + "This is expected at shutdown. Exiting.", ex);
     }
   }
+
+  /**
+   * Choose and kill a container in case of OOM. We try to find a newly run
+   * container that has exceeded its limits, since probably this is the one
+   * that has accumulated the least amount of uncommitted data so far. If
+   * there is no such container found, we choose to kill the most recently
+   * launched one.
+   * @return true if a container is killed, false otherwise
+   */
+  protected boolean killContainer() {
+    boolean containerKilled = false;
+
+    // The first pass kills a recent container
+    // that uses more than its request
+    ArrayList<Container> containers = new ArrayList<>();
+    containers.addAll(context.getContainers().values());
+    // Note: Sorting may take a long time with 10K+ containers
+    // but it is acceptable now with low number of containers per node
+    containers.sort(CONTAINER_START_TIME_COMPARATOR);
+
+    // Kill the latest container that exceeded its request
+    for (Container container : containers) {
+      if (!virtual) {
+        if (killContainerIfOOM(container,
+            CGROUP_PARAM_MEMORY_USAGE_BYTES)) {
+          containerKilled = true;
+          break;
+        }
+      } else {
+        if (killContainerIfOOM(container,
+            CGROUP_PARAM_MEMORY_MEMSW_USAGE_BYTES)) {
+          containerKilled = true;
+          break;
+        }
+      }
+    }
+
+    if (!containerKilled) {
+      // We have not found any containers that ran out of their limit,
+      // so we will kill the latest one. This can happen, if all use
+      // close to their request and one of them requests a big block
+      // triggering the OOM freeze.
+      // Currently there is no other way to identify the outstanding one.
+      if (containers.size() > 0) {
+        Container container = containers.get(0);
+        sigKill(container);
+        String message = String.format(
+            "Newest container %s killed by elastic cgroups OOM handler using",
+            container.getContainerId());
+        LOG.warn(message);
+        containerKilled = true;
+      }
+    }
+
+    return containerKilled;
+  }
 }

diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/OverAllocationOOMHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/OverAllocationOOMHandler.java
new file mode 100644
index 0000000..3595fe5
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/OverAllocationOOMHandler.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
+
+import org.apache.hadoop.yarn.api.records.ExecutionType;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * An OOM handler, executed when the root container cgroup runs into an
+ * OOM situation, that tries to kill the most recently started OPPORTUNISTIC
+ * containers first, followed by the most recently started GUARANTEED
+ * containers in cases where no OPPORTUNISTIC containers were found or the
+ * OOM cannot be resolved after killing all OPPORTUNISTIC containers.
+ */
+public class OverAllocationOOMHandler extends DefaultOOMHandler {
+  /**
+   * Create an OOM handler.
+   * This has to be public to be able to construct through reflection.
+   *
+   * @param context node manager context to work with
+   * @param testVirtual Test virtual memory or physical
+   */
+  public OverAllocationOOMHandler(Context context, boolean testVirtual) {
+    super(context, testVirtual);
+  }
+
+  @Override
+  protected boolean killContainer() {
+    boolean containerKilled = tryToKillOpportunisticContainer();
+
+    if (!containerKilled) {
+      // fall back to kill GUARANTEED containers when we have no more
+      // OPPORTUNISTIC containers left.
+      containerKilled = tryToKillGuaranteedContainer();
+    }
+
+    return containerKilled;
+  }
+
+  /**
+   * Try to choose and kill the most recently started OPPORTUNISTIC container.
+   * @return true if the most recently started OPPORTUNISTIC container is
+   * killed, false otherwise
+   */
+  private boolean tryToKillOpportunisticContainer() {
+    boolean containerKilled = false;
+
+    List<Container> candidates = context.getContainers().values().stream()
+        .filter(container -> ExecutionType.OPPORTUNISTIC.equals(
+            container.getContainerTokenIdentifier().getExecutionType()))
+        .collect(Collectors.toList());
+
+    candidates.sort(CONTAINER_START_TIME_COMPARATOR);
+
+    if (candidates.size() > 0) {
+      Container container = candidates.get(0);
+      sigKill(container);
+      String message = String.format(
+          "Newest OPPORTUNISTIC container %s killed by elastic cgroups OOM" +
+              " handler using", container.getContainerId().toString());
+      LOG.warn(message);
+      containerKilled = true;
+    }
+
+    return containerKilled;
+  }
+
+  /**
+   * Try to choose and kill the most recently started GUARANTEED container.
+   * @return true if the most recently started GUARANTEED container is
+   * killed, false otherwise
+   */
+  private boolean tryToKillGuaranteedContainer() {
+    boolean containerKilled = false;
+
+    List<Container> candidates = context.getContainers().values().stream()
+        .filter(container -> !ExecutionType.OPPORTUNISTIC.equals(
+            container.getContainerTokenIdentifier().getExecutionType()))
+        .collect(Collectors.toList());
+
+    candidates.sort(CONTAINER_START_TIME_COMPARATOR);
+
+    if (candidates.size() > 0) {
+      Container container = candidates.get(0);
+      sigKill(container);
+      String message = String.format(
+          "Newest GUARANTEED container %s killed by elastic cgroups OOM" +
+              " handler using", container.getContainerId().toString());
+      LOG.warn(message);
+      containerKilled = true;
+    }
+
+    return containerKilled;
+  }
+
+}
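The constructor javadoc above notes that OverAllocationOOMHandler has to be public so that it can be constructed through reflection. The wiring that performs that lookup is not part of this patch; the sketch below only illustrates how a handler that implements Runnable and exposes a public (Context, boolean) constructor, as both DefaultOOMHandler and OverAllocationOOMHandler do, could be instantiated from a class name. The factory class and method names are hypothetical.

import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.server.nodemanager.Context;

final class OOMHandlerFactory {

  private OOMHandlerFactory() {
  }

  /**
   * Reflectively build an OOM handler from a class name, for example the
   * fully qualified name of DefaultOOMHandler or OverAllocationOOMHandler.
   */
  static Runnable createHandler(String className, Context context,
      boolean testVirtual) {
    try {
      Class<? extends Runnable> clazz =
          Class.forName(className).asSubclass(Runnable.class);
      // Both handlers expose a public (Context, boolean) constructor.
      return clazz.getConstructor(Context.class, Boolean.TYPE)
          .newInstance(context, testVirtual);
    } catch (ReflectiveOperationException e) {
      throw new YarnRuntimeException(
          "Could not instantiate OOM handler " + className, e);
    }
  }
}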
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java
index 60c38fe..6f406b6 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestDefaultOOMHandler.java
@@ -66,8 +66,12 @@ public void testNoContainers() throws Exception {
         CGROUP_PARAM_MEMORY_OOM_CONTROL))
         .thenReturn("under_oom 1").thenReturn("under_oom 0");
 
-    DefaultOOMHandler handler = new DefaultOOMHandler(context, false);
-    handler.setCGroupsHandler(cGroupsHandler);
+    DefaultOOMHandler handler = new DefaultOOMHandler(context, false) {
+      @Override
+      protected CGroupsHandler getCGroupsHandler() {
+        return cGroupsHandler;
+      }
+    };
     handler.run();
   }
 
@@ -86,14 +90,14 @@ public void testBothContainersOOM() throws Exception {
     ContainerId cid1 = createContainerId(1);
     when(c1.getContainerId()).thenReturn(cid1);
     when(c1.getResource()).thenReturn(Resource.newInstance(10, 1));
-    when(c1.getContainerStartTime()).thenReturn((long) 1);
+    when(c1.getContainerLaunchTime()).thenReturn((long) 1);
     containers.put(createContainerId(1), c1);
 
     Container c2 = mock(Container.class);
     ContainerId cid2 = createContainerId(2);
     when(c2.getContainerId()).thenReturn(cid2);
     when(c2.getResource()).thenReturn(Resource.newInstance(10, 1));
-    when(c2.getContainerStartTime()).thenReturn((long) 2);
+    when(c2.getContainerLaunchTime()).thenReturn((long) 2);
     containers.put(cid2, c2);
 
     CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
@@ -145,14 +149,14 @@ public void testOneContainerOOM() throws Exception {
     ContainerId cid1 = createContainerId(1);
     when(c1.getContainerId()).thenReturn(cid1);
     when(c1.getResource()).thenReturn(Resource.newInstance(10, 1));
-    when(c1.getContainerStartTime()).thenReturn((long) 2);
+    when(c1.getContainerLaunchTime()).thenReturn((long) 2);
     containers.put(createContainerId(1), c1);
 
     Container c2 = mock(Container.class);
     ContainerId cid2 = createContainerId(2);
     when(c2.getContainerId()).thenReturn(cid2);
     when(c2.getResource()).thenReturn(Resource.newInstance(10, 1));
-    when(c2.getContainerStartTime()).thenReturn((long) 1);
+    when(c2.getContainerLaunchTime()).thenReturn((long) 1);
     containers.put(cid2, c2);
 
     CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
@@ -202,14 +206,14 @@ public void testNoContainerOOM() throws Exception {
     ContainerId cid1 = createContainerId(1);
     when(c1.getContainerId()).thenReturn(cid1);
     when(c1.getResource()).thenReturn(Resource.newInstance(10, 1));
-    when(c1.getContainerStartTime()).thenReturn((long) 1);
+    when(c1.getContainerLaunchTime()).thenReturn((long) 1);
     containers.put(createContainerId(1), c1);
 
     Container c2 = mock(Container.class);
     ContainerId cid2 = createContainerId(2);
     when(c2.getContainerId()).thenReturn(cid2);
     when(c2.getResource()).thenReturn(Resource.newInstance(10, 1));
-    when(c2.getContainerStartTime()).thenReturn((long) 2);
+    when(c2.getContainerLaunchTime()).thenReturn((long) 2);
     containers.put(cid2, c2);
 
     CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
@@ -267,8 +271,12 @@ private void runOOMHandler(
 
     when(context.getContainerExecutor()).thenReturn(ex);
 
-    DefaultOOMHandler handler = new DefaultOOMHandler(context, false);
-    handler.setCGroupsHandler(cGroupsHandler);
+    DefaultOOMHandler handler = new DefaultOOMHandler(context, false) {
+      @Override
+      protected CGroupsHandler getCGroupsHandler() {
+        return cGroupsHandler;
+      }
+    };
     handler.run();
   }
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestOverAllocationOOMHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestOverAllocationOOMHandler.java
new file mode 100644
index 0000000..1b6e5c0
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestOverAllocationOOMHandler.java
@@ -0,0 +1,400 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
+
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ExecutionType;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
+import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerSignalContext;
+import org.junit.Test;
+import org.mockito.ArgumentCaptor;
+
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_FILE_TASKS;
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler.CGROUP_PARAM_MEMORY_OOM_CONTROL;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+/**
+ * Unit tests for OverAllocationOOMHandler.
+ */
+public class TestOverAllocationOOMHandler {
+  /**
+   * Test an OOM situation where no containers are running.
+   */
+  @Test(expected = YarnRuntimeException.class)
+  public void testExceptionThrownWithNoContainers() throws Exception {
+    Context context = mock(Context.class);
+
+    when(context.getContainers()).thenReturn(new ConcurrentHashMap<>());
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY, "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1").thenReturn("under_oom 0");
+
+    OverAllocationOOMHandler handler =
+        new OverAllocationOOMHandler(context, false) {
+          @Override
+          protected CGroupsHandler getCGroupsHandler() {
+            return cGroupsHandler;
+          }
+        };
+    handler.run();
+  }
+
+
+  /**
+   * We have two OPPORTUNISTIC containers and one GUARANTEED container running.
+   * OOM is resolved after killing the most recently started OPPORTUNISTIC
+   * container.
+   */
+  @Test
+  public void testKillOneOpportunisticContainerUponOOM() throws Exception {
+    int currentContainerId = 0;
+
+    ConcurrentHashMap<ContainerId, Container> containers =
+        new ConcurrentHashMap<>();
+    Container c1 = createContainer(currentContainerId++, false, 1);
+    containers.put(c1.getContainerId(), c1);
+    Container c2 = createContainer(currentContainerId++, false, 2);
+    containers.put(c2.getContainerId(), c2);
+    Container c3 = createContainer(currentContainerId++, true, 2);
+    containers.put(c3.getContainerId(), c3);
+
+    ContainerExecutor ex = mock(ContainerExecutor.class);
+    when(ex.signalContainer(any()))
+        .thenAnswer(invocation -> {
+          assertEquals("Wrong pid killed", "1234",
+              ((ContainerSignalContext) invocation.getArguments()[0]).getPid());
+          Object[] arguments = invocation.getArguments();
+          Container container = ((ContainerSignalContext) arguments[0])
+              .getContainer();
+          // remove container from NM context immediately
+          containers.remove(container.getContainerId());
+          return true;
+        });
+
+    Context context = mock(Context.class);
+    when(context.getContainers()).thenReturn(containers);
+    when(context.getContainerExecutor()).thenReturn(ex);
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1")
+        .thenReturn("under_oom 0");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+
+    OverAllocationOOMHandler handler =
+        new OverAllocationOOMHandler(context, false) {
+          @Override
+          protected CGroupsHandler getCGroupsHandler() {
+            return cGroupsHandler;
+          }
+        };
+    handler.run();
+
+    verify(ex, times(1)).signalContainer(
+        new ContainerSignalContext.Builder()
+            .setPid("1234")
+            .setContainer(c2)
+            .setSignal(ContainerExecutor.Signal.KILL)
+            .build()
+    );
+    verify(ex, times(1)).signalContainer(any());
+  }
+
+  /**
+   * We have two OPPORTUNISTIC containers and one GUARANTEED container running.
+   * OOM is resolved after killing both OPPORTUNISTIC containers.
+   */
+  @Test
+  public void testKillBothOpportunisticContainerUponOOM() throws Exception {
+    int currentContainerId = 0;
+
+    ConcurrentHashMap<ContainerId, Container> containers =
+        new ConcurrentHashMap<>();
+    Container c1 = createContainer(currentContainerId++, false, 1);
+    containers.put(c1.getContainerId(), c1);
+    Container c2 = createContainer(currentContainerId++, false, 2);
+    containers.put(c2.getContainerId(), c2);
+    Container c3 = createContainer(currentContainerId++, true, 2);
+    containers.put(c3.getContainerId(), c3);
+
+    ContainerExecutor ex = mock(ContainerExecutor.class);
+    when(ex.signalContainer(any())).thenAnswer(
+        invocation -> {
+          assertEquals("Wrong pid killed", "1234",
+              ((ContainerSignalContext) invocation.getArguments()[0]).getPid());
+          Object[] arguments = invocation.getArguments();
+          Container container = ((ContainerSignalContext) arguments[0])
+              .getContainer();
+          // remove container from NM context immediately
+          containers.remove(container.getContainerId());
+          return true;
+        });
+
+    Context context = mock(Context.class);
+    when(context.getContainers()).thenReturn(containers);
+    when(context.getContainerExecutor()).thenReturn(ex);
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1")
+        .thenReturn("under_oom 1")
+        .thenReturn("under_oom 0");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+
+    OverAllocationOOMHandler handler =
+        new OverAllocationOOMHandler(context, false) {
+          @Override
+          protected CGroupsHandler getCGroupsHandler() {
+            return cGroupsHandler;
+          }
+        };
+    handler.run();
+
+    ArgumentCaptor<ContainerSignalContext> argument =
+        ArgumentCaptor.forClass(ContainerSignalContext.class);
+    verify(ex, times(2))
+        .signalContainer(argument.capture());
+
+    List<ContainerSignalContext> containersKilled =
+        argument.getAllValues();
+    assertTrue(containersKilled.get(0).getContainer().equals(c2));
+    assertTrue(containersKilled.get(1).getContainer().equals(c1));
+  }
+
+  /**
+   * We have two OPPORTUNISTIC containers and one GUARANTEED container running.
+   * OOM is resolved after killing all running containers.
+   */
+  @Test
+  public void testKillAllContainersUponOOM() throws Exception {
+    int currentContainerId = 0;
+
+    ConcurrentHashMap<ContainerId, Container> containers =
+        new ConcurrentHashMap<>();
+    Container c1 = createContainer(currentContainerId++, false, 1);
+    containers.put(c1.getContainerId(), c1);
+    Container c2 = createContainer(currentContainerId++, false, 2);
+    containers.put(c2.getContainerId(), c2);
+    Container c3 = createContainer(currentContainerId++, true, 2);
+    containers.put(c3.getContainerId(), c3);
+
+    ContainerExecutor ex = mock(ContainerExecutor.class);
+    when(ex.signalContainer(any())).thenAnswer(
+        invocation -> {
+          assertEquals("Wrong pid killed", "1234",
+              ((ContainerSignalContext) invocation.getArguments()[0]).getPid());
+          Object[] arguments = invocation.getArguments();
+          Container container = ((ContainerSignalContext) arguments[0])
+              .getContainer();
+          // remove container from NM context immediately
+          containers.remove(container.getContainerId());
+          return true;
+        });
+
+    Context context = mock(Context.class);
+    when(context.getContainers()).thenReturn(containers);
+    when(context.getContainerExecutor()).thenReturn(ex);
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1")
+        .thenReturn("under_oom 1")
+        .thenReturn("under_oom 1")
+        .thenReturn("under_oom 0");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+
+    OverAllocationOOMHandler handler =
+        new OverAllocationOOMHandler(context, false) {
+          @Override
+          protected CGroupsHandler getCGroupsHandler() {
+            return cGroupsHandler;
+          }
+        };
+    handler.run();
+
+    verify(ex, times(1)).signalContainer(
+        new ContainerSignalContext.Builder()
+            .setPid("1234")
+            .setContainer(c2)
+            .setSignal(ContainerExecutor.Signal.KILL)
+            .build()
+    );
+    verify(ex, times(1)).signalContainer(
+        new ContainerSignalContext.Builder()
+            .setPid("1234")
+            .setContainer(c1)
+            .setSignal(ContainerExecutor.Signal.KILL)
+            .build()
+    );
+    verify(ex, times(1)).signalContainer(
+        new ContainerSignalContext.Builder()
+            .setPid("1234")
+            .setContainer(c3)
+            .setSignal(ContainerExecutor.Signal.KILL)
+            .build()
+    );
+    verify(ex, times(3)).signalContainer(any());
+  }
+
+
+  @Test(expected = YarnRuntimeException.class)
+  public void testOOMUnresolvedAfterKillingAllContainers() throws Exception {
+    int currentContainerId = 0;
+
+    ConcurrentHashMap<ContainerId, Container> containers =
+        new ConcurrentHashMap<>();
+    Container c1 = createContainer(currentContainerId++, false, 1);
+    containers.put(c1.getContainerId(), c1);
+    Container c2 = createContainer(currentContainerId++, false, 2);
+    containers.put(c2.getContainerId(), c2);
+    Container c3 = createContainer(currentContainerId++, true, 2);
+    containers.put(c3.getContainerId(), c3);
+
+    ContainerExecutor ex = mock(ContainerExecutor.class);
+    when(ex.signalContainer(any())).thenAnswer(
+        invocation -> {
+          assertEquals("Wrong pid killed", "1234",
+              ((ContainerSignalContext) invocation.getArguments()[0]).getPid());
+          Object[] arguments = invocation.getArguments();
+          Container container = ((ContainerSignalContext) arguments[0])
+              .getContainer();
+          // remove container from NM context immediately
+          containers.remove(container.getContainerId());
+          return true;
+        });
+
+    Context context = mock(Context.class);
+    when(context.getContainers()).thenReturn(containers);
+    when(context.getContainerExecutor()).thenReturn(ex);
+
+    CGroupsHandler cGroupsHandler = mock(CGroupsHandler.class);
+    when(cGroupsHandler.getCGroupParam(
+        CGroupsHandler.CGroupController.MEMORY,
+        "",
+        CGROUP_PARAM_MEMORY_OOM_CONTROL))
+        .thenReturn("under_oom 1")
+        .thenReturn("under_oom 1")
+        .thenReturn("under_oom 1")
+        .thenReturn("under_oom 1");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c1.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c2.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+    when(cGroupsHandler.getCGroupParam(CGroupsHandler.CGroupController.MEMORY,
+        c3.getContainerId().toString(), CGROUP_FILE_TASKS))
+        .thenReturn("1234").thenReturn("");
+
+    OverAllocationOOMHandler handler =
+        new OverAllocationOOMHandler(context, false) {
+          @Override
+          protected CGroupsHandler getCGroupsHandler() {
+            return cGroupsHandler;
+          }
+        };
+    handler.run();
+  }
+
+  private static Container createContainer(int containerId,
+      boolean guaranteed, long launchTime) {
+    Container c1 = mock(Container.class);
+    ContainerId cid1 = createContainerId(containerId);
+    when(c1.getContainerId()).thenReturn(cid1);
+
+    ContainerTokenIdentifier token = mock(ContainerTokenIdentifier.class);
+    ExecutionType type =
+        guaranteed ? ExecutionType.GUARANTEED : ExecutionType.OPPORTUNISTIC;
+    when(token.getExecutionType()).thenReturn(type);
+    when(c1.getContainerTokenIdentifier()).thenReturn(token);
+
+    when(c1.getResource()).thenReturn(Resource.newInstance(1, 1));
+    when(c1.getContainerLaunchTime()).thenReturn(launchTime);
+
+    return c1;
+  }
+
+  private static ContainerId createContainerId(long id) {
+    ApplicationId applicationId = ApplicationId.newInstance(1, 1);
+
+    ApplicationAttemptId applicationAttemptId
+        = mock(ApplicationAttemptId.class);
+    when(applicationAttemptId.getApplicationId()).thenReturn(applicationId);
+    when(applicationAttemptId.getAttemptId()).thenReturn(1);
+
+    ContainerId containerId = mock(ContainerId.class);
+    when(containerId.toString()).thenReturn(Long.toString(id));
+    when(containerId.getContainerId()).thenReturn(new Long(1));
+
+    return containerId;
+  }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
index 77ebd34..325709b 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
@@ -242,6 +242,11 @@ public long getContainerStartTime() {
   }
 
   @Override
+  public long getContainerLaunchTime() {
+    return 0;
+  }
+
+  @Override
   public ResourceMappings getResourceMappings() {
     return null;
   }
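As a side note on the ordering used throughout this patch: CONTAINER_START_TIME_COMPARATOR hand-codes a newest-first comparison of getContainerLaunchTime(). An equivalent formulation with the JDK comparator combinators is shown below purely for illustration; the holder class is hypothetical and nothing in the patch depends on it.

import java.util.Comparator;

import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;

final class LaunchTimeOrdering {
  // Newest-first: containers with the largest launch time sort to the front,
  // matching the behavior of CONTAINER_START_TIME_COMPARATOR in this patch.
  static final Comparator<Container> NEWEST_FIRST =
      Comparator.comparingLong(Container::getContainerLaunchTime).reversed();

  private LaunchTimeOrdering() {
  }
}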