diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 2003e13..be9e301 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -674,7 +674,20 @@
   /** Where the linux container executor should mount cgroups if not found */
   public static final String NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH =
     NM_PREFIX + "linux-container-executor.cgroups.mount-path";
-  
+
+  /**
+   * Maximum time, in milliseconds, the linux container executor keeps
+   * retrying the deletion of a container's cgroup during container cleanup.
+   * Retrying is needed because SIGTERM/SIGKILL delivery is asynchronous, so
+   * the cgroup may not be empty yet when the first delete is attempted.
+   */
+  public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
+    NM_PREFIX + "linux-container-executor.cgroups.delete-timeout-ms";
+
+  /** Default cgroup delete timeout, in milliseconds. */
+  public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
+      1000;
+
   /** T-file compression types used to compress aggregated logs.*/
   public static final String NM_LOG_AGG_COMPRESSION_TYPE =
     NM_PREFIX + "log-aggregation.compression-type";
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
index 3a44afe..d5bd225 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
@@ -32,6 +32,7 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -40,6 +41,8 @@
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
+import org.apache.hadoop.yarn.util.Clock;
+import org.apache.hadoop.yarn.util.SystemClock;
 
 public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
 
@@ -59,8 +62,14 @@
   private final int CPU_DEFAULT_WEIGHT = 1024; // set by kernel
   private final Map controllerPaths; // Controller -> path
 
+  // maximum time (in ms) to keep retrying the deletion of a cgroup
+  private long deleteCgroupTimeout;
+  // package private for testing purposes
+  Clock clock;
+
   public CgroupsLCEResourcesHandler() {
     this.controllerPaths = new HashMap();
+    clock = new SystemClock();
   }
 
   @Override
@@ -73,7 +82,12 @@ public Configuration getConf() {
     return conf;
   }
 
-  public synchronized void init(LinuxContainerExecutor lce) throws IOException {
+  /**
+   * Loads the cgroups settings (hierarchy prefix, mount path, delete timeout)
+   * from the configuration and strips a leading/trailing '/' from the prefix.
+   */
+  @VisibleForTesting
+  void initConfig() throws IOException {
 
     this.cgroupPrefix = conf.get(YarnConfiguration.
             NM_LINUX_CONTAINER_CGROUPS_HIERARCHY, "/hadoop-yarn");
@@ -82,6 +96,9 @@ public synchronized void init(LinuxContainerExecutor lce) throws IOException {
     this.cgroupMountPath = conf.get(YarnConfiguration.
             NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH, null);
 
+    this.deleteCgroupTimeout = conf.getLong(
+        YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
+        YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
     // remove extra /'s at end or start of cgroupPrefix
     if (cgroupPrefix.charAt(0) == '/') {
       cgroupPrefix = cgroupPrefix.substring(1);
@@ -91,7 +108,11 @@ public synchronized void init(LinuxContainerExecutor lce) throws IOException {
     if (cgroupPrefix.charAt(len - 1) == '/') {
       cgroupPrefix = cgroupPrefix.substring(0, len - 1);
     }
+  }
 
+  public void init(LinuxContainerExecutor lce) throws IOException {
+    initConfig();
+
     // mount cgroups if requested
     if (cgroupMount && cgroupMountPath != null) {
       ArrayList cgroupKVs = new ArrayList();
@@ -158,14 +179,42 @@ private void updateCgroup(String controller, String groupName, String param,
     }
   }
 
-  private void deleteCgroup(String controller, String groupName) {
-    String path = pathForCgroup(controller, groupName);
+  /**
+   * Deletes the cgroup at the given path, retrying every 20ms until the
+   * delete succeeds or the configured delete timeout has elapsed.
+   *
+   * @param cgroupPath path of the cgroup to delete
+   * @return true if the cgroup was deleted, false otherwise
+   */
+  @VisibleForTesting
+  boolean deleteCgroup(String cgroupPath) {
+    boolean deleted;
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("deleteCgroup: " + cgroupPath);
+    }
 
-    LOG.debug("deleteCgroup: " + path);
+    long start = clock.getTime();
+    do {
+      deleted = new File(cgroupPath).delete();
+      if (!deleted) {
+        try {
+          Thread.sleep(20);
+        } catch (InterruptedException ex) {
+          // Restore the interrupt status and give up retrying so that the
+          // caller can observe the interruption.
+          Thread.currentThread().interrupt();
+          break;
+        }
+      }
+    } while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
 
-    if (! new File(path).delete()) {
-      LOG.warn("Unable to delete cgroup at: " + path);
+    if (!deleted) {
+      LOG.warn("Unable to delete cgroup at: " + cgroupPath +
+          ", tried to delete for " + deleteCgroupTimeout + "ms");
     }
+
+    return deleted;
   }
 
   /*
@@ -185,21 +234,8 @@ private void setupLimits(ContainerId containerId,
   }
 
   private void clearLimits(ContainerId containerId) {
-    String containerName = containerId.toString();
-
-    // Based on testing, ApplicationMaster executables don't terminate until
-    // a little after the container appears to have finished.  Therefore, we
-    // wait a short bit for the cgroup to become empty before deleting it.
-    if (containerId.getId() == 1) {
-      try {
-        Thread.sleep(500);
-      } catch (InterruptedException e) {
-        // not a problem, continue anyway
-      }
-    }
-
     if (isCpuWeightEnabled()) {
-      deleteCgroup(CONTROLLER_CPU, containerName);
+      deleteCgroup(pathForCgroup(CONTROLLER_CPU, containerId.toString()));
     }
   }
 
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
new file mode 100644
index 0000000..55caffc
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.nodemanager.util;
+
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.util.Clock;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.util.UUID;
+import java.util.concurrent.CountDownLatch;
+
+/**
+ * Tests the cgroup deletion retry logic of
+ * {@link CgroupsLCEResourcesHandler}.
+ */
+public class TestCgroupsLCEResourcesHandler {
+
+  /** Manually advanced clock used to control the delete timeout. */
+  static class MockClock implements Clock {
+    // volatile: advanced by a helper thread while deleteCgroup polls it
+    volatile long time;
+    @Override
+    public long getTime() {
+      return time;
+    }
+  }
+
+  @Test
+  public void testDeleteCgroup() throws Exception {
+    final MockClock clock = new MockClock();
+    clock.time = System.currentTimeMillis();
+    CgroupsLCEResourcesHandler handler = new CgroupsLCEResourcesHandler();
+    handler.setConf(new YarnConfiguration());
+    handler.initConfig();
+    handler.clock = clock;
+
+    // file exists: deleted on the first attempt
+    File file = new File("target", UUID.randomUUID().toString());
+    new FileOutputStream(file).close();
+    Assert.assertTrue(handler.deleteCgroup(file.getPath()));
+
+    // file does not exist: deletion keeps failing until the clock passes
+    // the timeout
+    final CountDownLatch latch = new CountDownLatch(1);
+    new Thread() {
+      @Override
+      public void run() {
+        latch.countDown();
+        try {
+          Thread.sleep(200);
+        } catch (InterruptedException ex) {
+          //NOP
+        }
+        clock.time += YarnConfiguration.
+            DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT;
+      }
+    }.start();
+    latch.await();
+    file = new File("target", UUID.randomUUID().toString());
+    Assert.assertFalse(handler.deleteCgroup(file.getPath()));
+  }
+
+}