Index: hadoop-tools/hadoop-sls/src/main/data/2jobs2min-rumen-jh.json
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- hadoop-tools/hadoop-sls/src/main/data/2jobs2min-rumen-jh.json (revision a4f9c7c9247801dd37beec6fc195622af1b884ad)
+++ hadoop-tools/hadoop-sls/src/main/data/2jobs2min-rumen-jh.json (revision 446be64ef209dc99f38a79e1441432a360403960)
@@ -4869,7 +4869,7 @@
     "hadoop.security.groups.cache.secs" : "300",
     "ipc.client.connect.max.retries" : "10",
     "dfs.namenode.delegation.key.update-interval" : "86400000",
-    "yarn.nodemanager.process-kill-wait.ms" : "2000",
+    "yarn.nodemanager.process-kill-wait.ms" : "5000",
     "yarn.application.classpath" : "$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$YARN_HOME/*,$YARN_HOME/lib/*",
     "yarn.app.mapreduce.client.max-retries" : "3",
     "dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction" : "0.75f",
@@ -9950,7 +9950,7 @@
     "hadoop.security.groups.cache.secs" : "300",
     "ipc.client.connect.max.retries" : "10",
     "dfs.namenode.delegation.key.update-interval" : "86400000",
-    "yarn.nodemanager.process-kill-wait.ms" : "2000",
+    "yarn.nodemanager.process-kill-wait.ms" : "5000",
     "yarn.application.classpath" : "$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$YARN_HOME/*,$YARN_HOME/lib/*",
     "yarn.app.mapreduce.client.max-retries" : "3",
     "dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction" : "0.75f",
Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java (revision a4f9c7c9247801dd37beec6fc195622af1b884ad)
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java (revision 446be64ef209dc99f38a79e1441432a360403960)
@@ -1886,7 +1886,7 @@
   public static final String NM_PROCESS_KILL_WAIT_MS =
       NM_PREFIX + "process-kill-wait.ms";
   public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS =
-      2000;
+      5000;
 
   /** Max time to wait to establish a connection to RM */
   public static final String RESOURCEMANAGER_CONNECT_MAX_WAIT_MS =
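For reference, the kill-wait value defined above is resolved through the standard Configuration lookup, so the new 5000 ms default only applies where yarn-site.xml does not override yarn.nodemanager.process-kill-wait.ms. A minimal sketch of that lookup (the class name KillWaitExample is illustrative and not part of this patch):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class KillWaitExample {
  public static void main(String[] args) {
    // Resolve the timeout the same way NodeManager-side code would:
    // the explicit setting if present, otherwise the compiled-in default.
    Configuration conf = new YarnConfiguration();
    long maxKillWaitMs = conf.getLong(
        YarnConfiguration.NM_PROCESS_KILL_WAIT_MS,
        YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS);
    System.out.println("process-kill-wait.ms = " + maxKillWaitMs);
  }
}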
Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java (revision a4f9c7c9247801dd37beec6fc195622af1b884ad)
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java (revision 446be64ef209dc99f38a79e1441432a360403960)
@@ -27,6 +27,7 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
@@ -67,6 +68,7 @@
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.MiniYARNCluster;
+import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
 import org.apache.hadoop.yarn.util.Records;
@@ -90,6 +92,9 @@
   public void setup() throws YarnException, IOException {
     // start minicluster
     conf = new YarnConfiguration();
+    // Turn on state tracking
+    conf.set(YarnConfiguration.NM_CONTAINER_STATE_TRANSITION_LISTENERS,
+        NodeManager.DebugSumContainerStateListener.class.getName());
     yarnCluster =
         new MiniYARNCluster(TestAMRMClient.class.getName(), nodeCount, 1, 1);
     yarnCluster.init(conf);
@@ -290,7 +295,7 @@
     return containers;
   }
 
-  private void testContainerManagement(NMClientImpl nmClient,
+  private void testContainerManagement(NMClientImpl client,
       Set<Container> containers) throws YarnException, IOException {
     int size = containers.size();
     int i = 0;
@@ -298,7 +303,7 @@
       // getContainerStatus shouldn't be called before startContainer,
       // otherwise, NodeManager cannot find the container
       try {
-        nmClient.getContainerStatus(container.getId(), container.getNodeId());
+        client.getContainerStatus(container.getId(), container.getNodeId());
         fail("Exception is expected");
       } catch (YarnException e) {
         assertTrue("The thrown exception is not expected",
@@ -307,7 +312,7 @@
       // upadateContainerResource shouldn't be called before startContainer,
       // otherwise, NodeManager cannot find the container
       try {
-        nmClient.updateContainerResource(container);
+        client.updateContainerResource(container);
         fail("Exception is expected");
       } catch (YarnException e) {
         assertTrue("The thrown exception is not expected",
@@ -317,7 +322,7 @@
       // restart shouldn't be called before startContainer,
       // otherwise, NodeManager cannot find the container
       try {
-        nmClient.restartContainer(container.getId());
+        client.restartContainer(container.getId());
         fail("Exception is expected");
       } catch (YarnException e) {
         assertTrue("The thrown exception is not expected",
@@ -327,7 +332,7 @@
       // rollback shouldn't be called before startContainer,
       // otherwise, NodeManager cannot find the container
       try {
-        nmClient.rollbackLastReInitialization(container.getId());
+        client.rollbackLastReInitialization(container.getId());
         fail("Exception is expected");
       } catch (YarnException e) {
         assertTrue("The thrown exception is not expected",
@@ -337,7 +342,7 @@
       // commit shouldn't be called before startContainer,
       // otherwise, NodeManager cannot find the container
       try {
-        nmClient.commitLastReInitialization(container.getId());
+        client.commitLastReInitialization(container.getId());
         fail("Exception is expected");
       } catch (YarnException e) {
         assertTrue("The thrown exception is not expected",
@@ -347,14 +352,12 @@
       // stopContainer shouldn't be called before startContainer,
       // otherwise, an exception will be thrown
       try {
-        nmClient.stopContainer(container.getId(), container.getNodeId());
+        client.stopContainer(container.getId(), container.getNodeId());
         fail("Exception is expected");
       } catch (YarnException e) {
         if (!e.getMessage()
               .contains("is not handled by this NodeManager")) {
-          throw (AssertionError)
-            (new AssertionError("Exception is not expected: " + e).initCause(
-              e));
+          throw new AssertionError("Exception is not expected: ", e);
         }
       }
 
@@ -367,78 +370,132 @@
           Records.newRecord(ContainerLaunchContext.class);
       if (Shell.WINDOWS) {
         clc.setCommands(
Arrays.asList("ping", "-n", "100", "127.0.0.1", ">nul")); + Arrays.asList("ping", "-n", "10000000", "127.0.0.1", ">nul")); } else { - clc.setCommands(Arrays.asList("sleep", "10")); + clc.setCommands(Arrays.asList("sleep", "1000000")); } clc.setTokens(securityTokens); try { - nmClient.startContainer(container, clc); + client.startContainer(container, clc); } catch (YarnException e) { - throw (AssertionError) - (new AssertionError("Exception is not expected: " + e).initCause(e)); + throw new AssertionError("Exception is not expected ", e); } + List array = Collections.singletonList(-1000); // leave one container unclosed if (++i < size) { - // NodeManager may still need some time to make the container started - testGetContainerStatus(container, i, ContainerState.RUNNING, "", - Arrays.asList(new Integer[] {-1000})); - // Test increase container API and make sure requests can reach NM - testIncreaseContainerResource(container); + testContainer(client, i, container, clc, array); + + } + } + } + + private void testContainer(NMClientImpl client, int i, Container container, + ContainerLaunchContext clc, List array) + throws YarnException, IOException { + // NodeManager may still need some time to make the container started + testGetContainerStatus(container, i, ContainerState.RUNNING, "", + array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 1); + // Test increase container API and make sure requests can reach NM + testIncreaseContainerResource(container); - testRestartContainer(container.getId()); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Restarted", Arrays.asList(new Integer[] {-1000})); + testRestartContainer(container.getId()); + testGetContainerStatus(container, i, ContainerState.RUNNING, + "will be Restarted", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 2); - if (i % 2 == 0) { - testReInitializeContainer(container.getId(), clc, false); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Re-initialized", Arrays.asList(new Integer[] {-1000})); - testRollbackContainer(container.getId(), false); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Rolled-back", Arrays.asList(new Integer[] {-1000})); - testCommitContainer(container.getId(), true); - testReInitializeContainer(container.getId(), clc, false); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Re-initialized", Arrays.asList(new Integer[] {-1000})); - testCommitContainer(container.getId(), false); - } else { - testReInitializeContainer(container.getId(), clc, true); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Re-initialized", Arrays.asList(new Integer[] {-1000})); - testRollbackContainer(container.getId(), true); - testCommitContainer(container.getId(), true); - } + if (i % 2 == 0) { + testReInitializeContainer(container.getId(), clc, false); + testGetContainerStatus(container, i, ContainerState.RUNNING, + "will be Re-initialized", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. 
+              containermanager.container.ContainerState.RUNNING, 3);
+
+      testRollbackContainer(container.getId(), false);
+      testGetContainerStatus(container, i, ContainerState.RUNNING,
+          "will be Rolled-back", array);
+      waitForContainerTransitionCount(container,
+          org.apache.hadoop.yarn.server.nodemanager.
+              containermanager.container.ContainerState.RUNNING, 4);
+
+      testCommitContainer(container.getId(), true);
+      testReInitializeContainer(container.getId(), clc, false);
+      testGetContainerStatus(container, i, ContainerState.RUNNING,
+          "will be Re-initialized", array);
+      waitForContainerTransitionCount(container,
+          org.apache.hadoop.yarn.server.nodemanager.
+              containermanager.container.ContainerState.RUNNING, 5);
+      testCommitContainer(container.getId(), false);
+    } else {
+      testReInitializeContainer(container.getId(), clc, true);
+      testGetContainerStatus(container, i, ContainerState.RUNNING,
+          "will be Re-initialized", array);
+      waitForContainerTransitionCount(container,
+          org.apache.hadoop.yarn.server.nodemanager.
+              containermanager.container.ContainerState.RUNNING, 3);
+      testRollbackContainer(container.getId(), true);
+      testCommitContainer(container.getId(), true);
+    }
 
-        try {
-          nmClient.stopContainer(container.getId(), container.getNodeId());
-        } catch (YarnException e) {
-          throw (AssertionError)
-            (new AssertionError("Exception is not expected: " + e)
-              .initCause(e));
-        }
+    try {
+      client.stopContainer(container.getId(), container.getNodeId());
+    } catch (YarnException e) {
+      throw (AssertionError)
+          (new AssertionError("Exception is not expected: " + e, e));
+    }
 
-        // getContainerStatus can be called after stopContainer
-        try {
-          // O is possible if CLEANUP_CONTAINER is executed too late
-          // -105 is possible if the container is not terminated but killed
-          testGetContainerStatus(container, i, ContainerState.COMPLETE,
-              "Container killed by the ApplicationMaster.", Arrays.asList(
-                  new Integer[] {ContainerExitStatus.KILLED_BY_APPMASTER,
-                      ContainerExitStatus.SUCCESS}));
-        } catch (YarnException e) {
-          // The exception is possible because, after the container is stopped,
-          // it may be removed from NM's context.
-          if (!e.getMessage()
-                .contains("was recently stopped on node manager")) {
-            throw (AssertionError)
-              (new AssertionError("Exception is not expected: " + e).initCause(
-                e));
-          }
-        }
-      }
-    }
+    // getContainerStatus can be called after stopContainer
+    try {
+      // O is possible if CLEANUP_CONTAINER is executed too late
+      // -105 is possible if the container is not terminated but killed
+      testGetContainerStatus(container, i, ContainerState.COMPLETE,
+          "Container killed by the ApplicationMaster.",
+          Arrays.asList(
+              ContainerExitStatus.KILLED_BY_APPMASTER,
+              ContainerExitStatus.SUCCESS));
+    } catch (YarnException e) {
+      // The exception is possible because, after the container is stopped,
+      // it may be removed from NM's context.
+      if (!e.getMessage()
+          .contains("was recently stopped on node manager")) {
+        throw (AssertionError)
+            (new AssertionError("Exception is not expected: ", e));
+      }
+    }
+  }
+
+  /**
+   * Wait until the container reaches a state N times.
+   * @param container container to watch
+   * @param state state to test
+   * @param transitions the number N above
+   * @throws YarnException This happens if the test times out while waiting
+   */
+  private void waitForContainerTransitionCount(
+      Container container,
+      org.apache.hadoop.yarn.server.nodemanager.
+          containermanager.container.ContainerState state, long transitions)
+      throws YarnException {
+    long transitionCount = -1;
+    do {
+      if (transitionCount != -1) {
+        try {
+          Thread.sleep(10);
+        } catch (InterruptedException e) {
+          throw new YarnException(
+              "Timeout at transition count:" + transitionCount, e);
+        }
+      }
+      NodeManager.DebugSumContainerStateListener listener =
+          NodeManager.DebugSumContainerStateListener.getSingleton();
+      transitionCount = listener.getTransitionCounter(container.getId(), state);
+    } while (transitionCount != transitions);
   }
 
   private void sleep(int sleepTime) {
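One note on the helper just added: waitForContainerTransitionCount polls every 10 ms until the expected count is reached and only gives up if the thread is interrupted, so a missed transition can hang the test. A bounded variant is sketched below; it assumes org.apache.hadoop.test.GenericTestUtils is available on the test classpath and is not part of this patch.

  // Hypothetical bounded variant of waitForContainerTransitionCount:
  // fail the test after 30 seconds instead of spinning indefinitely.
  // Assumes: import org.apache.hadoop.test.GenericTestUtils;
  private void waitForContainerTransitionCountBounded(
      Container container,
      org.apache.hadoop.yarn.server.nodemanager.
          containermanager.container.ContainerState state,
      long transitions) throws Exception {
    GenericTestUtils.waitFor(
        () -> NodeManager.DebugSumContainerStateListener.getSingleton()
            .getTransitionCounter(container.getId(), state) == transitions,
        10,      // poll every 10 ms
        30000);  // time out (and fail) after 30 s
  }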
Returning terminated error"); return ExitCode.TERMINATED.getExitCode(); } Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java (revision a4f9c7c9247801dd37beec6fc195622af1b884ad) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java (revision 446be64ef209dc99f38a79e1441432a360403960) @@ -28,6 +28,8 @@ import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; @@ -147,6 +149,79 @@ public void init(Context context) {} } + /** + * Container State transition listener to debug the sum of transitions. + */ + public static class DebugSumContainerStateListener + implements ContainerStateTransitionListener { + private static final Log LOG = + LogFactory.getLog(DebugSumContainerStateListener.class); + private Map> + transitionCounter = new HashMap<>(); + private static DebugSumContainerStateListener singleton = null; + + public DebugSumContainerStateListener() { + // Currently this is not needed to be thread safe + if (singleton == null) { + singleton = this; + } + } + + /** + * Return singleton + * @return return the single object created of this class + */ + public static DebugSumContainerStateListener getSingleton() { + return singleton; + } + + @Override + public void init(Context context) {} + + @Override + public void preTransition(ContainerImpl op, ContainerState beforeState, + ContainerEvent eventToBeProcessed) {} + + @Override + public void postTransition( + ContainerImpl op, + ContainerState beforeState, + ContainerState afterState, + ContainerEvent processedEvent) { + if (singleton != this) { + singleton.postTransition(op, beforeState, afterState, processedEvent); + } else { + synchronized (this) { + if (beforeState != afterState) { + transitionCounter + .putIfAbsent(op.getContainerId(), new HashMap<>()); + long sum = transitionCounter.get(op.getContainerId()) + .compute(afterState, + (state, count) -> count == null ? 1 : count + 1); + LOG.info("***** " + op.getContainerId() + + " Transition from " + beforeState + + " to " + afterState + + "sum:" + sum); + } + } + } + } + + /** + * Get the current number of state transitions. + * This is useful to check, if an event has occurred in unit tests. + * @param id Container id to check + * @param state Return the overall number of transitions to this state + * @return Number of transitions to the state specified + */ + public synchronized long getTransitionCounter( + ContainerId id, ContainerState state) { + Long ret = singleton.transitionCounter.getOrDefault(id, new HashMap<>()) + .get(state); + return ret != null ? 
Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java (revision a4f9c7c9247801dd37beec6fc195622af1b884ad)
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java (revision 446be64ef209dc99f38a79e1441432a360403960)
@@ -693,6 +693,22 @@
           new DelayedProcessKiller(container, user, processId,
               sleepDelayBeforeSigKill, Signal.KILL, exec).start();
         }
+      } else {
+        // Normally this means that the process was notified about
+        // deactivateContainer above and did not start.
+        // Since we already set the state to RUNNING or REINITIALIZING
+        // we have to send a killed event to continue.
+        LOG.warn("Container clean up before pid file created " + containerIdStr);
+        dispatcher.getEventHandler().handle(
+            new ContainerExitEvent(container.getContainerId(),
+                ContainerEventType.CONTAINER_KILLED_ON_REQUEST,
+                Shell.WINDOWS ? ExitCode.FORCE_KILLED.getExitCode() :
+                    ExitCode.TERMINATED.getExitCode(),
+                "Container terminated before pid file created."));
+        // There is a possibility that the launch grabbed the file name before
+        // the deactivateContainer above but it was slow enough to avoid
+        // getContainerPid. Increasing YarnConfiguration.NM_PROCESS_KILL_WAIT_MS
+        // reduces the likelihood of this race condition and process leak.
       }
     } catch (Exception e) {
       String message =
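The closing comment above refers to the bounded wait for the pid file that cleanup performs before deciding that no process was started. The sketch below is a simplified illustration of that kind of wait, not the verbatim ContainerLaunch code; it assumes ProcessIdFileReader from org.apache.hadoop.yarn.util and shows why raising yarn.nodemanager.process-kill-wait.ms shrinks the window in which a slow launch can leak a process:

  // Illustration only: poll for the pid file for at most the configured
  // kill-wait period; a larger value gives a slow-starting container launch
  // more time to publish its pid before cleanup gives up on finding it.
  // Assumes: org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.Path,
  // org.apache.hadoop.yarn.conf.YarnConfiguration,
  // org.apache.hadoop.yarn.util.ProcessIdFileReader, java.io.IOException.
  private String waitForPid(Path pidFilePath, Configuration conf)
      throws IOException, InterruptedException {
    long maxWaitMs = conf.getLong(
        YarnConfiguration.NM_PROCESS_KILL_WAIT_MS,
        YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS);
    final long sleepMs = 100;
    for (long waited = 0; waited <= maxWaitMs; waited += sleepMs) {
      String pid = ProcessIdFileReader.getProcessId(pidFilePath);
      if (pid != null) {
        return pid;
      }
      Thread.sleep(sleepMs);
    }
    return null; // caller then takes the "terminated before pid file" path
  }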
Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java (revision a4f9c7c9247801dd37beec6fc195622af1b884ad)
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java (revision 446be64ef209dc99f38a79e1441432a360403960)
@@ -118,8 +118,8 @@
         ContainerLaunch launch =
             new ContainerLaunch(context, getConfig(), dispatcher, exec, app,
                 event.getContainer(), dirsHandler, containerManager);
-        containerLauncher.submit(launch);
         running.put(containerId, launch);
+        containerLauncher.submit(launch);
         break;
       case RELAUNCH_CONTAINER:
         app = context.getApplications().get(
@@ -128,16 +128,16 @@
         ContainerRelaunch relaunch =
             new ContainerRelaunch(context, getConfig(), dispatcher, exec, app,
                 event.getContainer(), dirsHandler, containerManager);
-        containerLauncher.submit(relaunch);
         running.put(containerId, relaunch);
+        containerLauncher.submit(relaunch);
         break;
       case RECOVER_CONTAINER:
         app = context.getApplications().get(
             containerId.getApplicationAttemptId().getApplicationId());
         launch = new RecoveredContainerLaunch(context, getConfig(), dispatcher,
             exec, app, event.getContainer(), dirsHandler, containerManager);
-        containerLauncher.submit(launch);
         running.put(containerId, launch);
+        containerLauncher.submit(launch);
         break;
       case RECOVER_PAUSED_CONTAINER:
         // Recovery for paused containers is not supported, thus here