Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java (revision 312e57b95477ec95e6735f5721c646ad1df019f8) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java (revision 42eebff540abe4a28e1eb868be4f5cda5919a8f9) @@ -27,6 +27,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Set; import java.util.TreeSet; @@ -66,8 +67,12 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.MiniYARNCluster; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; +import org.apache.hadoop.yarn.state.DebugStateMachineDecoratorImpl; import org.apache.hadoop.yarn.util.Records; import org.junit.After; import org.junit.Assert; @@ -87,6 +92,8 @@ @Before public void setup() throws YarnException, IOException { + // Turn on state tracking + ContainerImpl.enableDebugging(); // start minicluster conf = new YarnConfiguration(); yarnCluster = @@ -287,7 +294,7 @@ return containers; } - private void testContainerManagement(NMClientImpl nmClient, + private void testContainerManagement(NMClientImpl client, Set containers) throws YarnException, IOException { int size = containers.size(); int i = 0; @@ -295,7 +302,7 @@ // getContainerStatus shouldn't be called before startContainer, // otherwise, NodeManager cannot find the container try { - nmClient.getContainerStatus(container.getId(), container.getNodeId()); + client.getContainerStatus(container.getId(), container.getNodeId()); fail("Exception is expected"); } catch (YarnException e) { assertTrue("The thrown exception is not expected", @@ -305,7 +312,7 @@ // increaseContainerResource shouldn't be called before startContainer, // otherwise, NodeManager cannot find the container try { - nmClient.increaseContainerResource(container); + client.increaseContainerResource(container); fail("Exception is expected"); } catch (YarnException e) { assertTrue("The thrown exception is not expected", @@ -315,7 +322,7 @@ // restart shouldn't be called before startContainer, // otherwise, NodeManager cannot find the container try { - nmClient.restartContainer(container.getId()); + client.restartContainer(container.getId()); fail("Exception is expected"); } catch (YarnException e) { assertTrue("The thrown exception is not expected", @@ -325,7 +332,7 @@ // rollback shouldn't be called before startContainer, // otherwise, NodeManager cannot find the container try { - nmClient.rollbackLastReInitialization(container.getId()); + client.rollbackLastReInitialization(container.getId()); fail("Exception is expected"); } catch (YarnException e) { assertTrue("The thrown exception is not expected", @@ -335,7 +342,7 @@ // commit shouldn't be called before startContainer, // otherwise, NodeManager cannot find the container try { - nmClient.commitLastReInitialization(container.getId()); + client.commitLastReInitialization(container.getId()); fail("Exception is expected"); } catch (YarnException e) { assertTrue("The thrown exception is not expected", @@ -345,14 +352,12 @@ // stopContainer shouldn't be called before startContainer, // otherwise, an exception will be thrown try { - nmClient.stopContainer(container.getId(), container.getNodeId()); + client.stopContainer(container.getId(), container.getNodeId()); fail("Exception is expected"); } catch (YarnException e) { if (!e.getMessage() .contains("is not handled by this NodeManager")) { - throw (AssertionError) - (new AssertionError("Exception is not expected: " + e).initCause( - e)); + throw new AssertionError("Exception is not expected: ", e); } } @@ -365,78 +370,136 @@ Records.newRecord(ContainerLaunchContext.class); if (Shell.WINDOWS) { clc.setCommands( - Arrays.asList("ping", "-n", "100", "127.0.0.1", ">nul")); + Arrays.asList("ping", "-n", "10000000", "127.0.0.1", ">nul")); } else { - clc.setCommands(Arrays.asList("sleep", "10")); + clc.setCommands(Arrays.asList("sleep", "1000000")); } clc.setTokens(securityTokens); try { - nmClient.startContainer(container, clc); + client.startContainer(container, clc); } catch (YarnException e) { - throw (AssertionError) - (new AssertionError("Exception is not expected: " + e).initCause(e)); + throw new AssertionError("Exception is not expected ", e); } + List array = Collections.singletonList(-1000); // leave one container unclosed if (++i < size) { - // NodeManager may still need some time to make the container started - testGetContainerStatus(container, i, ContainerState.RUNNING, "", - Arrays.asList(new Integer[] {-1000})); - // Test increase container API and make sure requests can reach NM - testIncreaseContainerResource(container); + testContainer(client, i, container, clc, array); + + } + } + } + + private void testContainer(NMClientImpl client, int i, Container container, + ContainerLaunchContext clc, List array) + throws YarnException, IOException { + // NodeManager may still need some time to make the container started + testGetContainerStatus(container, i, ContainerState.RUNNING, "", + array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 1); + // Test increase container API and make sure requests can reach NM + testIncreaseContainerResource(container); - testRestartContainer(container.getId()); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Restarted", Arrays.asList(new Integer[] {-1000})); + testRestartContainer(container.getId()); + testGetContainerStatus(container, i, ContainerState.RUNNING, + "will be Restarted", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 2); - if (i % 2 == 0) { - testReInitializeContainer(container.getId(), clc, false); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Re-initialized", Arrays.asList(new Integer[] {-1000})); - testRollbackContainer(container.getId(), false); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Rolled-back", Arrays.asList(new Integer[] {-1000})); - testCommitContainer(container.getId(), true); - testReInitializeContainer(container.getId(), clc, false); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Re-initialized", Arrays.asList(new Integer[] {-1000})); - testCommitContainer(container.getId(), false); - } else { - testReInitializeContainer(container.getId(), clc, true); - testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Re-initialized", Arrays.asList(new Integer[] {-1000})); - testRollbackContainer(container.getId(), true); - testCommitContainer(container.getId(), true); - } + if (i % 2 == 0) { + testReInitializeContainer(container.getId(), clc, false); + testGetContainerStatus(container, i, ContainerState.RUNNING, + "will be Re-initialized", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 3); + + testRollbackContainer(container.getId(), false); + testGetContainerStatus(container, i, ContainerState.RUNNING, + "will be Rolled-back", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 4); + + testCommitContainer(container.getId(), true); + testReInitializeContainer(container.getId(), clc, false); + testGetContainerStatus(container, i, ContainerState.RUNNING, + "will be Re-initialized", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 5); + testCommitContainer(container.getId(), false); + } else { + testReInitializeContainer(container.getId(), clc, true); + testGetContainerStatus(container, i, ContainerState.RUNNING, + "will be Re-initialized", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 3); + testRollbackContainer(container.getId(), true); + testCommitContainer(container.getId(), true); + } - try { - nmClient.stopContainer(container.getId(), container.getNodeId()); - } catch (YarnException e) { - throw (AssertionError) - (new AssertionError("Exception is not expected: " + e) - .initCause(e)); - } + try { + client.stopContainer(container.getId(), container.getNodeId()); + } catch (YarnException e) { + throw (AssertionError) + (new AssertionError("Exception is not expected: " + e, e)); + } - // getContainerStatus can be called after stopContainer - try { - // O is possible if CLEANUP_CONTAINER is executed too late - // -105 is possible if the container is not terminated but killed - testGetContainerStatus(container, i, ContainerState.COMPLETE, - "Container killed by the ApplicationMaster.", Arrays.asList( - new Integer[] {ContainerExitStatus.KILLED_BY_APPMASTER, - ContainerExitStatus.SUCCESS})); - } catch (YarnException e) { - // The exception is possible because, after the container is stopped, - // it may be removed from NM's context. - if (!e.getMessage() - .contains("was recently stopped on node manager")) { - throw (AssertionError) - (new AssertionError("Exception is not expected: " + e).initCause( - e)); - } - } - } - } + // getContainerStatus can be called after stopContainer + try { + // O is possible if CLEANUP_CONTAINER is executed too late + // -105 is possible if the container is not terminated but killed + testGetContainerStatus(container, i, ContainerState.COMPLETE, + "Container killed by the ApplicationMaster.", + Arrays.asList( + ContainerExitStatus.KILLED_BY_APPMASTER, + ContainerExitStatus.SUCCESS)); + } catch (YarnException e) { + // The exception is possible because, after the container is stopped, + // it may be removed from NM's context. + if (!e.getMessage() + .contains("was recently stopped on node manager")) { + throw (AssertionError) + (new AssertionError("Exception is not expected: ", e)); + } + } + } + + /** + * Wait until the container reaches a state N times. + * @param container container to watch + * @param state state to test + * @param transitions the number N above + * @throws YarnException This happens if the test times out while waiting + */ + private void waitForContainerTransitionCount( + Container container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState state, long transitions) + throws YarnException { + long transitionCount = -1; + do { + if (transitionCount != -1) { + try { + Thread.sleep(10); + } catch (InterruptedException e) { + throw new YarnException( + "Timeout at transition count:" + transitionCount, e); + } + } + DebugStateMachineDecoratorImpl< + org.apache.hadoop.yarn.server.nodemanager.containermanager. + container.ContainerState, + ContainerEventType, ContainerEvent> debugInfo = + ContainerImpl.getDebugInfo(container.getId()); + transitionCount = debugInfo. + getTransitionCounter(state); + } while (transitionCount != transitions); } private void sleep(int sleepTime) { Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/DebugStateMachineDecoratorImpl.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/DebugStateMachineDecoratorImpl.java (revision 42eebff540abe4a28e1eb868be4f5cda5919a8f9) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/DebugStateMachineDecoratorImpl.java (revision 42eebff540abe4a28e1eb868be4f5cda5919a8f9) @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.state; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.HashMap; +import java.util.Map; + +/** + * Decorator object to keep track of accumulated state transitions. + * @param The state of the entity. + * @param The external eventType to be handled. + * @param The event object. + */ +public class DebugStateMachineDecoratorImpl, + EVENTTYPE extends Enum, EVENT> + implements StateMachine { + private static final Log LOG = + LogFactory.getLog(DebugStateMachineDecoratorImpl.class); + private StateMachine internal; + private Map transitionCounter = new HashMap<>(); + + public DebugStateMachineDecoratorImpl( + StateMachine wrapped) { + internal = wrapped; + } + + @Override + public synchronized STATE getCurrentState() { + return internal.getCurrentState(); + } + + @Override + public STATE doTransition(EVENTTYPE eventType, EVENT event) + throws InvalidStateTransitionException { + STATE previous = getCurrentState(); + STATE current = internal.doTransition(eventType, event); + if (previous != current) { + long sum = transitionCounter.compute(current, + (state, count) -> count == null ? 1 : count + 1); + LOG.info("***** Transition from " + previous + " to " + current + + "sum:" + sum); + } + return current; + } + + /** + * Get the current number of state transitions. + * This is useful to check, if an event has occurred in unit tests. + * @param state Return the total transitions to this state + * @return Number of total transitions to the state specified. + */ + public synchronized long getTransitionCounter(STATE state) { + Long ret = transitionCounter.get(state); + return ret != null ? ret : 0; + } +} Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/StateMachine.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/StateMachine.java (revision 312e57b95477ec95e6735f5721c646ad1df019f8) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/StateMachine.java (revision 42eebff540abe4a28e1eb868be4f5cda5919a8f9) @@ -26,7 +26,7 @@ public interface StateMachine , EVENTTYPE extends Enum, EVENT> { - public STATE getCurrentState(); - public STATE doTransition(EVENTTYPE eventType, EVENT event) + STATE getCurrentState(); + STATE doTransition(EVENTTYPE eventType, EVENT event) throws InvalidStateTransitionException; } Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java (revision 312e57b95477ec95e6735f5721c646ad1df019f8) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java (revision 42eebff540abe4a28e1eb868be4f5cda5919a8f9) @@ -30,6 +30,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -84,6 +85,7 @@ import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerStatus; import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher; import org.apache.hadoop.yarn.server.utils.BuilderUtils; +import org.apache.hadoop.yarn.state.DebugStateMachineDecoratorImpl; import org.apache.hadoop.yarn.state.InvalidStateTransitionException; import org.apache.hadoop.yarn.state.MultipleArcTransition; import org.apache.hadoop.yarn.state.SingleArcTransition; @@ -230,7 +232,18 @@ this.containerRetryContext = configureRetryContext( conf, launchContext, this.containerId); this.remainingRetryAttempts = this.containerRetryContext.getMaxRetries(); - stateMachine = stateMachineFactory.make(this); + if (debugInfo != null) { + // Unit tests may get notification about state transition counts + // using this object. This applies only, if they run in the same process. + DebugStateMachineDecoratorImpl< + ContainerState, ContainerEventType, ContainerEvent> debug = + new DebugStateMachineDecoratorImpl<>( + stateMachineFactory.make(this)); + debugInfo.put(containerId, debug); + stateMachine = debug; + } else { + stateMachine = stateMachineFactory.make(this); + } this.context = context; this.resourceSet = new ResourceSet(); } @@ -523,6 +536,28 @@ private final StateMachine stateMachine; + @VisibleForTesting + private static Map> debugInfo = null; + + /** + * Collect state transition counters. + */ + public static void enableDebugging() { + debugInfo = new ConcurrentHashMap<>(); + } + + /** + * Return debug information. + * @param id Container id to return information about + * @return Debug information object + */ + public static DebugStateMachineDecoratorImpl< + ContainerState, ContainerEventType, ContainerEvent> + getDebugInfo(ContainerId id) { + return debugInfo.get(id); + } + public org.apache.hadoop.yarn.api.records.ContainerState getCurrentState() { switch (stateMachine.getCurrentState()) { case NEW: