Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java (revision 312e57b95477ec95e6735f5721c646ad1df019f8) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java (revision 79476aff10258fae21c2fd1e746e55c9bae8b228) @@ -27,6 +27,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Set; import java.util.TreeSet; @@ -66,8 +67,12 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.server.MiniYARNCluster; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; +import org.apache.hadoop.yarn.state.DebugStateMachineDecoratorImpl; import org.apache.hadoop.yarn.util.Records; import org.junit.After; import org.junit.Assert; @@ -87,6 +92,8 @@ @Before public void setup() throws YarnException, IOException { + // Turn on state tracking + ContainerImpl.enableDebugging(); // start minicluster conf = new YarnConfiguration(); yarnCluster = @@ -365,46 +372,66 @@ Records.newRecord(ContainerLaunchContext.class); if (Shell.WINDOWS) { clc.setCommands( - Arrays.asList("ping", "-n", "100", "127.0.0.1", ">nul")); + Arrays.asList("ping", "-n", "10000000", "127.0.0.1", ">nul")); } else { - clc.setCommands(Arrays.asList("sleep", "10")); + clc.setCommands(Arrays.asList("sleep", "1000000")); } clc.setTokens(securityTokens); try { nmClient.startContainer(container, clc); } catch (YarnException e) { - throw (AssertionError) - (new AssertionError("Exception is not expected: " + e).initCause(e)); + throw new AssertionError("Exception is not expected ", e); } + List array = Collections.singletonList(-1000); // leave one container unclosed if (++i < size) { // NodeManager may still need some time to make the container started testGetContainerStatus(container, i, ContainerState.RUNNING, "", - Arrays.asList(new Integer[] {-1000})); + array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 1); // Test increase container API and make sure requests can reach NM testIncreaseContainerResource(container); testRestartContainer(container.getId()); testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Restarted", Arrays.asList(new Integer[] {-1000})); + "will be Restarted", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 2); if (i % 2 == 0) { testReInitializeContainer(container.getId(), clc, false); testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Re-initialized", Arrays.asList(new Integer[] {-1000})); + "will be Re-initialized", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 3); + testRollbackContainer(container.getId(), false); testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Rolled-back", Arrays.asList(new Integer[] {-1000})); + "will be Rolled-back", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 4); + testCommitContainer(container.getId(), true); testReInitializeContainer(container.getId(), clc, false); testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Re-initialized", Arrays.asList(new Integer[] {-1000})); + "will be Re-initialized", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 5); testCommitContainer(container.getId(), false); } else { testReInitializeContainer(container.getId(), clc, true); testGetContainerStatus(container, i, ContainerState.RUNNING, - "will be Re-initialized", Arrays.asList(new Integer[] {-1000})); + "will be Re-initialized", array); + waitForContainerTransitionCount(container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState.RUNNING, 3); testRollbackContainer(container.getId(), true); testCommitContainer(container.getId(), true); } @@ -413,8 +440,7 @@ nmClient.stopContainer(container.getId(), container.getNodeId()); } catch (YarnException e) { throw (AssertionError) - (new AssertionError("Exception is not expected: " + e) - .initCause(e)); + (new AssertionError("Exception is not expected: " + e, e)); } // getContainerStatus can be called after stopContainer @@ -422,22 +448,54 @@ // O is possible if CLEANUP_CONTAINER is executed too late // -105 is possible if the container is not terminated but killed testGetContainerStatus(container, i, ContainerState.COMPLETE, - "Container killed by the ApplicationMaster.", Arrays.asList( - new Integer[] {ContainerExitStatus.KILLED_BY_APPMASTER, - ContainerExitStatus.SUCCESS})); + "Container killed by the ApplicationMaster.", + Arrays.asList( + ContainerExitStatus.KILLED_BY_APPMASTER, + ContainerExitStatus.SUCCESS)); } catch (YarnException e) { // The exception is possible because, after the container is stopped, // it may be removed from NM's context. if (!e.getMessage() .contains("was recently stopped on node manager")) { throw (AssertionError) - (new AssertionError("Exception is not expected: " + e).initCause( - e)); + (new AssertionError("Exception is not expected: ", e)); } } } } } + + /** + * Wait until the container reaches a state N times. + * @param container container to watch + * @param state state to test + * @param transitions the number N above + * @throws YarnException This happens if the test times out while waiting + */ + private void waitForContainerTransitionCount( + Container container, + org.apache.hadoop.yarn.server.nodemanager. + containermanager.container.ContainerState state, long transitions) + throws YarnException { + long transitionCount = -1; + do { + if (transitionCount != -1) { + try { + Thread.sleep(10); + } catch (InterruptedException e) { + throw new YarnException( + "Timeout at transition count:" + transitionCount, e); + } + } + DebugStateMachineDecoratorImpl< + org.apache.hadoop.yarn.server.nodemanager.containermanager. + container.ContainerState, + ContainerEventType, ContainerEvent> debugInfo = + ContainerImpl.getDebugInfo(container.getId()); + transitionCount = debugInfo. + getTransitionCounter(state); + } while (transitionCount != transitions); + } private void sleep(int sleepTime) { try { Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/DebugStateMachineDecoratorImpl.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/DebugStateMachineDecoratorImpl.java (revision 79476aff10258fae21c2fd1e746e55c9bae8b228) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/DebugStateMachineDecoratorImpl.java (revision 79476aff10258fae21c2fd1e746e55c9bae8b228) @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.state; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.HashMap; +import java.util.Map; + +/** + * Decorator object to keep track of accumulated state transitions. + * @param The state of the entity. + * @param The external eventType to be handled. + * @param The event object. + */ +public class DebugStateMachineDecoratorImpl, + EVENTTYPE extends Enum, EVENT> + implements StateMachine { + private static final Log LOG = + LogFactory.getLog(DebugStateMachineDecoratorImpl.class); + private StateMachine internal; + private Map transitionCounter = new HashMap<>(); + + public DebugStateMachineDecoratorImpl( + StateMachine wrapped) { + internal = wrapped; + } + + @Override + public synchronized STATE getCurrentState() { + return internal.getCurrentState(); + } + + @Override + public STATE doTransition(EVENTTYPE eventType, EVENT event) throws InvalidStateTransitionException { + STATE previous = getCurrentState(); + STATE current = internal.doTransition(eventType, event); + if (previous != current) { + long sum = transitionCounter.compute(current, + (state, count) -> count == null ? 1 : count + 1); + LOG.info("***** Transition from " + previous + " to " + current); + } + return current; + } + + /** + * Get the current number of state transitions. + * This is useful to check, if an event has occurred in unit tests. + * @param state Return the total transitions to this state + * @return Number of total transitions to the state specified. + */ + public synchronized long getTransitionCounter(STATE state) { + Long ret = transitionCounter.get(state); + return ret != null ? ret : 0; + } +} Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/StateMachine.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/StateMachine.java (revision 312e57b95477ec95e6735f5721c646ad1df019f8) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/StateMachine.java (revision 79476aff10258fae21c2fd1e746e55c9bae8b228) @@ -26,7 +26,7 @@ public interface StateMachine , EVENTTYPE extends Enum, EVENT> { - public STATE getCurrentState(); - public STATE doTransition(EVENTTYPE eventType, EVENT event) + STATE getCurrentState(); + STATE doTransition(EVENTTYPE eventType, EVENT event) throws InvalidStateTransitionException; } Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/StateMachineFactory.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/StateMachineFactory.java (revision 312e57b95477ec95e6735f5721c646ad1df019f8) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/state/StateMachineFactory.java (revision 79476aff10258fae21c2fd1e746e55c9bae8b228) @@ -50,7 +50,7 @@ private final TransitionsListNode transitionsListNode; private Map>> stateMachineTable; + Transition>> stateMachineTable; private STATE defaultInitialState; @@ -69,8 +69,8 @@ this.stateMachineTable = null; } - private StateMachineFactory - (StateMachineFactory that, + private StateMachineFactory( + StateMachineFactory that, ApplicableTransition t) { this.defaultInitialState = that.defaultInitialState; this.transitionsListNode @@ -79,8 +79,8 @@ this.stateMachineTable = null; } - private StateMachineFactory - (StateMachineFactory that, + private StateMachineFactory( + StateMachineFactory that, boolean optimized) { this.defaultInitialState = that.defaultInitialState; this.transitionsListNode = that.transitionsListNode; @@ -102,8 +102,8 @@ final ApplicableTransition transition; final TransitionsListNode next; - TransitionsListNode - (ApplicableTransition transition, + TransitionsListNode( + ApplicableTransition transition, TransitionsListNode next) { this.transition = transition; this.next = next; @@ -127,8 +127,8 @@ } @Override - public void apply - (StateMachineFactory subject) { + public void apply( + StateMachineFactory subject) { Map> transitionMap = subject.stateMachineTable.get(preState); if (transitionMap == null) { @@ -282,13 +282,14 @@ /** * Effect a transition due to the effecting stimulus. - * @param state current state + * @param operand operand to work on + * @param oldState current state * @param eventType trigger to initiate the transition - * @param cause causal eventType context + * @param event causal eventType context * @return transitioned state */ - private STATE doTransition - (OPERAND operand, STATE oldState, EVENTTYPE eventType, EVENT event) + private STATE doTransition( + OPERAND operand, STATE oldState, EVENTTYPE eventType, EVENT event) throws InvalidStateTransitionException { // We can assume that stateMachineTable is non-null because we call // maybeMakeStateMachineTable() when we build an InnerStateMachine , @@ -445,14 +446,14 @@ @Override public synchronized STATE doTransition(EVENTTYPE eventType, EVENT event) throws InvalidStateTransitionException { - currentState = StateMachineFactory.this.doTransition - (operand, currentState, eventType, event); + currentState = StateMachineFactory.this.doTransition( + operand, currentState, eventType, event); return currentState; } } /** - * Generate a graph represents the state graph of this StateMachine + * Generate a graph represents the state graph of this StateMachine. * @param name graph name * @return Graph object generated */ @@ -463,7 +464,8 @@ for (STATE startState : stateMachineTable.keySet()) { Map> transitions = stateMachineTable.get(startState); - for (Entry> entry : + for ( + Entry> entry : transitions.entrySet()) { Transition transition = entry.getValue(); if (transition instanceof StateMachineFactory.SingleInternalArc) { Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java (revision 312e57b95477ec95e6735f5721c646ad1df019f8) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java (revision 79476aff10258fae21c2fd1e746e55c9bae8b228) @@ -30,6 +30,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -84,6 +85,7 @@ import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerStatus; import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher; import org.apache.hadoop.yarn.server.utils.BuilderUtils; +import org.apache.hadoop.yarn.state.DebugStateMachineDecoratorImpl; import org.apache.hadoop.yarn.state.InvalidStateTransitionException; import org.apache.hadoop.yarn.state.MultipleArcTransition; import org.apache.hadoop.yarn.state.SingleArcTransition; @@ -230,7 +232,18 @@ this.containerRetryContext = configureRetryContext( conf, launchContext, this.containerId); this.remainingRetryAttempts = this.containerRetryContext.getMaxRetries(); - stateMachine = stateMachineFactory.make(this); + if (debugInfo != null) { + // Unit tests may get notification about state transition counts + // using this object. This applies only, if they run in the same process. + DebugStateMachineDecoratorImpl< + ContainerState, ContainerEventType, ContainerEvent> debug = + new DebugStateMachineDecoratorImpl<>( + stateMachineFactory.make(this)); + debugInfo.put(containerId, debug); + stateMachine = debug; + } else { + stateMachine = stateMachineFactory.make(this); + } this.context = context; this.resourceSet = new ResourceSet(); } @@ -523,6 +536,28 @@ private final StateMachine stateMachine; + @VisibleForTesting + private static Map> debugInfo = null; + + /** + * Collect state transition counters. + */ + public static void enableDebugging() { + debugInfo = new ConcurrentHashMap<>(); + } + + /** + * Return debug information. + * @param id Container id to return information about + * @return Debug information object + */ + public static DebugStateMachineDecoratorImpl< + ContainerState, ContainerEventType, ContainerEvent> + getDebugInfo(ContainerId id) { + return debugInfo.get(id); + } + public org.apache.hadoop.yarn.api.records.ContainerState getCurrentState() { switch (stateMachine.getCurrentState()) { case NEW: