diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java index 2bf9c93..1e638f2 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java @@ -65,6 +65,17 @@ * continue to run even after this client is stopped and till the application * runs at which point ResourceManager will forcefully kill them. *

+ * + *

+ * Note that the blocking APIs ensure the RPC calls to NodeManager + * are executed immediately, and the responses are received before these APIs + * return. However, when {@link #startContainer} or {@link #stopContainer} + * returns, NodeManager may still need some time to make the + * container actually start or stop because of its asynchronous + * implementation. Therefore, {@link #getContainerStatus} is likely to return a + * transient container status if it is executed immediately after + * {@link #startContainer} or {@link #stopContainer}. + *

*/ public class NMClientImpl extends AbstractService implements NMClient { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java index 3b4439e..1e3561a 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java @@ -269,16 +269,24 @@ private void testContainerManagement(NMClientImpl nmClient, // leave one container unclosed if (++i < size) { - try { - ContainerStatus status = nmClient.getContainerStatus(container.getId(), - container.getNodeId(), container.getContainerToken()); - // verify the container is started and in good shape - assertEquals(container.getId(), status.getContainerId()); - assertEquals(ContainerState.RUNNING, status.getState()); - assertEquals("", status.getDiagnostics()); - assertEquals(-1000, status.getExitStatus()); - } catch (YarnRemoteException e) { - fail("Exception is not expected"); + // NodeManager may still need some time to make the container started + while (true) { + try { + ContainerStatus status = nmClient.getContainerStatus + (container.getId(), container.getNodeId(), + container.getContainerToken()); + if (status.getState() == ContainerState.RUNNING) { + assertEquals(container.getId(), status.getContainerId()); + assertEquals("", status.getDiagnostics()); + assertEquals(-1000, status.getExitStatus()); + break; + } + Thread.sleep(100); + } catch (YarnRemoteException e) { + fail("Exception is not expected"); + } catch (InterruptedException e) { + e.printStackTrace(); + } } try { @@ -289,17 +297,27 @@ private void testContainerManagement(NMClientImpl nmClient, } // getContainerStatus can be called after stopContainer - try { - ContainerStatus status = 
nmClient.getContainerStatus( - container.getId(), container.getNodeId(), - container.getContainerToken()); - assertEquals(container.getId(), status.getContainerId()); - assertEquals(ContainerState.RUNNING, status.getState()); - assertTrue("" + i, status.getDiagnostics().contains( - "Container killed by the ApplicationMaster.")); - assertEquals(-1000, status.getExitStatus()); - } catch (YarnRemoteException e) { - fail("Exception is not expected"); + while (true) { + try { + ContainerStatus status = nmClient.getContainerStatus( + container.getId(), container.getNodeId(), + container.getContainerToken()); + // NodeManager may still need some time to ensure the container + // stopped + if (status.getState() == ContainerState.COMPLETE) { + assertEquals(container.getId(), status.getContainerId()); + assertTrue("" + i + ": " + status.getDiagnostics(), + status.getDiagnostics().contains( + "Container killed by the ApplicationMaster.")); + assertEquals(143, status.getExitStatus()); + break; + } + Thread.sleep(100); + } catch (YarnRemoteException e) { + fail("Exception is not expected"); + } catch (InterruptedException e) { + e.printStackTrace(); + } } } }