diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java index 2bf9c93..bcab5a1 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java @@ -65,6 +65,17 @@ * continue to run even after this client is stopped and till the application * runs at which point ResourceManager will forcefully kill them. *

+ * + *

+ * Note that the blocking APIs ensure the RPC calls to NodeManager + * are executed immediately, and the responses are received before these APIs + * return. However, when {@link #startContainer} or {@link #stopContainer} + * returns, NodeManager may still need some time to either start + * or stop the container because of its asynchronous implementation. Therefore, + * {@link #getContainerStatus} is likely to return a transit container status + * if it is executed immediately after {@link #startContainer} or + * {@link #stopContainer}. + *

*/ public class NMClientImpl extends AbstractService implements NMClient { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java index 3b4439e..f0936a3 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java @@ -20,8 +20,8 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; @@ -226,7 +226,7 @@ public void testNMClient() } private void testContainerManagement(NMClientImpl nmClient, - Set containers) throws IOException { + Set containers) throws YarnRemoteException, IOException { int size = containers.size(); int i = 0; for (Container container : containers) { @@ -269,17 +269,9 @@ private void testContainerManagement(NMClientImpl nmClient, // leave one container unclosed if (++i < size) { - try { - ContainerStatus status = nmClient.getContainerStatus(container.getId(), - container.getNodeId(), container.getContainerToken()); - // verify the container is started and in good shape - assertEquals(container.getId(), status.getContainerId()); - assertEquals(ContainerState.RUNNING, status.getState()); - assertEquals("", status.getDiagnostics()); - assertEquals(-1000, status.getExitStatus()); - } catch (YarnRemoteException e) { - fail("Exception is not expected"); - } + // NodeManager may still need some time to make the container started + testGetContainerStatus(container, i, ContainerState.RUNNING, "", + -1000); try { nmClient.stopContainer(container.getId(), container.getNodeId(), @@ -289,18 +281,8 @@ private void testContainerManagement(NMClientImpl nmClient, } // getContainerStatus can be called after stopContainer - try { - ContainerStatus status = nmClient.getContainerStatus( - container.getId(), container.getNodeId(), - container.getContainerToken()); - assertEquals(container.getId(), status.getContainerId()); - assertEquals(ContainerState.RUNNING, status.getState()); - assertTrue("" + i, status.getDiagnostics().contains( - "Container killed by the ApplicationMaster.")); - assertEquals(-1000, status.getExitStatus()); - } catch (YarnRemoteException e) { - fail("Exception is not expected"); - } + testGetContainerStatus(container, i, ContainerState.COMPLETE, + "Container killed by the ApplicationMaster.", 143); } } } @@ -313,4 +295,28 @@ private void sleep(int sleepTime) { } } + private void testGetContainerStatus(Container container, int index, + ContainerState state, String diagnostics, int exitStatus) + throws YarnRemoteException, IOException { + while (true) { + try { + ContainerStatus status = nmClient.getContainerStatus( + container.getId(), container.getNodeId(), + container.getContainerToken()); + // NodeManager may still need some time to get the stable + // container status + if (status.getState() == state) { + assertEquals(container.getId(), status.getContainerId()); + assertTrue("" + index + ": " + status.getDiagnostics(), + status.getDiagnostics().contains(diagnostics)); + assertEquals(exitStatus, status.getExitStatus()); + break; + } + Thread.sleep(100); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + }