diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerExitStatus.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerExitStatus.java
index 76cb6c1..80ed283 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerExitStatus.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerExitStatus.java
@@ -41,4 +41,10 @@
    * threshold number of the nodemanager-log-directories become bad.
    */
   public static final int DISKS_FAILED = -101;
+
+  /**
+   * Container was failed by the NodeManager because an auxiliary service
+   * threw while initializing the application for it.
+   */
+  public static final int AUXSERVICE_FAILED = -102;
 }
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java
index 8914646..7560c3a 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java
@@ -653,6 +653,22 @@ public void onError(Throwable e) {
       done = true;
       resourceManager.stop();
     }
+
+    @Override
+    public void onAuxServiceFailure(List<ContainerStatus> containerStatus) {
+      boolean auxServiceFailed = false;
+      for (ContainerStatus status : containerStatus) {
+        if (status.getExitStatus() == ContainerExitStatus.AUXSERVICE_FAILED) {
+          LOG.error(status.getDiagnostics());
+          auxServiceFailed = true;
+        }
+      }
+      // Report every failure, but shut the AM down only once.
+      if (auxServiceFailed) {
+        done = true;
+        resourceManager.stop();
+      }
+    }
   }
 
   private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/api/async/AMRMClientAsync.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/api/async/AMRMClientAsync.java
index e726b73..c546df7 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/api/async/AMRMClientAsync.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/api/async/AMRMClientAsync.java
@@ -217,8 +217,16 @@ public abstract void unregisterApplicationMaster(
      * availability etc.
      */
     public void onNodesUpdated(List<NodeReport> updatedNodes);
-    
+
+    /**
+     * Called with the statuses of completed containers so the handler can
+     * react to those that exited with
+     * {@code ContainerExitStatus.AUXSERVICE_FAILED}. Invoked right before
+     * {@link #onContainersCompleted(List)} with the same list.
+     */
+    public void onAuxServiceFailure(List<ContainerStatus> statuses);
+
     public float getProgress();
 
     /**
      * Called when error comes from RM communications as well as from errors in
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/api/async/impl/AMRMClientAsyncImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/api/async/impl/AMRMClientAsyncImpl.java
index dbea253..dbcfe07 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/api/async/impl/AMRMClientAsyncImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/api/async/impl/AMRMClientAsyncImpl.java
@@ -300,6 +300,7 @@ public void run() {
           List<ContainerStatus> completed =
               response.getCompletedContainersStatuses();
           if (!completed.isEmpty()) {
+            handler.onAuxServiceFailure(completed);
             handler.onContainersCompleted(completed);
           }
 
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/async/impl/TestAMRMClientAsync.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/async/impl/TestAMRMClientAsync.java
index 710f348..20747aa 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/async/impl/TestAMRMClientAsync.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/async/impl/TestAMRMClientAsync.java
@@ -458,6 +458,10 @@ public void onError(Throwable e) {
         notifier.notifyAll();
       }
     }
+
+    @Override
+    public void onAuxServiceFailure(List<ContainerStatus> statuses) {
+    }
   }
 
   private class TestCallbackHandler2 implements AMRMClientAsync.CallbackHandler {
@@ -505,5 +509,9 @@ void callStopAndNotify() {
         notifier.notifyAll();
       }
     }
+
+    @Override
+    public void onAuxServiceFailure(List<ContainerStatus> statuses) {
+    }
   }
 }
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServices.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServices.java
index 955ccbf..6657c3a 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServices.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServices.java
@@ -32,11 +32,15 @@
 import org.apache.hadoop.service.Service;
 import org.apache.hadoop.service.ServiceStateChangeListener;
 import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.server.api.ApplicationTerminationContext;
 import org.apache.hadoop.yarn.server.api.AuxiliaryService;
 import org.apache.hadoop.yarn.server.api.ApplicationInitializationContext;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerExitEvent;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
 
 public class AuxServices extends AbstractService
     implements ServiceStateChangeListener, EventHandler<AuxServicesEvent> {
@@ -169,13 +173,38 @@ public void handle(AuxServicesEvent event) {
           // TODO kill all containers waiting on Application
           return;
         }
-        service.initializeApplication(new ApplicationInitializationContext(event
-            .getUser(), event.getApplicationID(), event.getServiceData()));
+        try {
+          service.initializeApplication(new ApplicationInitializationContext(
+              event.getUser(), event.getApplicationID(),
+              event.getServiceData()));
+        } catch (Throwable th) {
+          // Keep a failing auxiliary service from propagating out of this
+          // handler, but record the failure together with its stack trace.
+          LOG.error("AuxService failed to initialize application "
+              + event.getApplicationID(), th);
+          ContainerImpl container = event.getContainer();
+          // Events created without a container carry null here.
+          if (container != null) {
+            container.handle(new ContainerExitEvent(
+                container.getContainerId(),
+                ContainerEventType.CONTAINER_EXITED_WITH_FAILURE,
+                ContainerExitStatus.AUXSERVICE_FAILED,
+                "AuxService failed at Container "
+                    + container.getContainerId() + " : " + th));
+          }
+        }
         break;
       case APPLICATION_STOP:
         for (AuxiliaryService serv : serviceMap.values()) {
-          serv.stopApplication(new ApplicationTerminationContext(event
-              .getApplicationID()));
+          try {
+            serv.stopApplication(new ApplicationTerminationContext(event
+                .getApplicationID()));
+          } catch (Throwable th) {
+            // The application is already finished: keep notifying the
+            // remaining services, but leave a trace of the failure.
+            LOG.warn("AuxService failed to stop application "
+                + event.getApplicationID(), th);
+          }
         }
         break;
       default:
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServicesEvent.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServicesEvent.java
index 4b9c931..e42af9c 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServicesEvent.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/AuxServicesEvent.java
@@ -21,7 +21,8 @@
 import java.nio.ByteBuffer;
 
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.event.AbstractEvent;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
 
 public class AuxServicesEvent extends AbstractEvent<AuxServicesEventType> {
 
@@ -29,16 +30,24 @@
   private final String serviceId;
   private final ByteBuffer serviceData;
   private final ApplicationId appId;
+  private final ContainerImpl container;
 
   public AuxServicesEvent(AuxServicesEventType eventType, ApplicationId appId) {
-    this(eventType, null, appId, null, null);
+    this(eventType, null, appId, null, null, null);
   }
 
-  public AuxServicesEvent(AuxServicesEventType eventType, String user,
-      ApplicationId appId, String serviceId, ByteBuffer serviceData) {
+  public AuxServicesEvent(AuxServicesEventType eventType,
+      ContainerImpl container) {
+    this(eventType, null, null, container, null, null);
+  }
+
+  public AuxServicesEvent(AuxServicesEventType eventType, String user,
+      ApplicationId appId, ContainerImpl container, String serviceId,
+      ByteBuffer serviceData) {
     super(eventType);
     this.user = user;
     this.appId = appId;
+    this.container = container;
     this.serviceId = serviceId;
     this.serviceData = serviceData;
   }
@@ -55,8 +64,15 @@ public String getUser() {
     return user;
   }
 
+  /**
+   * @return the container passed to the constructor, or {@code null} when
+   *         the event was created without one
+   */
+  public ContainerImpl getContainer() {
+    return container;
+  }
+
   public ApplicationId getApplicationID() {
     return appId;
   }
-
 }
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index 54a2cbe..33047f4 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -142,6 +142,10 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher,
         UPDATE_DIAGNOSTICS_TRANSITION)
     .addTransition(ContainerState.NEW, ContainerState.DONE,
         ContainerEventType.KILL_CONTAINER, CONTAINER_DONE_TRANSITION)
+    .addTransition(ContainerState.NEW,
+        ContainerState.EXITED_WITH_FAILURE,
+        ContainerEventType.CONTAINER_EXITED_WITH_FAILURE,
+        new ExitedWithFailureTransition(true))
 
     // From LOCALIZING State
     .addTransition(ContainerState.LOCALIZING,
@@ -157,6 +161,10 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher,
     .addTransition(ContainerState.LOCALIZING, ContainerState.KILLING,
         ContainerEventType.KILL_CONTAINER,
         new KillDuringLocalizationTransition())
+    .addTransition(ContainerState.LOCALIZING,
+        ContainerState.EXITED_WITH_FAILURE,
+        ContainerEventType.CONTAINER_EXITED_WITH_FAILURE,
+        new ExitedWithFailureTransition(true))
 
     // From LOCALIZATION_FAILED State
     .addTransition(ContainerState.LOCALIZATION_FAILED,
@@ -180,6 +188,10 @@ public ContainerImpl(Configuration conf, Dispatcher dispatcher,
     .addTransition(ContainerState.LOCALIZATION_FAILED,
         ContainerState.LOCALIZATION_FAILED,
         ContainerEventType.RESOURCE_FAILED)
+    .addTransition(ContainerState.LOCALIZATION_FAILED,
+        ContainerState.EXITED_WITH_FAILURE,
+        ContainerEventType.CONTAINER_EXITED_WITH_FAILURE,
+        new ExitedWithFailureTransition(true))
 
     // From LOCALIZED State
     .addTransition(ContainerState.LOCALIZED, ContainerState.RUNNING,
@@ -508,11 +520,12 @@ public ContainerState transition(ContainerImpl container,
       if (csd != null) {
         // This can happen more than once per Application as each container may
         // have distinct service data
+        ApplicationId appId =
+            container.containerId.getApplicationAttemptId().getApplicationId();
         for (Map.Entry<String,ByteBuffer> service : csd.entrySet()) {
           container.dispatcher.getEventHandler().handle(
               new AuxServicesEvent(AuxServicesEventType.APPLICATION_INIT,
-                container.user, container.containerId
-                .getApplicationAttemptId().getApplicationId(),
+                container.user, appId, container,
                 service.getKey().toString(), service.getValue()));
         }
       }
@@ -684,6 +697,10 @@ public ExitedWithFailureTransition(boolean clCleanupRequired) {
     public void transition(ContainerImpl container, ContainerEvent event) {
       ContainerExitEvent exitEvent = (ContainerExitEvent) event;
       container.exitCode = exitEvent.getExitCode();
+      if (exitEvent.getDiagnosticInfo() != null) {
+        container.diagnostics.append(exitEvent.getDiagnosticInfo())
+            .append("\n");
+      }
 
       // TODO: Add containerWorkDir to the deletion service.
       // TODO: Add containerOuputDir to the deletion service.
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestAuxServices.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestAuxServices.java
index fb4b69a..b7dd84b 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestAuxServices.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestAuxServices.java
@@ -129,11 +129,13 @@ public void testAuxEventDispatch() {
     buf.putInt(65);
     buf.flip();
     AuxServicesEvent event = new AuxServicesEvent(
-        AuxServicesEventType.APPLICATION_INIT, "user0", appId1, "Asrv", buf);
+        AuxServicesEventType.APPLICATION_INIT, "user0", appId1, null, "Asrv",
+        buf);
     aux.handle(event);
     ApplicationId appId2 = ApplicationId.newInstance(0, 66);
     event = new AuxServicesEvent(
-        AuxServicesEventType.APPLICATION_STOP, "user0", appId2, "Bsrv", null);
+        AuxServicesEventType.APPLICATION_STOP, "user0", appId2, null, "Bsrv",
+        null);
     // verify all services got the stop event
     aux.handle(event);
     Collection<AuxiliaryService> servs = aux.getServices();