diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto index aa9e0c6..cbd803c 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/server/yarn_server_resourcemanager_service_protos.proto @@ -127,6 +127,7 @@ message ApplicationAttemptStateDataProto { optional string diagnostics = 6 [default = "N/A"]; optional int64 start_time = 7; optional FinalApplicationStatusProto final_application_status = 8; + optional int32 exit_status = 9 [default = -1000]; } message RMStateVersionProto { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java index 7f4dad8..37f08cf 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java @@ -216,7 +216,8 @@ private void loadRMAppState(RMState rmState) throws Exception { attemptStateData.getState(), attemptStateData.getFinalTrackingUrl(), attemptStateData.getDiagnostics(), - attemptStateData.getFinalApplicationStatus()); + attemptStateData.getFinalApplicationStatus(), + attemptStateData.getAMContainerExitStatus()); // assert child node name is same as application attempt id assert attemptId.equals(attemptState.getAttemptId()); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java index a43b20d..fb0ce1a 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/MemoryRMStateStore.java @@ -152,7 +152,8 @@ public synchronized void updateApplicationAttemptStateInternal( attemptStateData.getStartTime(), attemptStateData.getState(), attemptStateData.getFinalTrackingUrl(), attemptStateData.getDiagnostics(), - attemptStateData.getFinalApplicationStatus()); + attemptStateData.getFinalApplicationStatus(), + attemptStateData.getAMContainerExitStatus()); ApplicationState appState = state.getApplicationState().get( diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index affc6f9..b18a874 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -39,6 +39,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.Container; +import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl; import org.apache.hadoop.yarn.event.AsyncDispatcher; @@ -258,19 +259,21 @@ public RMStateStore() { RMAppAttemptState state; String finalTrackingUrl = "N/A"; String diagnostics; + int exitStatus = ContainerExitStatus.INVALID; FinalApplicationStatus amUnregisteredFinalStatus; public ApplicationAttemptState(ApplicationAttemptId attemptId, Container masterContainer, Credentials appAttemptCredentials, long startTime) { this(attemptId, masterContainer, appAttemptCredentials, startTime, null, - null, "", null); + null, "", null, ContainerExitStatus.INVALID); } public ApplicationAttemptState(ApplicationAttemptId attemptId, Container masterContainer, Credentials appAttemptCredentials, long startTime, RMAppAttemptState state, String finalTrackingUrl, - String diagnostics, FinalApplicationStatus amUnregisteredFinalStatus) { + String diagnostics, FinalApplicationStatus amUnregisteredFinalStatus, + int exitStatus) { this.attemptId = attemptId; this.masterContainer = masterContainer; this.appAttemptCredentials = appAttemptCredentials; @@ -279,6 +282,7 @@ public ApplicationAttemptState(ApplicationAttemptId attemptId, this.finalTrackingUrl = finalTrackingUrl; this.diagnostics = diagnostics == null ? "" : diagnostics; this.amUnregisteredFinalStatus = amUnregisteredFinalStatus; + this.exitStatus = exitStatus; } public Container getMasterContainer() { @@ -305,6 +309,9 @@ public long getStartTime() { public FinalApplicationStatus getFinalApplicationStatus() { return amUnregisteredFinalStatus; } + public int getAMContainerExitStatus(){ + return this.exitStatus; + } } /** diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java index 63ae990..9ff128d 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java @@ -538,12 +538,12 @@ private void loadApplicationAttemptState(ApplicationState appState, ApplicationAttemptState attemptState = new ApplicationAttemptState(attemptId, - attemptStateData.getMasterContainer(), credentials, - attemptStateData.getStartTime(), - attemptStateData.getState(), - attemptStateData.getFinalTrackingUrl(), - attemptStateData.getDiagnostics(), - attemptStateData.getFinalApplicationStatus()); + attemptStateData.getMasterContainer(), credentials, + attemptStateData.getStartTime(), attemptStateData.getState(), + attemptStateData.getFinalTrackingUrl(), + attemptStateData.getDiagnostics(), + attemptStateData.getFinalApplicationStatus(), + attemptStateData.getAMContainerExitStatus()); appState.attempts.put(attemptState.getAttemptId(), attemptState); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java index 6af048b..90fb3ec 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/ApplicationAttemptStateData.java @@ -43,7 +43,7 @@ public static ApplicationAttemptStateData newInstance( ApplicationAttemptId attemptId, Container container, ByteBuffer attemptTokens, long startTime, RMAppAttemptState finalState, String finalTrackingUrl, String diagnostics, - FinalApplicationStatus amUnregisteredFinalStatus) { + FinalApplicationStatus amUnregisteredFinalStatus, int exitStatus) { ApplicationAttemptStateData attemptStateData = Records.newRecord(ApplicationAttemptStateData.class); attemptStateData.setAttemptId(attemptId); @@ -54,6 +54,7 @@ public static ApplicationAttemptStateData newInstance( attemptStateData.setDiagnostics(diagnostics); attemptStateData.setStartTime(startTime); attemptStateData.setFinalApplicationStatus(amUnregisteredFinalStatus); + attemptStateData.setAMContainerExitStatus(exitStatus); return attemptStateData; } @@ -67,11 +68,11 @@ public static ApplicationAttemptStateData newInstance( appAttemptTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength()); } return newInstance(attemptState.getAttemptId(), - attemptState.getMasterContainer(), appAttemptTokens, - attemptState.getStartTime(), attemptState.getState(), - attemptState.getFinalTrackingUrl(), - attemptState.getDiagnostics(), - attemptState.getFinalApplicationStatus()); + attemptState.getMasterContainer(), appAttemptTokens, + attemptState.getStartTime(), attemptState.getState(), + attemptState.getFinalTrackingUrl(), attemptState.getDiagnostics(), + attemptState.getFinalApplicationStatus(), + attemptState.getAMContainerExitStatus()); } public abstract ApplicationAttemptStateDataProto getProto(); @@ -150,5 +151,10 @@ public static ApplicationAttemptStateData newInstance( */ public abstract FinalApplicationStatus getFinalApplicationStatus(); - public abstract void setFinalApplicationStatus(FinalApplicationStatus finishState); + public abstract void setFinalApplicationStatus( + FinalApplicationStatus finishState); + + public abstract int getAMContainerExitStatus(); + + public abstract void setAMContainerExitStatus(int exitStatus); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/impl/pb/ApplicationAttemptStateDataPBImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/impl/pb/ApplicationAttemptStateDataPBImpl.java index e3ebe5e..180b4a4 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/impl/pb/ApplicationAttemptStateDataPBImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/records/impl/pb/ApplicationAttemptStateDataPBImpl.java @@ -248,11 +248,19 @@ public void setFinalApplicationStatus(FinalApplicationStatus finishState) { } @Override - public int hashCode() { - return getProto().hashCode(); + public int getAMContainerExitStatus() { + ApplicationAttemptStateDataProtoOrBuilder p = viaProto ? proto : builder; + return p.getExitStatus(); } @Override + public void setAMContainerExitStatus(int exitStatus) { + maybeInitBuilder(); + builder.setExitStatus(exitStatus); + } + + + @Override public boolean equals(Object other) { if (other == null) return false; @@ -281,5 +289,4 @@ private FinalApplicationStatusProto convertToProtoFormat(FinalApplicationStatus private FinalApplicationStatus convertFromProtoFormat(FinalApplicationStatusProto s) { return ProtoUtils.convertFromProtoFormat(s); } - } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index 3318f15..226f5ba 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -71,14 +71,12 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent; -import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.state.InvalidStateTransitonException; import org.apache.hadoop.yarn.state.MultipleArcTransition; import org.apache.hadoop.yarn.state.SingleArcTransition; import org.apache.hadoop.yarn.state.StateMachine; import org.apache.hadoop.yarn.state.StateMachineFactory; -import org.apache.hadoop.yarn.util.resource.Resources; @SuppressWarnings({ "rawtypes", "unchecked" }) public class RMAppImpl implements RMApp, Recoverable { @@ -662,7 +660,7 @@ private void createNewAttempt() { ApplicationAttemptId.newInstance(applicationId, attempts.size() + 1); RMAppAttempt attempt = new RMAppAttemptImpl(appAttemptId, rmContext, scheduler, masterService, - submissionContext, conf, maxAppAttempts == attempts.size()); + submissionContext, conf, maxAppAttempts == getAttemptFailureCount()); attempts.put(appAttemptId, attempt); currentAttempt = attempt; } @@ -753,7 +751,7 @@ public RMAppState transition(RMAppImpl app, RMAppEvent event) { && (app.currentAttempt.getState() == RMAppAttemptState.KILLED || app.currentAttempt.getState() == RMAppAttemptState.FINISHED || (app.currentAttempt.getState() == RMAppAttemptState.FAILED - && app.attempts.size() == app.maxAppAttempts))) { + && app.getAttemptFailureCount() == app.maxAppAttempts))) { return RMAppState.ACCEPTED; } @@ -844,7 +842,7 @@ private String getAppAttemptFailedDiagnostics(RMAppEvent event) { msg = "Unmanaged application " + this.getApplicationId() + " failed due to " + failedEvent.getDiagnostics() + ". Failing the application."; - } else if (this.attempts.size() >= this.maxAppAttempts) { + } else if (getAttemptFailureCount() >= this.maxAppAttempts) { msg = "Application " + this.getApplicationId() + " failed " + this.maxAppAttempts + " times due to " + failedEvent.getDiagnostics() + ". Failing the application."; @@ -1070,6 +1068,17 @@ public void transition(RMAppImpl app, RMAppEvent event) { }; } + private int getAttemptFailureCount() { + int attemptFailures = 0; + // Do not count AM preemption as attempt failure. + for (RMAppAttempt attempt : attempts.values()) { + if (!((RMAppAttemptImpl) attempt).isPreempted()) { + attemptFailures++; + } + } + return attemptFailures; + } + private static final class AttemptFailedTransition implements MultipleArcTransition { @@ -1081,8 +1090,9 @@ public AttemptFailedTransition(RMAppState initialState) { @Override public RMAppState transition(RMAppImpl app, RMAppEvent event) { + if (!app.submissionContext.getUnmanagedAM() - && app.attempts.size() < app.maxAppAttempts) { + && app.getAttemptFailureCount() < app.maxAppAttempts) { boolean transferStateFromPreviousAttempt = false; RMAppFailedAttemptEvent failedEvent = (RMAppFailedAttemptEvent) event; transferStateFromPreviousAttempt = diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 2a1170d..3fd1f70 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -35,7 +35,6 @@ import javax.crypto.SecretKey; -import com.google.common.annotations.VisibleForTesting; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -51,6 +50,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport; import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.Container; +import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; @@ -103,6 +103,8 @@ import org.apache.hadoop.yarn.state.StateMachineFactory; import org.apache.hadoop.yarn.webapp.util.WebAppUtils; +import com.google.common.annotations.VisibleForTesting; + @SuppressWarnings({"unchecked", "rawtypes"}) public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { @@ -152,6 +154,7 @@ // if an RMAppAttemptUnregistrationEvent occurs private FinalApplicationStatus finalStatus = null; private final StringBuilder diagnostics = new StringBuilder(); + private int amContainerExitStatus = ContainerExitStatus.INVALID; private Configuration conf; private final boolean isLastAttempt; @@ -583,6 +586,15 @@ public String getDiagnostics() { } } + public int getAMContainerExitStatus() { + this.readLock.lock(); + try { + return this.amContainerExitStatus; + } finally { + this.readLock.unlock(); + } + } + @Override public float getProgress() { this.readLock.lock(); @@ -694,6 +706,7 @@ public void recover(RMState state) throws Exception { + attemptState.getState()); diagnostics.append("Attempt recovered after RM restart"); diagnostics.append(attemptState.getDiagnostics()); + this.amContainerExitStatus = attemptState.getAMContainerExitStatus(); setMasterContainer(attemptState.getMasterContainer()); recoverAppAttemptCredentials(attemptState.getAppAttemptCredentials()); this.recoveredFinalState = attemptState.getState(); @@ -955,7 +968,7 @@ private void rememberTargetTransitionsAndStoreState(RMAppAttemptEvent event, String diags = null; String finalTrackingUrl = null; FinalApplicationStatus finalStatus = null; - + int exitStatus = ContainerExitStatus.PREEMPTED; switch (event.getType()) { case LAUNCH_FAILED: RMAppAttemptLaunchFailedEvent launchFaileEvent = @@ -976,6 +989,7 @@ private void rememberTargetTransitionsAndStoreState(RMAppAttemptEvent event, RMAppAttemptContainerFinishedEvent finishEvent = (RMAppAttemptContainerFinishedEvent) event; diags = getAMContainerCrashedDiagnostics(finishEvent); + exitStatus = finishEvent.getContainerStatus().getExitStatus(); break; case KILL: break; @@ -990,7 +1004,7 @@ private void rememberTargetTransitionsAndStoreState(RMAppAttemptEvent event, ApplicationAttemptState attemptState = new ApplicationAttemptState(applicationAttemptId, getMasterContainer(), rmStore.getCredentialsFromAppAttempt(this), startTime, - stateToBeStored, finalTrackingUrl, diags, finalStatus); + stateToBeStored, finalTrackingUrl, diags, finalStatus, exitStatus); LOG.info("Updating application attempt " + applicationAttemptId + " with final state: " + targetedFinalState); rmStore.updateApplicationAttemptState(attemptState); @@ -1085,12 +1099,13 @@ public void transition(RMAppAttemptImpl appAttempt, // don't leave the tracking URL pointing to a non-existent AM appAttempt.setTrackingUrlToRMAppPage(); appAttempt.invalidateAMHostAndPort(); - if (appAttempt.submissionContext - .getKeepContainersAcrossApplicationAttempts() - && !appAttempt.isLastAttempt - && !appAttempt.submissionContext.getUnmanagedAM()) { + + if ((appAttempt.submissionContext.getKeepContainersAcrossApplicationAttempts() + && !appAttempt.submissionContext.getUnmanagedAM()) + && (!appAttempt.isLastAttempt || appAttempt.isPreempted())) { keepContainersAcrossAppAttempts = true; } + appEvent = new RMAppFailedAttemptEvent(applicationId, RMAppEventType.ATTEMPT_FAILED, appAttempt.getDiagnostics(), @@ -1129,7 +1144,11 @@ public void transition(RMAppAttemptImpl appAttempt, appAttempt.getClientTokenMasterKey()); } } - + + public boolean isPreempted() { + return getAMContainerExitStatus() == ContainerExitStatus.PREEMPTED; + } + private static final class UnmanagedAMAttemptSavedTransition extends AMLaunchedTransition { @Override @@ -1232,14 +1251,26 @@ public void transition(RMAppAttemptImpl appAttempt, appAttempt.rmContext.getAMLivelinessMonitor().unregister( appAttempt.getAppAttemptId()); - // Setup diagnostic message - appAttempt.diagnostics - .append(getAMContainerCrashedDiagnostics(finishEvent)); + // Setup diagnostic message and exit status + appAttempt.setAMContainerCrashedDiagnosticsAndExitStatus(finishEvent); + // Tell the app, scheduler super.transition(appAttempt, finishEvent); } } + private void setAMContainerCrashedDiagnosticsAndExitStatus( + RMAppAttemptContainerFinishedEvent finishEvent) { + ContainerStatus status = finishEvent.getContainerStatus(); + String diagnostics = + "AM Container for " + finishEvent.getApplicationAttemptId() + + " exited with " + " exitCode: " + status.getExitStatus() + + " due to: " + status.getDiagnostics() + "." + + "Failing this attempt."; + this.diagnostics.append(diagnostics); + this.amContainerExitStatus = status.getExitStatus(); + } + private static String getAMContainerCrashedDiagnostics( RMAppAttemptContainerFinishedEvent finishEvent) { ContainerStatus status = finishEvent.getContainerStatus(); @@ -1472,13 +1503,12 @@ public RMAppAttemptState transition(RMAppAttemptImpl appAttempt, @Override public void transition(RMAppAttemptImpl appAttempt, RMAppAttemptEvent event) { - RMAppAttemptContainerFinishedEvent containerFinishedEvent = + RMAppAttemptContainerFinishedEvent finishEvent = (RMAppAttemptContainerFinishedEvent) event; // container associated with AM. must not be unmanaged assert appAttempt.submissionContext.getUnmanagedAM() == false; - // Setup diagnostic message - appAttempt.diagnostics - .append(getAMContainerCrashedDiagnostics(containerFinishedEvent)); + // Setup diagnostic message and exit status + appAttempt.setAMContainerCrashedDiagnosticsAndExitStatus(finishEvent); new FinalTransition(RMAppAttemptState.FAILED).transition(appAttempt, event); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index 5de407d..74eb196 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -1076,14 +1076,12 @@ public void preemptContainer(ApplicationAttemptId aid, RMContainer cont) { @Override public void killContainer(RMContainer cont) { - if(LOG.isDebugEnabled()){ + if (LOG.isDebugEnabled()) { LOG.debug("KILL_CONTAINER: container" + cont.toString()); } - completedContainer(cont, - SchedulerUtils.createPreemptedContainerStatus( - cont.getContainerId(),"Container being forcibly preempted:" - + cont.getContainerId()), - RMContainerEventType.KILL); + completedContainer(cont, SchedulerUtils.createPreemptedContainerStatus( + cont.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER), + RMContainerEventType.KILL); } @Override diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java index e3a5776..e2bde01 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNM.java @@ -87,7 +87,7 @@ public int getHttpPort() { return httpPort; } - void setResourceTrackerService(ResourceTrackerService resourceTracker) { + public void setResourceTrackerService(ResourceTrackerService resourceTracker) { this.resourceTracker = resourceTracker; } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java index 67eac76..e494904 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java @@ -165,6 +165,19 @@ public void waitForContainerToComplete(RMAppAttempt attempt, } } + public MockAM waitForAMToRestart(ApplicationId appId, int attemptSize, + MockNM nm) throws Exception { + RMApp app = getRMContext().getRMApps().get(appId); + Assert.assertNotNull(app); + while (app.getAppAttempts().size() != attemptSize) { + System.out.println("Application " + appId + + " is waiting for AM to restart. Current has " + + app.getAppAttempts().size() + " attempts."); + Thread.sleep(200); + } + return launchAndRegisterAM(app, this, nm); + } + public void waitForState(MockNM nm, ContainerId containerId, RMContainerState containerState) throws Exception { RMContainer container = getResourceScheduler().getRMContainer(containerId); @@ -541,6 +554,7 @@ public static MockAM launchAM(RMApp app, MockRM rm, MockNM nm) throws Exception { rm.waitForState(app.getApplicationId(), RMAppState.ACCEPTED); RMAppAttempt attempt = app.getCurrentAppAttempt(); + System.out.println("Launch AM " + attempt.getAppAttemptId()); nm.nodeHeartbeat(true); MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId()); rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java index bcd8c1b..532c015 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java @@ -22,13 +22,12 @@ import java.util.HashMap; import java.util.List; -import org.junit.Assert; - import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse; import org.apache.hadoop.yarn.api.records.ApplicationAccessType; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.Container; +import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; @@ -38,22 +37,22 @@ import org.apache.hadoop.yarn.server.resourcemanager.MockAM; import org.apache.hadoop.yarn.server.resourcemanager.MockNM; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; +import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore; +import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.ApplicationState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; +import org.junit.Assert; import org.junit.Test; -/** - * Test to restart the AM on failure. - * - */ public class TestAMRestart { - @Test + @Test(timeout = 30000) public void testAMRestartWithExistingContainers() throws Exception { YarnConfiguration conf = new YarnConfiguration(); conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2); @@ -245,7 +244,7 @@ private void waitForContainersToFinish(int expectedNum, RMAppAttempt attempt) } } - @Test + @Test(timeout = 30000) public void testNMTokensRebindOnAMRestart() throws Exception { YarnConfiguration conf = new YarnConfiguration(); conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 3); @@ -345,4 +344,95 @@ public void testNMTokensRebindOnAMRestart() throws Exception { Assert.assertTrue(transferredTokens.containsAll(expectedNMTokens)); rm1.stop(); } + + // AM container preempted should not be counted towards AM max retry count. + @Test(timeout = 20000) + public void testAMPreemptedNotCountedForAMFailures() throws Exception { + YarnConfiguration conf = new YarnConfiguration(); + conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, + ResourceScheduler.class); + // explicitly set max-am-retry count as 1. + conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1); + MockRM rm1 = new MockRM(conf); + rm1.start(); + MockNM nm1 = + new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService()); + nm1.registerNode(); + RMApp app1 = rm1.submitApp(200); + MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1); + CapacityScheduler scheduler = + (CapacityScheduler) rm1.getResourceScheduler(); + ContainerId amContainer = + ContainerId.newInstance(am1.getApplicationAttemptId(), 1); + + // Forcibly preempt the am container; + scheduler.killContainer(scheduler.getRMContainer(amContainer)); + + am1.waitForState(RMAppAttemptState.FAILED); + rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED); + + // AM should be restarted even though max-am-attempt is 1. + MockAM am2 = MockRM.launchAM(app1, rm1, nm1); + + // fail the AM normally + nm1.nodeHeartbeat(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE); + am2.waitForState(RMAppAttemptState.FAILED); + // AM should not be restarted. + rm1.waitForState(app1.getApplicationId(), RMAppState.FAILED); + rm1.stop(); + } + + // Test RM restarts after AM container is preempted, new RM should not count + // AM preemption failure towards the max-retry-account and should be able to + // re-launch the AM. + @Test(timeout = 20000) + public void testPreemptedAMRestartOnRMRestart() throws Exception { + YarnConfiguration conf = new YarnConfiguration(); + conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, + ResourceScheduler.class); + conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true); + conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName()); + // explicitly set max-am-retry count as 1. + conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1); + MemoryRMStateStore memStore = new MemoryRMStateStore(); + memStore.init(conf); + + MockRM rm1 = new MockRM(conf, memStore); + rm1.start(); + MockNM nm1 = + new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService()); + nm1.registerNode(); + RMApp app1 = rm1.submitApp(200); + MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1); + CapacityScheduler scheduler = + (CapacityScheduler) rm1.getResourceScheduler(); + ContainerId amContainer = + ContainerId.newInstance(am1.getApplicationAttemptId(), 1); + + // Forcibly preempt the am container; + scheduler.killContainer(scheduler.getRMContainer(amContainer)); + + am1.waitForState(RMAppAttemptState.FAILED); + rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED); + + // state store has 1 attempt stored. + ApplicationState appState = + memStore.getState().getApplicationState().get(app1.getApplicationId()); + Assert.assertEquals(1, appState.getAttemptCount()); + // attempt stored has the preemption diagnostics + Assert.assertEquals(ContainerExitStatus.PREEMPTED, + appState.getAttempt(am1.getApplicationAttemptId()) + .getAMContainerExitStatus()); + // Restart rm. + MockRM rm2 = new MockRM(conf, memStore); + nm1.setResourceTrackerService(rm2.getResourceTrackerService()); + nm1.registerNode(); + rm2.start(); + + // Restarted RM should re-launch the am. + rm2.waitForAMToRestart(app1.getApplicationId(), 2, nm1); + + rm1.stop(); + rm2.stop(); + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreTestBase.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreTestBase.java index 507e164..5b8fe89 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreTestBase.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStoreTestBase.java @@ -308,7 +308,7 @@ void testRMAppStateStore(RMStateStoreHelper stateStoreHelper) oldAttemptState.getAppAttemptCredentials(), oldAttemptState.getStartTime(), RMAppAttemptState.FINISHED, "myTrackingUrl", "attemptDiagnostics", - FinalApplicationStatus.SUCCEEDED); + FinalApplicationStatus.SUCCEEDED, 100); store.updateApplicationAttemptState(newAttemptState); // test updating the state of an app/attempt whose initial state was not @@ -331,7 +331,7 @@ void testRMAppStateStore(RMStateStoreHelper stateStoreHelper) oldAttemptState.getAppAttemptCredentials(), oldAttemptState.getStartTime(), RMAppAttemptState.FINISHED, "myTrackingUrl", "attemptDiagnostics", - FinalApplicationStatus.SUCCEEDED); + FinalApplicationStatus.SUCCEEDED, 111); store.updateApplicationAttemptState(dummyAttempt); // let things settle down @@ -370,6 +370,7 @@ void testRMAppStateStore(RMStateStoreHelper stateStoreHelper) assertEquals(RMAppAttemptState.FINISHED, updatedAttemptState.getState()); assertEquals("myTrackingUrl", updatedAttemptState.getFinalTrackingUrl()); assertEquals("attemptDiagnostics", updatedAttemptState.getDiagnostics()); + assertEquals(100, updatedAttemptState.getAMContainerExitStatus()); assertEquals(FinalApplicationStatus.SUCCEEDED, updatedAttemptState.getFinalApplicationStatus());