diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index 42d889e..f96c264 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -139,6 +139,7 @@ private final long attemptFailuresValidityInterval; private final boolean amBlacklistingEnabled; private final float blacklistDisableThreshold; + private final boolean shouldRMFailFast; private Clock systemClock; @@ -474,6 +475,8 @@ public RMAppImpl(ApplicationId applicationId, RMContext rmContext, } else { blacklistDisableThreshold = 0.0f; } + + shouldRMFailFast = YarnConfiguration.shouldRMFailFast(conf); } @Override @@ -571,6 +574,10 @@ public RMAppAttempt getCurrentAppAttempt() { } } + public boolean getShouldRMFailFast() { + return shouldRMFailFast; + } + private FinalApplicationStatus createFinalApplicationStatus(RMAppState state) { switch(state) { case NEW: @@ -937,7 +944,9 @@ private void recoverAppAttempts() { public RMAppState transition(RMAppImpl app, RMAppEvent event) { RMAppRecoverEvent recoverEvent = (RMAppRecoverEvent) event; - app.recover(recoverEvent.getRMState()); + try { + app.recover(recoverEvent.getRMState()); + // The app has completed. if (app.recoveredFinalState != null) { app.recoverAppAttempts(); @@ -990,8 +999,19 @@ public RMAppState transition(RMAppImpl app, RMAppEvent event) { // Thus we return ACCECPTED state on recovery. return RMAppState.ACCEPTED; } + catch (Exception e) { + if (app.getShouldRMFailFast()) { + throw e; + } + String msg = app.applicationId + " failed to recover. " + e.getMessage(); + app.diagnostics.append(msg); + LOG.error(msg, e); + return RMAppState.FAILED; + } + } } + private static final class AddApplicationToSchedulerTransition extends RMAppTransition { @Override