diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index a18ef7c..8cc0703 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -402,7 +402,7 @@ private static void addDeprecatedKeys() { public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false; public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast"; - public static final boolean DEFAULT_YARN_FAIL_FAST = true; + public static final boolean DEFAULT_YARN_FAIL_FAST = false; public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast"; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 62ba599..7070cd7 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -343,9 +343,12 @@ Should YARN fail fast if it encounters any errors. + This is a global config for all other components including RM, NM, etc. + If no value is set for component-specific config (e.g. yarn.resourcemanager.fail-fast), + this value will be the default. 
yarn.fail-fast - true + false diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index b7f1e6c..2888087 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -45,6 +45,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.ReservationId; import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl; +import org.apache.hadoop.yarn.conf.HAUtil; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.Dispatcher; @@ -195,8 +196,12 @@ public void transition(RMStateStore store, RMStateStoreEvent event) { } catch (Exception e) { LOG.error("Error storing app: " + appId, e); store.notifyStoreOperationFailed(e); + store.notifyApplication( + new RMAppEvent(appId, RMAppEventType.APP_NEW_SAVED, + "Fail to store application in state-store." + + " Application will be lost after RM restart. 
\n")); } - }; + } } private static class UpdateAppTransition implements @@ -222,6 +227,12 @@ public void transition(RMStateStore store, RMStateStoreEvent event) { } catch (Exception e) { LOG.error("Error updating app: " + appId, e); store.notifyStoreOperationFailed(e); + if (((RMStateUpdateAppEvent) event).isNotifyApplication()) { + store.notifyApplication( + new RMAppEvent(appId, RMAppEventType.APP_UPDATE_SAVED, + "Fail to update application final state in state-store. \n")); + + } } }; } @@ -266,12 +277,18 @@ public void transition(RMStateStore store, RMStateStoreEvent event) { } store.storeApplicationAttemptStateInternal(attemptState.getAttemptId(), attemptState); - store.notifyApplicationAttempt(new RMAppAttemptEvent - (attemptState.getAttemptId(), - RMAppAttemptEventType.ATTEMPT_NEW_SAVED)); + store.notifyApplicationAttempt( + new RMAppAttemptEvent(attemptState.getAttemptId(), + RMAppAttemptEventType.ATTEMPT_NEW_SAVED)); } catch (Exception e) { LOG.error("Error storing appAttempt: " + attemptState.getAttemptId(), e); store.notifyStoreOperationFailed(e); + store.notifyApplicationAttempt( + new RMAppAttemptEvent(attemptState.getAttemptId(), + RMAppAttemptEventType.ATTEMPT_NEW_SAVED, + "Fail to store app attempt in state-store." + + " Attempt will be lost after RM restart. \n")); + } }; } @@ -299,7 +316,12 @@ public void transition(RMStateStore store, RMStateStoreEvent event) { } catch (Exception e) { LOG.error("Error updating appAttempt: " + attemptState.getAttemptId(), e); store.notifyStoreOperationFailed(e); + store.notifyApplicationAttempt( + new RMAppAttemptEvent(attemptState.getAttemptId(), + RMAppAttemptEventType.ATTEMPT_UPDATE_SAVED, + "Fail to update app attempt final state in state-store. 
\n")); } + }; } @@ -1013,18 +1035,20 @@ protected void handleStoreEvent(RMStateStoreEvent event) { */ protected void notifyStoreOperationFailed(Exception failureCause) { LOG.error("State store operation failed ", failureCause); - if (failureCause instanceof StoreFencedException) { + if (HAUtil.isHAEnabled(getConfig())) { + LOG.warn("State-store fenced ! Transitioning RM to standby"); updateFencedState(); Thread standByTransitionThread = new Thread(new StandByTransitionThread()); standByTransitionThread.setName("StandByTransitionThread Handler"); standByTransitionThread.start(); + } else if (YarnConfiguration.shouldRMFailFast(getConfig())) { + LOG.warn("Fail RM now due to state-store error!"); + rmDispatcher.getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, + failureCause)); } else { - if (YarnConfiguration.shouldRMFailFast(getConfig())) { - rmDispatcher.getEventHandler().handle( - new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, - failureCause)); - } + LOG.warn("Skip the state-store error."); } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java index 0550087..945405e 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java @@ -1049,7 +1049,7 @@ public void run() { LOG.info(VerifyActiveStatusThread.class.getName() + " thread " + "interrupted! 
Exiting!"); } catch (Exception e) { - notifyStoreOperationFailed(new StoreFencedException()); + notifyStoreOperationFailed(e); } } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEvent.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEvent.java index a1c234c..b199d47 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEvent.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEvent.java @@ -24,13 +24,26 @@ public class RMAppEvent extends AbstractEvent{ private final ApplicationId appId; + private final String diagnostics; public RMAppEvent(ApplicationId appId, RMAppEventType type) { super(type); this.appId = appId; + this.diagnostics = null; + } + + public RMAppEvent(ApplicationId appId, RMAppEventType type, + String diagnostics) { + super(type); + this.appId = appId; + this.diagnostics = diagnostics; } public ApplicationId getApplicationId() { return this.appId; } + + public String getDiagnostics() { + return diagnostics; + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index 2eb74f7..3e8913a 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -962,6 +962,9 @@ public RMAppState transition(RMAppImpl app, RMAppEvent event) { public void transition(RMAppImpl app, RMAppEvent event) { app.handler.handle(new AppAddedSchedulerEvent(app.user, app.submissionContext, false)); + if(event.getDiagnostics() != null) { + app.diagnostics.append(event.getDiagnostics()); + } } } @@ -984,6 +987,9 @@ public RMAppState transition(RMAppImpl app, RMAppEvent event) { ((MultipleArcTransition) app.transitionTodo).transition(app, app.eventCausingFinalSaving); } + if (event.getDiagnostics() !=null ){ + app.diagnostics.append(event.getDiagnostics()); + } return app.targetedFinalState; } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEvent.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEvent.java index ad5c28a..024b2c4 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEvent.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEvent.java @@ -25,13 +25,27 @@ private final ApplicationAttemptId appAttemptId; + private final String diagnostics; + public RMAppAttemptEvent(ApplicationAttemptId appAttemptId, RMAppAttemptEventType type) { super(type); this.appAttemptId = appAttemptId; + this.diagnostics = null; + } + + public RMAppAttemptEvent(ApplicationAttemptId appAttemptId, + RMAppAttemptEventType type, String diagnostics) { + super(type); + 
this.appAttemptId = appAttemptId; + this.diagnostics = diagnostics; } public ApplicationAttemptId getApplicationAttemptId() { return this.appAttemptId; } + + public String getDiagnostics() { + return diagnostics; + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 74a4000..b1088d5 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -1026,6 +1026,9 @@ public void run() { public void transition(RMAppAttemptImpl appAttempt, RMAppAttemptEvent event) { appAttempt.launchAttempt(); + if (event.getDiagnostics() != null) { + appAttempt.diagnostics.append(event.getDiagnostics()); + } } } @@ -1207,6 +1210,9 @@ public RMAppAttemptState transition(RMAppAttemptImpl appAttempt, ((MultipleArcTransition) appAttempt.transitionTodo).transition( appAttempt, causeEvent); } + if (event.getDiagnostics() !=null ) { + appAttempt.diagnostics.append(event.getDiagnostics()); + } return appAttempt.targetedFinalState; } } @@ -1340,7 +1346,9 @@ public void transition(RMAppAttemptImpl appAttempt, appAttempt.amrmToken = appAttempt.rmContext.getAMRMTokenSecretManager().createAndGetAMRMToken( appAttempt.applicationAttemptId); - + if (event.getDiagnostics() != null) { + appAttempt.diagnostics.append(event.getDiagnostics()); + } super.transition(appAttempt, event); } }