diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index a18ef7c..8cc0703 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -402,7 +402,7 @@ private static void addDeprecatedKeys() {
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast";
- public static final boolean DEFAULT_YARN_FAIL_FAST = true;
+ public static final boolean DEFAULT_YARN_FAIL_FAST = false;
public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast";
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 62ba599..7070cd7 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -343,9 +343,12 @@
Should YARN fail fast if it encounters any errors.
+ This is a global config for all other components, including RM, NM, etc.
+ If no value is set for a component-specific config (e.g. yarn.resourcemanager.fail-fast),
+ this value will be the default.
yarn.fail-fast
- true
+ false
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
index b7f1e6c..2888087 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
@@ -45,6 +45,7 @@
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ReservationId;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
+import org.apache.hadoop.yarn.conf.HAUtil;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher;
@@ -195,8 +196,12 @@ public void transition(RMStateStore store, RMStateStoreEvent event) {
} catch (Exception e) {
LOG.error("Error storing app: " + appId, e);
store.notifyStoreOperationFailed(e);
+ store.notifyApplication(
+ new RMAppEvent(appId, RMAppEventType.APP_NEW_SAVED,
+ "Fail to store application in state-store."
+ + " Application will be lost after RM restart. \n"));
}
- };
+ }
}
private static class UpdateAppTransition implements
@@ -222,6 +227,12 @@ public void transition(RMStateStore store, RMStateStoreEvent event) {
} catch (Exception e) {
LOG.error("Error updating app: " + appId, e);
store.notifyStoreOperationFailed(e);
+ if (((RMStateUpdateAppEvent) event).isNotifyApplication()) {
+ store.notifyApplication(
+ new RMAppEvent(appId, RMAppEventType.APP_UPDATE_SAVED,
+ "Fail to update application final state in state-store. \n"));
+
+ }
}
};
}
@@ -266,12 +277,18 @@ public void transition(RMStateStore store, RMStateStoreEvent event) {
}
store.storeApplicationAttemptStateInternal(attemptState.getAttemptId(),
attemptState);
- store.notifyApplicationAttempt(new RMAppAttemptEvent
- (attemptState.getAttemptId(),
- RMAppAttemptEventType.ATTEMPT_NEW_SAVED));
+ store.notifyApplicationAttempt(
+ new RMAppAttemptEvent(attemptState.getAttemptId(),
+ RMAppAttemptEventType.ATTEMPT_NEW_SAVED));
} catch (Exception e) {
LOG.error("Error storing appAttempt: " + attemptState.getAttemptId(), e);
store.notifyStoreOperationFailed(e);
+ store.notifyApplicationAttempt(
+ new RMAppAttemptEvent(attemptState.getAttemptId(),
+ RMAppAttemptEventType.ATTEMPT_NEW_SAVED,
+ "Fail to store app attempt in state-store."
+ + " Attempt will be lost after RM restart. \n"));
+
}
};
}
@@ -299,7 +316,12 @@ public void transition(RMStateStore store, RMStateStoreEvent event) {
} catch (Exception e) {
LOG.error("Error updating appAttempt: " + attemptState.getAttemptId(), e);
store.notifyStoreOperationFailed(e);
+ store.notifyApplicationAttempt(
+ new RMAppAttemptEvent(attemptState.getAttemptId(),
+ RMAppAttemptEventType.ATTEMPT_UPDATE_SAVED,
+ "Fail to update app attempt final state in state-store. \n"));
}
+
};
}
@@ -1013,18 +1035,20 @@ protected void handleStoreEvent(RMStateStoreEvent event) {
*/
protected void notifyStoreOperationFailed(Exception failureCause) {
LOG.error("State store operation failed ", failureCause);
- if (failureCause instanceof StoreFencedException) {
+ // HA enabled: update the fenced state and start the standby-transition
+ // thread, whatever the type of the failure.
+ if (HAUtil.isHAEnabled(getConfig())) {
+ LOG.warn("State-store fenced ! Transitioning RM to standby");
updateFencedState();
- Thread standByTransitionThread =
- new Thread(new StandByTransitionThread());
- standByTransitionThread.setName("StandByTransitionThread Handler");
- standByTransitionThread.start();
- } else {
- if (YarnConfiguration.shouldRMFailFast(getConfig())) {
- rmDispatcher.getEventHandler().handle(
- new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
- failureCause));
- }
+ Thread standbyThread = new Thread(new StandByTransitionThread());
+ standbyThread.setName("StandByTransitionThread Handler");
+ standbyThread.start();
+ return;
+ }
+ // No HA: raise a fatal RM event only when fail-fast is configured.
+ if (YarnConfiguration.shouldRMFailFast(getConfig())) {
+ LOG.warn("Fail RM now due to state-store error!");
+ rmDispatcher.getEventHandler().handle(
+ new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
+ failureCause));
+ return;
}
+ // Neither HA nor fail-fast: log the error and carry on.
+ LOG.warn("Skip the state-store error.");
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
index 0550087..945405e 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
@@ -1049,7 +1049,7 @@ public void run() {
LOG.info(VerifyActiveStatusThread.class.getName() + " thread " +
"interrupted! Exiting!");
} catch (Exception e) {
- notifyStoreOperationFailed(new StoreFencedException());
+ notifyStoreOperationFailed(e);
}
}
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEvent.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEvent.java
index a1c234c..b199d47 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEvent.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppEvent.java
@@ -24,13 +24,26 @@
public class RMAppEvent extends AbstractEvent{
private final ApplicationId appId;
+ // Optional message attached by the sender; null when none was given.
+ private final String diagnostics;
public RMAppEvent(ApplicationId appId, RMAppEventType type) {
- super(type);
- this.appId = appId;
+ this(appId, type, null);
+ }
+
+ /**
+ * Creates an event that additionally carries a diagnostics message.
+ *
+ * @param appId application the event refers to
+ * @param type kind of event
+ * @param diagnostics message to carry, or null for none
+ */
+ public RMAppEvent(ApplicationId appId, RMAppEventType type,
+ String diagnostics) {
+ super(type);
+ this.appId = appId;
+ this.diagnostics = diagnostics;
}
public ApplicationId getApplicationId() {
return this.appId;
}
+
+ /**
+ * @return the diagnostics message, or null if the event carries none
+ */
+ public String getDiagnostics() {
+ return this.diagnostics;
+ }
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
index 2eb74f7..3e8913a 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
@@ -962,6 +962,9 @@ public RMAppState transition(RMAppImpl app, RMAppEvent event) {
public void transition(RMAppImpl app, RMAppEvent event) {
app.handler.handle(new AppAddedSchedulerEvent(app.user,
app.submissionContext, false));
+ // Append the diagnostics carried by the event, if any, to the
+ // application's own diagnostics.
+ String eventDiagnostics = event.getDiagnostics();
+ if (eventDiagnostics != null) {
+ app.diagnostics.append(eventDiagnostics);
+ }
}
}
@@ -984,6 +987,9 @@ public RMAppState transition(RMAppImpl app, RMAppEvent event) {
((MultipleArcTransition) app.transitionTodo).transition(app,
app.eventCausingFinalSaving);
}
+ if (event.getDiagnostics() !=null ){
+ app.diagnostics.append(event.getDiagnostics());
+ }
return app.targetedFinalState;
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEvent.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEvent.java
index ad5c28a..024b2c4 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEvent.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptEvent.java
@@ -25,13 +25,27 @@
private final ApplicationAttemptId appAttemptId;
+ // Optional message attached by the sender; null when none was given.
+ private final String diagnostics;
+
public RMAppAttemptEvent(ApplicationAttemptId appAttemptId,
RMAppAttemptEventType type) {
- super(type);
- this.appAttemptId = appAttemptId;
+ this(appAttemptId, type, null);
+ }
+
+ /**
+ * Creates an event that additionally carries a diagnostics message.
+ *
+ * @param appAttemptId attempt the event refers to
+ * @param type kind of event
+ * @param diagnostics message to carry, or null for none
+ */
+ public RMAppAttemptEvent(ApplicationAttemptId appAttemptId,
+ RMAppAttemptEventType type, String diagnostics) {
+ super(type);
+ this.appAttemptId = appAttemptId;
+ this.diagnostics = diagnostics;
}
public ApplicationAttemptId getApplicationAttemptId() {
return this.appAttemptId;
}
+
+ /**
+ * @return the diagnostics message, or null if the event carries none
+ */
+ public String getDiagnostics() {
+ return this.diagnostics;
+ }
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
index 74a4000..b1088d5 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java
@@ -1026,6 +1026,9 @@ public void run() {
public void transition(RMAppAttemptImpl appAttempt,
RMAppAttemptEvent event) {
appAttempt.launchAttempt();
+ // Carry over any diagnostics attached to the event.
+ final String eventDiagnostics = event.getDiagnostics();
+ if (eventDiagnostics != null) {
+ appAttempt.diagnostics.append(eventDiagnostics);
+ }
}
}
@@ -1207,6 +1210,9 @@ public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
((MultipleArcTransition) appAttempt.transitionTodo).transition(
appAttempt, causeEvent);
}
+ if (event.getDiagnostics() !=null ) {
+ appAttempt.diagnostics.append(event.getDiagnostics());
+ }
return appAttempt.targetedFinalState;
}
}
@@ -1340,7 +1346,9 @@ public void transition(RMAppAttemptImpl appAttempt,
appAttempt.amrmToken =
appAttempt.rmContext.getAMRMTokenSecretManager().createAndGetAMRMToken(
appAttempt.applicationAttemptId);
-
+ if (event.getDiagnostics() != null) {
+ appAttempt.diagnostics.append(event.getDiagnostics());
+ }
super.transition(appAttempt, event);
}
}