diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 060635f..9832729 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -401,6 +401,11 @@ private static void addDeprecatedKeys() {
public static final String RECOVERY_ENABLED = RM_PREFIX + "recovery.enabled";
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
+ public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast"; // config key of the YARN-wide fail-fast switch
+ public static final boolean DEFAULT_YARN_FAIL_FAST = true; // default passed to getBoolean for YARN_FAIL_FAST in shouldRMFailFast
+
+ public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast"; // RM-specific key; no default constant of its own, shouldRMFailFast falls back to YARN_FAIL_FAST
+
@Private
public static final String RM_WORK_PRESERVING_RECOVERY_ENABLED = RM_PREFIX
+ "work-preserving-recovery.enabled";
@@ -2018,6 +2023,12 @@ public static boolean useHttps(Configuration conf) {
YARN_HTTP_POLICY_DEFAULT));
}
+ public static boolean shouldRMFailFast(Configuration conf) { // resolves the fail-fast flag for the RM from conf
+ return conf.getBoolean(YarnConfiguration.RM_FAIL_FAST, // RM-specific key is looked up first
+ conf.getBoolean(YarnConfiguration.YARN_FAIL_FAST, // its default: the value of the YARN-wide key
+ YarnConfiguration.DEFAULT_YARN_FAIL_FAST)); // whose own default is DEFAULT_YARN_FAIL_FAST (true)
+ }
+
@Private
public static String getClusterId(Configuration conf) {
String clusterId = conf.get(YarnConfiguration.RM_CLUSTER_ID);
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index d586f51..8b3a3af 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -324,6 +324,22 @@
+ Should RM fail fast if it encounters any errors. By default, it
+ points to ${yarn.fail-fast}. Errors include:
+ 1) exceptions when state-store write/read operations fail.
+
+ yarn.resourcemanager.fail-fast
+ ${yarn.fail-fast}
+
+
+
+ Should YARN fail fast if it encounters any errors.
+
+ yarn.fail-fast
+ true
+
+
+
Enable RM work preserving recovery. This configuration is private
to YARN for experimenting the feature.
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
index 46c2954..9b17bf7 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
@@ -44,6 +44,7 @@
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
@@ -855,6 +856,7 @@ protected void handleStoreEvent(RMStateStoreEvent event) {
* @param failureCause the exception due to which the operation failed
*/
protected void notifyStoreOperationFailed(Exception failureCause) {
+ LOG.error("State store operation failed ", failureCause);
if (failureCause instanceof StoreFencedException) {
updateFencedState();
Thread standByTransitionThread =
@@ -862,8 +864,11 @@ protected void notifyStoreOperationFailed(Exception failureCause) {
standByTransitionThread.setName("StandByTransitionThread Handler");
standByTransitionThread.start();
} else {
- rmDispatcher.getEventHandler().handle(
- new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, failureCause));
+ if (YarnConfiguration.shouldRMFailFast(getConfig())) {
+ rmDispatcher.getEventHandler().handle(
+ new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
+ failureCause));
+ }
}
}