diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 060635f..2d697fb 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -401,6 +401,10 @@ private static void addDeprecatedKeys() { public static final String RECOVERY_ENABLED = RM_PREFIX + "recovery.enabled"; public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false; + public static final String RM_FAIL_FAST = + RM_PREFIX + "fail-fast"; + public static final boolean DEFAULT_RM_FAIL_FAST = true; + @Private public static final String RM_WORK_PRESERVING_RECOVERY_ENABLED = RM_PREFIX + "work-preserving-recovery.enabled"; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index d586f51..bd1712f 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -324,6 +324,14 @@ + Should RM fail fast if it encounters any errors. Errors include: + 1) exceptions when state-store write/read operations fail. + + yarn.resourcemanager.fail-fast + true + + + Enable RM work preserving recovery. This configuration is private to YARN for experimenting the feature. 
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index 46c2954..d990c1a 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl; +import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.EventHandler; @@ -855,6 +856,7 @@ protected void handleStoreEvent(RMStateStoreEvent event) { * @param failureCause the exception due to which the operation failed */ protected void notifyStoreOperationFailed(Exception failureCause) { + LOG.error("State store operation failed ", failureCause); if (failureCause instanceof StoreFencedException) { updateFencedState(); Thread standByTransitionThread = @@ -862,8 +864,12 @@ protected void notifyStoreOperationFailed(Exception failureCause) { standByTransitionThread.setName("StandByTransitionThread Handler"); standByTransitionThread.start(); } else { - rmDispatcher.getEventHandler().handle( - new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, failureCause)); + if (getConfig().getBoolean(YarnConfiguration.RM_FAIL_FAST, + YarnConfiguration.DEFAULT_RM_FAIL_FAST)) { + 
rmDispatcher.getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, + failureCause)); + } } }