diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 861bf34..febc3b5 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -326,7 +326,15 @@ private static void addDeprecatedKeys() {
public static final String RM_AM_MAX_ATTEMPTS =
RM_PREFIX + "am.max-attempts";
public static final int DEFAULT_RM_AM_MAX_ATTEMPTS = 2;
-
+
+ /**
+ * The global attempt-failures validity interval; it applies to all applications.
+ */
+ public static final String RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL =
+ RM_PREFIX + "app-attempt.failure-interval";
+ public static final long DEFAULT_RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL =
+ 10*60*1000;
+
/** The keytab for the resource manager.*/
public static final String RM_KEYTAB =
RM_PREFIX + "keytab";
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 3257d4a..7944277 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -198,6 +198,21 @@
+ When attemptFailuresValidityInterval (in milliseconds) is
+ set to > 0, the failure count will not include failures which happened
+ outside of the validityInterval. If the failure count
+ reaches maxAppAttempts, the application will be failed. It's a global
+ setting for all applications. Each application master can specify
+ its individual attemptFailuresValidityInterval (value should be > 0)
+ via the API, but the individual number cannot be smaller than the
+ global lower bound. If it is, the resourcemanager will override it.
+ The default number is set to 600000.
+
+ yarn.resourcemanager.app-attempt.failure-interval
+ 600000
+
+
+
How often to check that containers are still alive.
yarn.resourcemanager.container.liveness-monitor.interval-ms
600000
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
index 1173da2..f9de50e 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java
@@ -425,8 +425,26 @@ public RMAppImpl(ApplicationId applicationId, RMContext rmContext,
this.maxAppAttempts = individualMaxAppAttempts;
}
- this.attemptFailuresValidityInterval =
+ long globalAttemptFailuresValidityInterval =
+ conf.getLong(
+ YarnConfiguration.RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL,
+ YarnConfiguration.DEFAULT_RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL);
+ long individualAttemptFailuresValidityInterval =
submissionContext.getAttemptFailuresValidityInterval();
+ if (individualAttemptFailuresValidityInterval > 0 &&
+ individualAttemptFailuresValidityInterval <
+ globalAttemptFailuresValidityInterval) { // positive but below the global value
+ this.attemptFailuresValidityInterval =
+ globalAttemptFailuresValidityInterval;
+ LOG.warn("The specific AttemptFailuresValidityInterval: "
+ + individualAttemptFailuresValidityInterval + " for application: "
+ + applicationId.getId() + " is invalid, because it is smaller than "
+ + globalAttemptFailuresValidityInterval
+ + ". Use the global AttemptFailuresValidityInterval instead.");
+ } else { // non-positive, or not below the global value: keep as submitted
+ this.attemptFailuresValidityInterval =
+ individualAttemptFailuresValidityInterval;
+ }
if (this.attemptFailuresValidityInterval > 0) {
LOG.info("The attemptFailuresValidityInterval for the application: "
+ this.applicationId + " is " + this.attemptFailuresValidityInterval