diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 5a36bd1..2020193 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -313,7 +313,15 @@ private static void addDeprecatedKeys() { public static final String RM_AM_MAX_ATTEMPTS = RM_PREFIX + "am.max-attempts"; public static final int DEFAULT_RM_AM_MAX_ATTEMPTS = 2; - + + /** + * It's a global setting for all applications. + */ + public static final String RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL = + RM_PREFIX + "app-attempt.failure-interval"; + public static final long DEFAULT_RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL = + 10*60*1000; + /** The keytab for the resource manager.*/ public static final String RM_KEYTAB = RM_PREFIX + "keytab"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 9bbdb94..60eded5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -234,6 +234,20 @@ + When attemptFailuresValidityInterval in milliseconds is + set to > 0, the failure count will not include failures which happen + outside of the validityInterval. If the failure count + reaches maxAppAttempts, the application will be failed. It's a global + setting for all applications. Each application master can specify + its individual attemptFailuresValidityInterval via the API, but the + individual number cannot be smaller than the global value. 
If it is, + the resourcemanager will override it. The default number is set to 600000. + + yarn.resourcemanager.app-attempt.failure-interval + 600000 + + + How often to check that containers are still alive. yarn.resourcemanager.container.liveness-monitor.interval-ms 600000 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index 9220849..ad6069b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -428,8 +428,25 @@ public RMAppImpl(ApplicationId applicationId, RMContext rmContext, this.maxAppAttempts = individualMaxAppAttempts; } - this.attemptFailuresValidityInterval = + long globalAttemptFailuresValidityInterval = + conf.getLong( + YarnConfiguration.RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL, + YarnConfiguration.DEFAULT_RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL); + long individualAttemptFailuresValidityInterval = submissionContext.getAttemptFailuresValidityInterval(); + // Global interval is a floor. NOTE(review): non-positive per-app values are overridden too - confirm. + if (individualAttemptFailuresValidityInterval < globalAttemptFailuresValidityInterval) { + this.attemptFailuresValidityInterval = + globalAttemptFailuresValidityInterval; + LOG.warn("The specific AttemptFailuresValidityInterval: " + + individualAttemptFailuresValidityInterval + " for application: " + + applicationId.getId() + " is invalid, because it is smaller than " + + globalAttemptFailuresValidityInterval + + ". 
Use the global AttemptFailuresValidityInterval instead."); + } else { + this.attemptFailuresValidityInterval = + individualAttemptFailuresValidityInterval; + } if (this.attemptFailuresValidityInterval > 0) { LOG.info("The attemptFailuresValidityInterval for the application: " + this.applicationId + " is " + this.attemptFailuresValidityInterval