diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 861bf34..febc3b5 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -326,7 +326,15 @@ private static void addDeprecatedKeys() { public static final String RM_AM_MAX_ATTEMPTS = RM_PREFIX + "am.max-attempts"; public static final int DEFAULT_RM_AM_MAX_ATTEMPTS = 2; - + + /** + * Global attempt-failures validity interval, in milliseconds, for all applications. + */ + public static final String RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL = + RM_PREFIX + "app-attempt.failure-interval"; + public static final long DEFAULT_RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL = + 10*60*1000; + /** The keytab for the resource manager.*/ public static final String RM_KEYTAB = RM_PREFIX + "keytab"; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 3257d4a..7944277 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -198,6 +198,21 @@ + When attemptFailuresValidityInterval in milliseconds is + set to > 0, the failure count will not include failures which happen + outside of the validityInterval. If the failure count + reaches maxAppAttempts, the application will be failed. It's a global + setting for all applications. Each application master can specify + its individual attemptFailuresValidityInterval (value should be > 0) + via the API, but the individual number cannot be smaller than the + global value (a lower bound). 
If it is, the resourcemanager will override it. + The default number is set to 600000. + + yarn.resourcemanager.app-attempt.failure-interval + 600000 + + + How often to check that containers are still alive. yarn.resourcemanager.container.liveness-monitor.interval-ms 600000 diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index 1173da2..f9de50e 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -425,8 +425,26 @@ public RMAppImpl(ApplicationId applicationId, RMContext rmContext, this.maxAppAttempts = individualMaxAppAttempts; } - this.attemptFailuresValidityInterval = + long globalAttemptFailuresValidityInterval = + conf.getLong( + YarnConfiguration.RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL, + YarnConfiguration.DEFAULT_RM_APP_ATTEMPT_FAILURES_VALIDITY_INTERVAL); + long individualAttemptFailuresValidityInterval = submissionContext.getAttemptFailuresValidityInterval(); + if (individualAttemptFailuresValidityInterval > 0 && + individualAttemptFailuresValidityInterval < + globalAttemptFailuresValidityInterval) { + this.attemptFailuresValidityInterval = + globalAttemptFailuresValidityInterval; + LOG.warn("The specific AttemptFailuresValidityInterval: " + + individualAttemptFailuresValidityInterval + " for application: " + + applicationId.getId() + " is invalid, because it is smaller than " + + globalAttemptFailuresValidityInterval + + ". 
Use the global AttemptFailuresValidityInterval instead."); + } else { + this.attemptFailuresValidityInterval = + individualAttemptFailuresValidityInterval; + } if (this.attemptFailuresValidityInterval > 0) { LOG.info("The attemptFailuresValidityInterval for the application: " + this.applicationId + " is " + this.attemptFailuresValidityInterval