diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index c09c7f1ac2c..5527ac4a12c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -1602,8 +1602,10 @@ public ContainerState transition(final ContainerImpl container, } container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n"); } - if (container.shouldRetry(container.exitCode)) { + // Updates to the retry context should be protected from concurrent + // writes. It should only be called from this transition. + container.retryPolicy.updateRetryContext(container.windowRetryContext); container.storeRetryContext(); doRelaunch(container, container.windowRetryContext.getRemainingRetries(), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java index 02088794755..36a8b918c5d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java @@ -42,49 +42,40 @@ public SlidingWindowRetryPolicy(Clock clock) { public boolean shouldRetry(RetryContext retryContext, int errorCode) { - ContainerRetryContext containerRC = retryContext - .containerRetryContext; + ContainerRetryContext containerRC = retryContext.containerRetryContext; Preconditions.checkNotNull(containerRC, "container retry context null"); ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy(); if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS || (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES && containerRC.getErrorCodes() != null && containerRC.getErrorCodes().contains(errorCode))) { - if (containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER) { - return true; - } - int pendingRetries = calculatePendingRetries(retryContext); - updateRetryContext(retryContext, pendingRetries); - return pendingRetries > 0; + return containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER + || calculateRemainingRetries(retryContext) > 0; } return false; } /** - * Calculates the pending number of retries. - *
- * When failuresValidityInterval is > 0, it also removes time entries from
- *
+ * When failuresValidityInterval is > 0, it also removes time entries from
+ * restartTimes which are outside the validity interval.
+ * Calculates the remaining number of retries.
*
- * @return the pending retries.
+ * @return the remaining retries.
*/
- private int calculatePendingRetries(RetryContext retryContext) {
+ private int calculateRemainingRetries(RetryContext retryContext) {
ContainerRetryContext containerRC =
retryContext.containerRetryContext;
if (containerRC.getFailuresValidityInterval() > 0) {
- IteratorrestartTimes which are outside the validity interval.
*/
- private void updateRetryContext(RetryContext retryContext,
- int pendingRetries) {
- retryContext.setRemainingRetries(pendingRetries - 1);
- if (retryContext.containerRetryContext.getFailuresValidityInterval()
- > 0) {
- retryContext.getRestartTimes().add(clock.getTime());
+ protected void updateRetryContext(RetryContext retryContext) {
+ if (retryContext.containerRetryContext.getFailuresValidityInterval() > 0) {
+ ContainerRetryContext containerRC = retryContext.containerRetryContext;
+ Iterator