diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index c09c7f1ac2c..5527ac4a12c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -1602,8 +1602,10 @@ public ContainerState transition(final ContainerImpl container, } container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n"); } - if (container.shouldRetry(container.exitCode)) { + // Updates to the retry context should be protected from concurrent + // writes. It should only be called from this transition. + container.retryPolicy.updateRetryContext(container.windowRetryContext); container.storeRetryContext(); doRelaunch(container, container.windowRetryContext.getRemainingRetries(), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java index 02088794755..14243ffbcfe 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java @@ -42,8 +42,7 @@ public SlidingWindowRetryPolicy(Clock clock) { public boolean shouldRetry(RetryContext retryContext, int errorCode) { - ContainerRetryContext containerRC = retryContext - .containerRetryContext; + ContainerRetryContext containerRC = retryContext.containerRetryContext; Preconditions.checkNotNull(containerRC, "container retry context null"); ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy(); if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS @@ -53,38 +52,32 @@ public boolean shouldRetry(RetryContext retryContext, if (containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER) { return true; } - int pendingRetries = calculatePendingRetries(retryContext); - updateRetryContext(retryContext, pendingRetries); - return pendingRetries > 0; + return calculateRemainingRetries(retryContext) > 0 ; } return false; } /** - * Calculates the pending number of retries. - *
- * When failuresValidityInterval is > 0, it also removes time entries from
- *
+ * When failuresValidityInterval is > 0, it also removes time entries from
+ * restartTimes which are outside the validity interval.
+ * Calculates the remaining number of retries.
*
- * @return the pending retries.
+ * @return the remaining retries.
*/
- private int calculatePendingRetries(RetryContext retryContext) {
+ private int calculateRemainingRetries(RetryContext retryContext) {
ContainerRetryContext containerRC =
retryContext.containerRetryContext;
if (containerRC.getFailuresValidityInterval() > 0) {
- IteratorrestartTimes which are outside the validity interval.
*/
- private void updateRetryContext(RetryContext retryContext,
- int pendingRetries) {
- retryContext.setRemainingRetries(pendingRetries - 1);
- if (retryContext.containerRetryContext.getFailuresValidityInterval()
- > 0) {
- retryContext.getRestartTimes().add(clock.getTime());
+ protected void updateRetryContext(RetryContext retryContext) {
+ if (retryContext.containerRetryContext.getFailuresValidityInterval() > 0) {
+ ContainerRetryContext containerRC = retryContext.containerRetryContext;
+ Iterator