Index: hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java =================================================================== --- hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java (revision 1539514) +++ hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerAllocator.java (working copy) @@ -570,6 +570,10 @@ if (response.getAMCommand() != null) { switch(response.getAMCommand()) { case AM_RESYNC: + LOG.warn("ApplicationMaster is out of sync with ResourceManager," + + " hence resyncing."); + lastResponseID = 0; + break; case AM_SHUTDOWN: // This can happen if the RM has been restarted. If it is in that state, // this application must clean itself up. Index: hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java =================================================================== --- hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java (revision 1539514) +++ hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/rm/RMContainerRequestor.java (working copy) @@ -38,6 +38,7 @@ import org.apache.hadoop.mapreduce.v2.app.client.ClientService; import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest; import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; +import org.apache.hadoop.yarn.api.records.AMCommand; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Resource; @@ -56,7 +57,7 @@ private static final Log LOG = LogFactory.getLog(RMContainerRequestor.class); - private int lastResponseID; + protected int lastResponseID; private Resource availableResources; private final RecordFactory recordFactory = @@ -163,33 +164,44 @@ } catch (YarnException e) { throw new IOException(e); } - lastResponseID = allocateResponse.getResponseId(); - availableResources = allocateResponse.getAvailableResources(); - lastClusterNmCount = clusterNmCount; - clusterNmCount = allocateResponse.getNumClusterNodes(); + if (!isResyncCommand(allocateResponse)) { + lastResponseID = allocateResponse.getResponseId(); + availableResources = allocateResponse.getAvailableResources(); + lastClusterNmCount = clusterNmCount; + clusterNmCount = allocateResponse.getNumClusterNodes(); - if (ask.size() > 0 || release.size() > 0) { - LOG.info("getResources() for " + applicationId + ":" + " ask=" - + ask.size() + " release= " + release.size() + " newContainers=" - + allocateResponse.getAllocatedContainers().size() - + " finishedContainers=" - + allocateResponse.getCompletedContainersStatuses().size() - + " resourcelimit=" + availableResources + " knownNMs=" - + clusterNmCount); - } + if (ask.size() > 0 || release.size() > 0) { + LOG.info("getResources() for " + applicationId + ":" + " ask=" + + ask.size() + " release= " + release.size() + " newContainers=" + + allocateResponse.getAllocatedContainers().size() + + " finishedContainers=" + + allocateResponse.getCompletedContainersStatuses().size() + + " resourcelimit=" + availableResources + " knownNMs=" + + clusterNmCount); + } - ask.clear(); - release.clear(); + ask.clear(); + release.clear(); - if (blacklistAdditions.size() > 0 || blacklistRemovals.size() > 0) { - LOG.info("Update the blacklist for " + applicationId + - ": blacklistAdditions=" + blacklistAdditions.size() + - " blacklistRemovals=" + blacklistRemovals.size()); + if (blacklistAdditions.size() > 0 || blacklistRemovals.size() > 0) { + LOG.info("Update the blacklist for " + applicationId + + ": blacklistAdditions=" + blacklistAdditions.size() + + " blacklistRemovals=" + blacklistRemovals.size()); + } + blacklistAdditions.clear(); + blacklistRemovals.clear(); + } else { + LOG.warn("ApplicationMaster is out of sync with ResourceManager," + + " hence resyncing."); } - blacklistAdditions.clear(); - blacklistRemovals.clear(); return allocateResponse; } + + private boolean isResyncCommand(AllocateResponse allocateResponse) + { + return allocateResponse.getAMCommand() != null + && allocateResponse.getAMCommand() == AMCommand.AM_RESYNC; + } // May be incorrect if there's multiple NodeManagers running on a single host. // knownNodeCount is based on node managers, not hosts. blacklisting is