diff --git a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 8a9b938..8e628cb 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -1681,6 +1681,8 @@ public class AssignmentManager extends ZooKeeperListener { boolean hijack) { boolean regionAlreadyInTransitionException = false; boolean serverNotRunningYet = false; + boolean socketTimeoutException = false; + long maxRegionServerStartupWaitTime = -1; for (int i = 0; i < this.maximumAssignmentAttempts; i++) { int versionOfOfflineNode = -1; @@ -1776,6 +1778,8 @@ public class AssignmentManager extends ZooKeeperListener { } regionAlreadyInTransitionException = false; serverNotRunningYet = false; + socketTimeoutException = false; + if (t instanceof RegionAlreadyInTransitionException) { regionAlreadyInTransitionException = true; if (LOG.isDebugEnabled()) { @@ -1811,7 +1815,10 @@ public class AssignmentManager extends ZooKeeperListener { + " has timed out when trying to assign " + region.getRegionNameAsString() + ", but the region might already be opened on " - + plan.getDestination() + ".", t); + + plan.getDestination() + ", retrying...", t); + // wait and reset the re-try count, server might be just busy. + Thread.sleep(100); + socketTimeoutException = true; i--; // reset the retry } LOG.warn("Failed assignment of " @@ -1819,9 +1826,10 @@ public class AssignmentManager extends ZooKeeperListener { + " to " + plan.getDestination() + ", trying to assign " - + (regionAlreadyInTransitionException || serverNotRunningYet + + (regionAlreadyInTransitionException || serverNotRunningYet || socketTimeoutException ? "to the same region server because of " - + "RegionAlreadyInTransitionException/ServerNotRunningYetException;" + + "RegionAlreadyInTransitionException/ServerNotRunningYetException/" + + "SocketTimeoutException;" : "elsewhere instead; ") + "retry=" + i, t); // Clean out plan we failed execute and one that doesn't look like it'll