commit a7f46bf333ae1e08697795a40ec93316a6b08adc Author: Enis Soztutar Date: Wed Jan 14 15:45:46 2015 -0800 HBASE-12844 ServerManager.isServerReachable() should sleep between retries Conflicts: hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 697ba8c..6b2b00b 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -63,6 +63,8 @@ import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.R import org.apache.hadoop.hbase.regionserver.RegionOpeningState; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Triple; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.hadoop.hbase.util.RetryCounterFactory; import com.google.common.annotations.VisibleForTesting; import com.google.protobuf.ServiceException; @@ -138,6 +140,8 @@ public class ServerManager { private final long maxSkew; private final long warningSkew; + private final RetryCounterFactory pingRetryCounterFactory; + /** * Set of region servers which are dead but not processed immediately. If one * server died before master enables ServerShutdownHandler, the server will be @@ -197,6 +201,11 @@ public class ServerManager { maxSkew = c.getLong("hbase.master.maxclockskew", 30000); warningSkew = c.getLong("hbase.master.warningclockskew", 10000); this.connection = connect ? 
HConnectionManager.getConnection(c) : null; + int pingMaxAttempts = Math.max(1, master.getConfiguration().getInt( + "hbase.master.maximum.ping.server.attempts", 10)); + int pingSleepInterval = Math.max(1, master.getConfiguration().getInt( + "hbase.master.ping.server.retry.sleep.interval", 100)); + this.pingRetryCounterFactory = new RetryCounterFactory(pingMaxAttempts, pingSleepInterval); } /** @@ -772,9 +781,9 @@ public class ServerManager { */ public boolean isServerReachable(ServerName server) { if (server == null) throw new NullPointerException("Passed server is null"); - int maximumAttempts = Math.max(1, master.getConfiguration().getInt( - "hbase.master.maximum.ping.server.attempts", 10)); - for (int i = 0; i < maximumAttempts; i++) { + + RetryCounter retryCounter = pingRetryCounterFactory.create(); + while (retryCounter.shouldRetry()) { try { AdminService.BlockingInterface admin = getRsAdmin(server); if (admin != null) { @@ -783,8 +792,13 @@ public class ServerManager { && server.getStartcode() == info.getServerName().getStartCode(); } } catch (IOException ioe) { - LOG.debug("Couldn't reach " + server + ", try=" + i - + " of " + maximumAttempts, ioe); + LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() + + " of " + retryCounter.getMaxAttempts(), ioe); + try { + retryCounter.sleepUntilNextRetry(); + } catch(InterruptedException ie) { + Thread.currentThread().interrupt(); + } } } return false; diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java index 298feb5..64be2bd 100644 --- hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java +++ hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java @@ -93,6 +93,8 @@ public class TestAssignmentManagerOnCluster { MyRegionObserver.class, RegionObserver.class); // Reduce 
the maximum attempts to speed up the test conf.setInt("hbase.assignment.maximum.attempts", 3); + conf.setInt("hbase.master.maximum.ping.server.attempts", 3); + conf.setInt("hbase.master.ping.server.retry.sleep.interval", 1); TEST_UTIL.startMiniCluster(1, 4, null, MyMaster.class, MyRegionServer.class); admin = TEST_UTIL.getHBaseAdmin();