From 06c5be5bf6a866ad887c584868d435ba20b9f0f4 Mon Sep 17 00:00:00 2001 From: Mingliang Liu Date: Thu, 6 Sep 2018 23:01:52 -0700 Subject: [PATCH] HBASE-21164 reportForDuty should do backoff rather than retry Remove unused methods from Sleeper (its ok, its @Private). Remove notion of startTime from Sleeper handling (it is is unused). Allow passing in how long to sleep so can maintain externally. In HRS, use a RetryCounter to calculate backoff sleep time for when reportForDuty is failing against a struggling Master. --- .../org/apache/hadoop/hbase/util/Sleeper.java | 40 +++++------- .../apache/hadoop/hbase/master/HMaster.java | 3 +- .../hbase/regionserver/HRegionServer.java | 21 ++++-- .../TestRegionServerReportForDuty.java | 65 +++++++++++++++++++ 4 files changed, 96 insertions(+), 33 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java index 7d4d692e1a..403552c113 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.util; import org.apache.hadoop.hbase.Stoppable; +import org.apache.hadoop.util.Time; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,13 +50,6 @@ public class Sleeper { this.stopper = stopper; } - /** - * Sleep for period. - */ - public void sleep() { - sleep(System.currentTimeMillis()); - } - /** * If currently asleep, stops sleeping; if not asleep, will skip the next * sleep cycle. @@ -68,31 +62,27 @@ public class Sleeper { } /** - * Sleep for period adjusted by passed startTime - * @param startTime Time some task started previous to now. Time to sleep - * will be docked current time minus passed startTime. + * Sleep for period. */ - public void sleep(final long startTime) { + public void sleep() { + sleep(this.period); + } + + public void sleep(long sleepTime) { if (this.stopper.isStopped()) { return; } - long now = System.currentTimeMillis(); - long waitTime = this.period - (now - startTime); - if (waitTime > this.period) { - LOG.warn("Calculated wait time > " + this.period + - "; setting to this.period: " + System.currentTimeMillis() + ", " + - startTime); - waitTime = this.period; - } - while (waitTime > 0) { + long start = Time.monotonicNow(); + long currentSleepTime = sleepTime; + while (currentSleepTime > 0) { long woke = -1; try { synchronized (sleepLock) { if (triggerWake) break; - sleepLock.wait(waitTime); + sleepLock.wait(currentSleepTime); } - woke = System.currentTimeMillis(); - long slept = woke - now; + woke = Time.monotonicNow(); + long slept = woke - start; if (slept - this.period > MINIMAL_DELTA_FOR_LOGGING) { LOG.warn("We slept " + slept + "ms instead of " + this.period + "ms, this is likely due to a long " + @@ -107,8 +97,8 @@ public class Sleeper { } } // Recalculate waitTime. - woke = (woke == -1)? System.currentTimeMillis(): woke; - waitTime = this.period - (woke - startTime); + woke = (woke == -1)? Time.monotonicNow() : woke; + currentSleepTime = this.period - (woke - start); } synchronized(sleepLock) { triggerWake = false; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index f51496af23..6d7e2003cc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -2913,7 +2913,8 @@ public class HMaster extends HRegionServer implements MasterServices { } } - void checkServiceStarted() throws ServerNotRunningYetException { + @VisibleForTesting + protected void checkServiceStarted() throws ServerNotRunningYetException { if (!serviceStarted) { throw new ServerNotRunningYetException("Server is not running yet"); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 02815c532a..d6f1230ade 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -149,6 +149,8 @@ import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.JvmPauseMonitor; import org.apache.hadoop.hbase.util.NettyEventLoopGroupConfig; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil; import org.apache.hadoop.hbase.util.Sleeper; import org.apache.hadoop.hbase.util.Threads; @@ -940,14 +942,18 @@ public class HRegionServer extends HasThread implements this.rsHost = new RegionServerCoprocessorHost(this, this.conf); } - // Try and register with the Master; tell it we are here. Break if - // server is stopped or the clusterup flag is down or hdfs went wacky. - // Once registered successfully, go ahead and start up all Services. + // Try and register with the Master; tell it we are here. Break if server is stopped or the + // clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start + // up all Services. Use RetryCounter to get backoff in case Master is struggling to come up. + RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, + this.sleeper.getPeriod(), 1000 * 60); + RetryCounter rc = rcf.create(); while (keepLooping()) { RegionServerStartupResponse w = reportForDuty(); if (w == null) { - LOG.warn("reportForDuty failed; sleeping and then retrying."); - this.sleeper.sleep(); + long sleepTime = rc.getBackoffTimeAndIncrementAttempts(); + LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime); + this.sleeper.sleep(sleepTime); } else { handleReportForDutyResponse(w); break; @@ -2582,14 +2588,15 @@ public class HRegionServer extends HasThread implements return !this.stopped && isClusterUp(); } - /* + /** * Let the master know we're here Run initialization using parameters passed * us by the master. * @return A Map of key/value configurations we got from the Master else * null if we failed to register. * @throws IOException */ - private RegionServerStartupResponse reportForDuty() throws IOException { + @VisibleForTesting + RegionServerStartupResponse reportForDuty() throws IOException { if (this.masterless) return RegionServerStartupResponse.getDefaultInstance(); ServerName masterServerName = createRegionServerStatusStub(true); if (masterServerName == null) return null; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java index 83632be678..7281d34671 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerReportForDuty.java @@ -21,6 +21,8 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; +import java.util.concurrent.atomic.AtomicInteger; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; @@ -28,8 +30,11 @@ import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.LocalHBaseCluster; import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException; +import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.LoadBalancer; import org.apache.hadoop.hbase.master.ServerManager; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread; @@ -77,6 +82,66 @@ public class TestRegionServerReportForDuty { testUtil.shutdownMiniDFSCluster(); } + /** + * This test HRegionServer class will maintain a static counter for number of reportForDuty(). + */ + public static class CountReportDutyRegionServer extends HRegionServer { + static final AtomicInteger reportForDutyCount = new AtomicInteger(0); + public CountReportDutyRegionServer(Configuration conf) throws IOException { + super(conf); + } + + @Override + RegionServerStartupResponse reportForDuty() throws IOException { + reportForDutyCount.incrementAndGet(); + return super.reportForDuty(); + } + } + + /** + * This test HMaster class will always throw ServerNotRunningYetException if checked. + */ + public static class NeverInitializedMaster extends HMaster { + public NeverInitializedMaster(Configuration conf) throws IOException, KeeperException { + super(conf); + } + + @Override + protected void checkServiceStarted() throws ServerNotRunningYetException { + throw new ServerNotRunningYetException("Server is not running yet"); + } + } + + /** + * Tests region server should backoff to report for duty if master is not ready. + */ + @Test + public void testReportForDutyBackoff() throws IOException, InterruptedException { + Configuration conf = cluster.getConfiguration(); + conf.set(HConstants.MASTER_IMPL, NeverInitializedMaster.class.getName()); + master = cluster.addMaster(); + master.start(); + + conf.set(HConstants.REGION_SERVER_IMPL, CountReportDutyRegionServer.class.getName()); + // Set sleep interval relatively low so that exponential backoff is more demanding. + int msginterval = 100; + conf.setInt("hbase.regionserver.msginterval", msginterval); + rs = cluster.addRegionServer(); + rs.start(); + + int interval = 10_000; + Thread.sleep(interval); + + int actualRetry = CountReportDutyRegionServer.reportForDutyCount.get(); + int expectedRetry = (int)Math.ceil(Math.log(interval - msginterval)); + // Following asserts the actual retry number is in range (expectedRetry/2, expectedRetry*2). + // Ideally we can assert the exact retry count. We relax here to tolerate contention error. + assertTrue(String.format("reportForDuty retries %d times, less than expected min %d", + actualRetry, expectedRetry / 2), actualRetry > expectedRetry / 2); + assertTrue(String.format("reportForDuty retries %d times, more than expected max %d", + actualRetry, expectedRetry * 2), actualRetry < expectedRetry * 2); + } + /** * Tests region sever reportForDuty with backup master becomes primary master after * the first master goes away. -- 2.19.0