From c282e5a7ff22ab07d644ad16364afcac006ce127 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Thu, 6 Sep 2018 15:46:23 -0700 Subject: [PATCH] HBASE-21164 reportForDuty should do backoff rather than retry every 3 seconds (default). Remove unused methods from Sleeper (it's ok, it's @Private). Remove notion of startTime from Sleeper handling (it is unused). Allow passing in how long to sleep so the caller can maintain it externally. In HRS, use a RetryCounter to calculate backoff sleep time for when reportForDuty is failing against a struggling Master. --- .../java/org/apache/hadoop/hbase/util/Sleeper.java | 31 +++++++--------------- .../hadoop/hbase/regionserver/HRegionServer.java | 16 +++++++---- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java index 7d4d692e1a..93ef08cc60 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/Sleeper.java @@ -49,13 +49,6 @@ public class Sleeper { this.stopper = stopper; } - /** - * Sleep for period. - */ - public void sleep() { - sleep(System.currentTimeMillis()); - } - /** * If currently asleep, stops sleeping; if not asleep, will skip the next * sleep cycle. @@ -68,28 +61,24 @@ public class Sleeper { } /** - * Sleep for period adjusted by passed startTime - * @param startTime Time some task started previous to now. Time to sleep - * will be docked current time minus passed startTime. + * Sleep for period. 
*/ - public void sleep(final long startTime) { + public void sleep() { + sleep(this.period); + } + + public void sleep(long sleepTime) { if (this.stopper.isStopped()) { return; } long now = System.currentTimeMillis(); - long waitTime = this.period - (now - startTime); - if (waitTime > this.period) { - LOG.warn("Calculated wait time > " + this.period + - "; setting to this.period: " + System.currentTimeMillis() + ", " + - startTime); - waitTime = this.period; - } - while (waitTime > 0) { + long currentSleepTime = sleepTime; + while (currentSleepTime > 0) { long woke = -1; try { synchronized (sleepLock) { if (triggerWake) break; - sleepLock.wait(currentSleepTime); + sleepLock.wait(currentSleepTime); } woke = System.currentTimeMillis(); long slept = woke - now; @@ -108,7 +97,7 @@ public class Sleeper { } // Recalculate waitTime. woke = (woke == -1)? System.currentTimeMillis(): woke; - waitTime = this.period - (woke - startTime); + currentSleepTime = sleepTime - (woke - now); /* time left of the requested sleepTime, not of this.period */ } synchronized(sleepLock) { triggerWake = false; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 5cd649990f..808b787b03 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -149,6 +149,8 @@ import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.JvmPauseMonitor; import org.apache.hadoop.hbase.util.NettyEventLoopGroupConfig; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil; import org.apache.hadoop.hbase.util.Sleeper; import org.apache.hadoop.hbase.util.Threads; @@ -940,14 +942,18 @@ public class HRegionServer extends HasThread implements this.rsHost 
= new RegionServerCoprocessorHost(this, this.conf); } - // Try and register with the Master; tell it we are here. Break if - // server is stopped or the clusterup flag is down or hdfs went wacky. - // Once registered successfully, go ahead and start up all Services. + // Try and register with the Master; tell it we are here. Break if server is stopped or the + // clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start + // up all Services. Use RetryCounter to get backoff in case Master is struggling to come up. + RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, + this.sleeper.getPeriod(), 1000 * 60 * 5); + RetryCounter rc = rcf.create(); while (keepLooping()) { RegionServerStartupResponse w = reportForDuty(); if (w == null) { - LOG.warn("reportForDuty failed; sleeping and then retrying."); - this.sleeper.sleep(); + long sleepTime = rc.getBackoffTimeAndIncrementAttempts(); + LOG.warn("reportForDuty failed; sleeping {} ms and then retrying.", sleepTime); + this.sleeper.sleep(sleepTime); } else { handleReportForDutyResponse(w); break; -- 2.16.3