diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java index 63861be..d9e460b 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionUtils.java @@ -54,6 +54,9 @@ public class ConnectionUtils { if (ntries >= HConstants.RETRY_BACKOFF.length) { ntries = HConstants.RETRY_BACKOFF.length - 1; } + if (ntries < 0) { + ntries = 0; + } long normalPause = pause * HConstants.RETRY_BACKOFF[ntries]; long jitter = (long)(normalPause * RANDOM.nextFloat() * 0.01f); // 1% possible jitter diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java index 0d1fa02..ace9726 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionAdminServiceCallable.java @@ -153,7 +153,7 @@ public abstract class RegionAdminServiceCallable implements RetryingCallable< @Override public long sleep(long pause, int tries) { - long sleep = ConnectionUtils.getPauseTime(pause, tries + 1); + long sleep = ConnectionUtils.getPauseTime(pause, tries); if (sleep < MIN_WAIT_DEAD_SERVER && (location == null || connection.isDeadServer(location.getServerName()))) { sleep = ConnectionUtils.addJitter(MIN_WAIT_DEAD_SERVER, 0.10f); diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionServerCallable.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionServerCallable.java index 7c07a99..48b98e7 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionServerCallable.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RegionServerCallable.java @@ -142,8 +142,7 @@ public abstract class RegionServerCallable implements RetryingCallable { @Override public long sleep(long pause, int tries) { - // Tries hasn't been bumped up yet so we use "tries + 1" to get right pause time - long sleep = ConnectionUtils.getPauseTime(pause, tries + 1); + long sleep = ConnectionUtils.getPauseTime(pause, tries); if (sleep < MIN_WAIT_DEAD_SERVER && (location == null || getConnection().isDeadServer(location.getServerName()))) { sleep = ConnectionUtils.addJitter(MIN_WAIT_DEAD_SERVER, 0.10f); diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCaller.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCaller.java index 1d14f71..f8efee1 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCaller.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/RpcRetryingCaller.java @@ -158,8 +158,8 @@ public class RpcRetryingCaller { } // If the server is dead, we need to wait a little before retrying, to give // a chance to the regions to be - // tries hasn't been bumped up yet so we use "tries + 1" to get right pause time - expectedSleep = callable.sleep(pause, tries + 1); + // get right pause time, start by RETRY_BACKOFF[0] * pause + expectedSleep = callable.sleep(pause, tries); // If, after the planned sleep, there won't be enough time left, we stop now. long duration = singleCallDuration(expectedSleep); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java index 649d674..3d449ae 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestConnectionUtils.java @@ -19,6 +19,7 @@ */ package org.apache.hadoop.hbase.client; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.testclassification.SmallTests; import org.junit.Test; import org.junit.experimental.categories.Category; @@ -53,4 +54,23 @@ public class TestConnectionUtils { assertTrue(retyTimeSet.size() > (retries.length * 0.80)); } + @Test + public void testGetPauseTime() { + long pauseTime; + long baseTime = 100; + pauseTime = ConnectionUtils.getPauseTime(baseTime, -1); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[0])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[0] * 1.01f)); + + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { + pauseTime = ConnectionUtils.getPauseTime(baseTime, i); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[i])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[i] * 1.01f)); + } + + int length = HConstants.RETRY_BACKOFF.length; + pauseTime = ConnectionUtils.getPauseTime(baseTime, length); + assertTrue(pauseTime >= (baseTime * HConstants.RETRY_BACKOFF[length - 1])); + assertTrue(pauseTime <= (baseTime * HConstants.RETRY_BACKOFF[length - 1] * 1.01f)); + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java index 501d836..62ff3f0 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHCM.java @@ -347,6 +347,46 @@ public class TestHCM { } } + /** + * Test starting from 0 index when calculate the backoff time. + */ + @Test + public void testCallableSleep() throws Exception { + HTableDescriptor hdt = TEST_UTIL.createTableDescriptor("HCM-testCallableSleep"); + hdt.addCoprocessor(SleepAndFailFirstTime.class.getName()); + TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }).close(); + + Configuration c = new Configuration(TEST_UTIL.getConfiguration()); + c.setInt(HConstants.HBASE_CLIENT_PAUSE, 30 * 1000); + c.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 40 * 1000); + + Connection connection = ConnectionFactory.createConnection(c); + Table t = connection.getTable(TableName.valueOf("HCM-testCallableSleep")); + if (t instanceof HTable) { + HTable table = (HTable) t; + table.setOperationTimeout(80 * 1000); + // Check that it works. Because 20 + 30 * RETRY_BACKOFF[0] + 20 < 80 + table.get(new Get(FAM_NAM)); + + // Resetting and retrying. + SleepAndFailFirstTime.ct.set(0); + try { + table.setOperationTimeout(60 * 1000); + // Will fail this time. After sleep, there are not enough time for second retry + // Beacuse 20 + 30 * RETRY_BACKOFF[0] + 20 > 60 + table.get(new Get(FAM_NAM)); + Assert.fail("We expect an exception here"); + } catch (SocketTimeoutException e) { + LOG.info("We received an exception, as expected ", e); + } catch (IOException e) { + Assert.fail("Wrong exception:" + e.getMessage()); + } finally { + table.close(); + connection.close(); + } + } + } + private void testConnectionClose(boolean allowsInterrupt) throws Exception { TableName tableName = TableName.valueOf("HCM-testConnectionClose" + allowsInterrupt); TEST_UTIL.createTable(tableName, FAM_NAM).close(); @@ -1129,7 +1169,7 @@ public class TestHCM { } } - @Ignore ("Test presumes RETRY_BACKOFF will never change; it has") @Test + @Test public void testErrorBackoffTimeCalculation() throws Exception { // TODO: This test would seem to presume hardcoded RETRY_BACKOFF which it should not. final long ANY_PAUSE = 100; @@ -1149,40 +1189,23 @@ public class TestHCM { // Check some backoff values from HConstants sequence. tracker.reportServerError(location); - assertEqualsWithJitter(ANY_PAUSE, tracker.calculateBackoffTime(location, ANY_PAUSE)); + assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0], + tracker.calculateBackoffTime(location, ANY_PAUSE)); tracker.reportServerError(location); tracker.reportServerError(location); tracker.reportServerError(location); - assertEqualsWithJitter(ANY_PAUSE * 5, tracker.calculateBackoffTime(location, ANY_PAUSE)); + assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[3], + tracker.calculateBackoffTime(location, ANY_PAUSE)); // All of this shouldn't affect backoff for different location. assertEquals(0, tracker.calculateBackoffTime(diffLocation, ANY_PAUSE)); tracker.reportServerError(diffLocation); - assertEqualsWithJitter(ANY_PAUSE, tracker.calculateBackoffTime(diffLocation, ANY_PAUSE)); + assertEqualsWithJitter(ANY_PAUSE * HConstants.RETRY_BACKOFF[0], + tracker.calculateBackoffTime(diffLocation, ANY_PAUSE)); // Check with different base. - assertEqualsWithJitter(ANY_PAUSE * 10, + assertEqualsWithJitter(ANY_PAUSE * 2 * HConstants.RETRY_BACKOFF[3], tracker.calculateBackoffTime(location, ANY_PAUSE * 2)); - - // See that time from last error is taken into account. Time shift is applied after jitter, - // so pass the original expected backoff as the base for jitter. - long timeShift = (long)(ANY_PAUSE * 0.5); - timeMachine.setValue(timeBase + timeShift); - assertEqualsWithJitter((ANY_PAUSE * 5) - timeShift, - tracker.calculateBackoffTime(location, ANY_PAUSE), ANY_PAUSE * 2); - - // However we should not go into negative. - timeMachine.setValue(timeBase + ANY_PAUSE * 100); - assertEquals(0, tracker.calculateBackoffTime(location, ANY_PAUSE)); - - // We also should not go over the boundary; last retry would be on it. - long timeLeft = (long)(ANY_PAUSE * 0.5); - timeMachine.setValue(timeBase + largeAmountOfTime - timeLeft); - assertTrue(tracker.canRetryMore(1)); - tracker.reportServerError(location); - assertEquals(timeLeft, tracker.calculateBackoffTime(location, ANY_PAUSE)); - timeMachine.setValue(timeBase + largeAmountOfTime); - assertFalse(tracker.canRetryMore(1)); } finally { EnvironmentEdgeManager.reset(); }