diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ServerCallable.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ServerCallable.java index 278339d..e9bfff9 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ServerCallable.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ServerCallable.java @@ -174,7 +174,8 @@ public abstract class ServerCallable implements Callable { prepare(tries != 0); // if called with false, check table status on ZK return call(); } catch (Throwable t) { - LOG.warn("Call exception, tries=" + tries + ", numRetries=" + numRetries, t); + LOG.warn("Call exception, tries=" + tries + ", numRetries=" + numRetries + ", retryTime=" + + (System.currentTimeMillis() - this.globalStartTime) + "ms", t); t = translateException(t); // translateException throws an exception when we should not retry, i.e. when it's the diff --git a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestClientNoCluster.java b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestClientNoCluster.java index dbd4f7d..50eb9d3 100644 --- a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestClientNoCluster.java +++ b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestClientNoCluster.java @@ -37,6 +37,7 @@ import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.ClientService.Blo import org.apache.hadoop.hbase.util.Bytes; import org.junit.Before; import org.junit.Test; +import org.junit.Ignore; import org.mockito.Mockito; import com.google.protobuf.RpcController; @@ -92,6 +93,36 @@ public class TestClientNoCluster { } /** + * Remove the @Ignore to try out timeout and retry settings + * @throws IOException + */ + @Ignore + @Test + public void testTimeoutAndRetries() throws IOException { + Configuration localConfig = HBaseConfiguration.create(this.conf); + // This override mocks up our exists/get call to throw a RegionServerStoppedException. 
+ localConfig.set("hbase.client.connection.impl", RpcTimeoutConnection.class.getName()); + HTable table = new HTable(localConfig, HConstants.META_TABLE_NAME); + Throwable t = null; + LOG.info("Start"); + try { + // An exists call turns into a get w/ a flag. + table.exists(new Get(Bytes.toBytes("abc"))); + } catch (SocketTimeoutException e) { + // I expect this exception. + LOG.info("Got expected exception", e); + t = e; + } catch (RetriesExhaustedException e) { + // This is the old, unwanted behavior. If we get here FAIL!!! + fail(); + } finally { + table.close(); + } + LOG.info("Stop"); + assertTrue(t != null); + } + + /** * Test that operation timeout prevails over rpc default timeout and retries, etc. * @throws IOException */ @@ -102,7 +133,7 @@ public class TestClientNoCluster { localConfig.set("hbase.client.connection.impl", RpcTimeoutConnection.class.getName()); int pause = 10; localConfig.setInt("hbase.client.pause", pause); - localConfig.setInt("hbase.client.retries.number", 10); + localConfig.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 10); // Set the operation timeout to be < the pause. Expectation is that after first pause, we will // fail out of the rpc because the rpc timeout will have been set to the operation tiemout // and it has expired. 
Otherwise, if this functionality is broke, all retries will be run -- @@ -263,4 +294,4 @@ public class TestClientNoCluster { return this.stub; } } -} +} \ No newline at end of file diff --git a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestSnapshotFromAdmin.java b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestSnapshotFromAdmin.java index 0493192..5b10edb 100644 --- a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestSnapshotFromAdmin.java +++ b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestSnapshotFromAdmin.java @@ -54,13 +54,14 @@ public class TestSnapshotFromAdmin { */ @Test(timeout = 60000) public void testBackoffLogic() throws Exception { - final int maxWaitTime = 7500; - final int numRetries = 10; - final int pauseTime = 500; + final int pauseTime = 100; + final int maxWaitTime = + HConstants.RETRY_BACKOFF[HConstants.RETRY_BACKOFF.length - 1] * pauseTime; + final int numRetries = HConstants.RETRY_BACKOFF.length; // calculate the wait time, if we just do straight backoff (ignoring the expected time from // master) long ignoreExpectedTime = 0; - for (int i = 0; i < 6; i++) { + for (int i = 0; i < HConstants.RETRY_BACKOFF.length; i++) { ignoreExpectedTime += HConstants.RETRY_BACKOFF[i] * pauseTime; } // the correct wait time, capping at the maxTime/tries + fudge room diff --git a/hbase-client/src/test/resources/hbase-site.xml b/hbase-client/src/test/resources/hbase-site.xml index ffeb0ef..ab4d1cd 100644 --- a/hbase-client/src/test/resources/hbase-site.xml +++ b/hbase-client/src/test/resources/hbase-site.xml @@ -25,13 +25,4 @@ hbase.defaults.for.version.skip true - - hbase.client.retries.number - 5 - Maximum retries. Used as maximum for all retryable - operations such as fetching of the root region from root region - server, getting a cell's value, starting a row update, etc. - Default: 10. 
- - diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java index 7001ee9..fab7bbe 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -492,10 +492,13 @@ public final class HConstants { public static final String CONFIGURATION = "CONFIGURATION"; /** - * This is a retry backoff multiplier table similar to the BSD TCP syn - * backoff table, a bit more aggressive than simple exponential backoff. + * Retrying we multiply hbase.client.pause setting by what we have in this array until we + * run out of array items. Retries beyond this use the last number in the array. So, for + * example, if hbase.client.pause is 1 second, and maximum retries count + * hbase.client.retries.number is 10, we will retry at the following intervals: + * 1, 2, 3, 5, 10, 100, 100, 100, 100, 100. */ - public static int RETRY_BACKOFF[] = { 1, 1, 1, 2, 2, 4, 4, 8, 16, 32, 64 }; + public static int RETRY_BACKOFF[] = { 1, 2, 3, 5, 10, 100 }; public static final String REGION_IMPL = "hbase.hregion.impl"; @@ -574,7 +577,7 @@ public final class HConstants { /** * Default value of {@link #HBASE_CLIENT_RETRIES_NUMBER}. */ - public static int DEFAULT_HBASE_CLIENT_RETRIES_NUMBER = 20; + public static int DEFAULT_HBASE_CLIENT_RETRIES_NUMBER = 31; /** * Parameter name for client prefetch limit, used as the maximum number of regions @@ -729,7 +732,7 @@ public final class HConstants { public static final boolean DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG = false; /** Conf key that specifies timeout value to wait for a region ready */ - public static final String LOG_REPLAY_WAIT_REGION_TIMEOUT = + public static final String LOG_REPLAY_WAIT_REGION_TIMEOUT = "hbase.master.log.replay.wait.region.timeout"; /** @@ -796,7 +799,7 @@ public final class HConstants { /* Name of old snapshot directory. 
See HBASE-8352 for details on why it needs to be renamed */ public static final String OLD_SNAPSHOT_DIR_NAME = ".snapshot"; - + /** Temporary directory used for table creation and deletion */ public static final String HBASE_TEMP_DIRECTORY = ".tmp"; diff --git a/hbase-common/src/main/resources/hbase-default.xml b/hbase-common/src/main/resources/hbase-default.xml index 1a4ea23..7eac982 100644 --- a/hbase-common/src/main/resources/hbase-default.xml +++ b/hbase-common/src/main/resources/hbase-default.xml @@ -429,7 +429,7 @@ possible configurations would overwhelm and obscure the important. hbase.client.retries.number - 14 + 35 Maximum retries. Used as maximum for all retryable operations such as the getting of a cell's value, starting a row update, etc. Retry interval is a rough function based on hbase.client.pause. At diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestClientScannerRPCTimeout.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestClientScannerRPCTimeout.java index a5a05a1..d761b58 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestClientScannerRPCTimeout.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestClientScannerRPCTimeout.java @@ -68,6 +68,7 @@ public class TestClientScannerRPCTimeout { conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, rpcTimeout); conf.setStrings(HConstants.REGION_SERVER_IMPL, RegionServerWithScanTimeout.class.getName()); conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, CLIENT_RETRIES_NUMBER); + conf.setInt(HConstants.HBASE_CLIENT_PAUSE, 1000); TEST_UTIL.startMiniCluster(1); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java deleted file mode 100644 index c1d0f3d..0000000 --- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.regionserver; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HRegionInfo; -import org.apache.hadoop.hbase.HTableDescriptor; -import org.apache.hadoop.hbase.MiniHBaseCluster; -import org.apache.hadoop.hbase.ServerName; -import org.apache.hadoop.hbase.client.HBaseAdmin; -import org.apache.hadoop.hbase.client.HTable; -import org.apache.hadoop.hbase.client.Put; -import org.apache.hadoop.hbase.client.ResultScanner; -import org.apache.hadoop.hbase.client.Scan; -import 
org.apache.hadoop.hbase.master.HMaster; -import org.apache.hadoop.hbase.master.MasterFileSystem; -import org.apache.hadoop.hbase.master.ServerManager; -import org.apache.hadoop.hbase.master.TestMasterFailover; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; -import org.apache.hadoop.hbase.util.Threads; -import org.apache.hadoop.hbase.zookeeper.ZKAssign; -import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; -import org.apache.hadoop.hbase.LargeTests; -import org.apache.zookeeper.KeeperException; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.experimental.categories.Category; - -@Category(LargeTests.class) -public class TestRSKilledWhenMasterInitializing { - private static final Log LOG = LogFactory.getLog(TestMasterFailover.class); - - private static final HBaseTestingUtility TESTUTIL = new HBaseTestingUtility(); - private static final int NUM_MASTERS = 1; - private static final int NUM_RS = 4; - - @BeforeClass - public static void setUpBeforeClass() throws Exception { - // Set it so that this test runs with my custom master - Configuration conf = TESTUTIL.getConfiguration(); - conf.setClass(HConstants.MASTER_IMPL, TestingMaster.class, HMaster.class); - conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 3); - conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 4); - - // Start up the cluster. - TESTUTIL.startMiniCluster(NUM_MASTERS, NUM_RS); - } - - @AfterClass - public static void tearDownAfterClass() throws Exception { - if (!TESTUTIL.getHBaseCluster().getMaster().isInitialized()) { - // master is not initialized and is waiting something forever. - for (MasterThread mt : TESTUTIL.getHBaseCluster().getLiveMasterThreads()) { - mt.interrupt(); - } - } - TESTUTIL.shutdownMiniCluster(); - } - - /** - * An HMaster instance used in this test. 
If 'TestingMaster.sleep' is set in - * the Configuration, then we'll sleep after log is split and we'll also - * return a custom RegionServerTracker. - */ - public static class TestingMaster extends HMaster { - private boolean logSplit = false; - - public TestingMaster(Configuration conf) throws IOException, - KeeperException, InterruptedException { - super(conf); - } - } - - @Test(timeout = 120000) - public void testCorrectnessWhenMasterFailOver() throws Exception { - final byte[] TABLENAME = Bytes.toBytes("testCorrectnessWhenMasterFailOver"); - final byte[] FAMILY = Bytes.toBytes("family"); - final byte[][] SPLITKEYS = { Bytes.toBytes("b"), Bytes.toBytes("i") }; - - MiniHBaseCluster cluster = TESTUTIL.getHBaseCluster(); - - HTableDescriptor desc = new HTableDescriptor(TABLENAME); - desc.addFamily(new HColumnDescriptor(FAMILY)); - HBaseAdmin hbaseAdmin = TESTUTIL.getHBaseAdmin(); - hbaseAdmin.createTable(desc, SPLITKEYS); - - assertTrue(hbaseAdmin.isTableAvailable(TABLENAME)); - - HTable table = new HTable(TESTUTIL.getConfiguration(), TABLENAME); - List puts = new ArrayList(); - Put put1 = new Put(Bytes.toBytes("a")); - put1.add(FAMILY, Bytes.toBytes("q1"), Bytes.toBytes("value")); - Put put2 = new Put(Bytes.toBytes("h")); - put2.add(FAMILY, Bytes.toBytes("q1"), Bytes.toBytes("value")); - Put put3 = new Put(Bytes.toBytes("o")); - put3.add(FAMILY, Bytes.toBytes("q1"), Bytes.toBytes("value")); - puts.add(put1); - puts.add(put2); - puts.add(put3); - table.put(puts); - ResultScanner resultScanner = table.getScanner(new Scan()); - int count = 0; - while (resultScanner.next() != null) { - count++; - } - resultScanner.close(); - table.close(); - assertEquals(3, count); - - /* Starting test */ - cluster.getConfiguration().setBoolean("TestingMaster.sleep", true); - cluster.getConfiguration().setInt("TestingMaster.sleep.duration", 10000); - - /* NO.1 .META. 
region correctness */ - // First abort master - abortMaster(cluster); - TestingMaster master = startMasterAndWaitTillMetaRegionAssignment(cluster); - - // Second kill meta server - int metaServerNum = cluster.getServerWithMeta(); - HRegionServer metaRS = cluster.getRegionServer(metaServerNum); - LOG.debug("Killing metaRS"); - metaRS.kill(); - metaRS.join(); - - /* - * Sleep double time of TestingMaster.sleep.duration, so we can ensure that - * master has already assigned META or is blocking on assigning - * META - */ - Thread.sleep(10000 * 2); - - waitUntilMasterIsInitialized(master); - - // Third check whether data is correct in meta region - assertTrue(hbaseAdmin.isTableAvailable(TABLENAME)); - - /* NO.2 data region correctness */ - ServerManager serverManager = cluster.getMaster().getServerManager(); - while (serverManager.areDeadServersInProgress()) { - Thread.sleep(100); - } - // Create a ZKW to use in the test - ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TESTUTIL); - ZKAssign.blockUntilNoRIT(zkw); - - table = new HTable(TESTUTIL.getConfiguration(), TABLENAME); - resultScanner = table.getScanner(new Scan()); - count = 0; - while (resultScanner.next() != null) { - count++; - } - resultScanner.close(); - table.close(); - assertEquals(3, count); - } - - private void abortMaster(MiniHBaseCluster cluster) - throws InterruptedException { - for (MasterThread mt : cluster.getLiveMasterThreads()) { - if (mt.getMaster().isActiveMaster()) { - mt.getMaster().abort("Aborting for tests", new Exception("Trace info")); - mt.join(); - break; - } - } - LOG.debug("Master is aborted"); - } - - private TestingMaster startMasterAndWaitTillMetaRegionAssignment(MiniHBaseCluster cluster) - throws IOException, InterruptedException { - TestingMaster master = (TestingMaster) cluster.startMaster().getMaster(); - while (!master.isInitializationStartsMetaRegionAssignment()) { - Thread.sleep(100); - } - return master; - } - - private void 
waitUntilMasterIsInitialized(HMaster master) - throws InterruptedException { - while (!master.isInitialized()) { - Thread.sleep(100); - } - while (master.getServerManager().areDeadServersInProgress()) { - Thread.sleep(100); - } - LOG.debug("master isInitialized"); - } -} diff --git a/hbase-server/src/test/resources/hbase-site.xml b/hbase-server/src/test/resources/hbase-site.xml index abff5f5..07213b7 100644 --- a/hbase-server/src/test/resources/hbase-site.xml +++ b/hbase-server/src/test/resources/hbase-site.xml @@ -30,25 +30,10 @@ - hbase.client.pause - 1000 - General client pause value. Used mostly as value to wait - before running a retry of a failed get, region lookup, etc. - - hbase.defaults.for.version.skip true - hbase.client.retries.number - 20 - Maximum retries. Used as maximum for all retryable - operations such as fetching of the root region from root region - server, getting a cell's value, starting a row update, etc. - Default: 20. - - - hbase.server.thread.wakefrequency 1000 Time to sleep in between searches for work (in milliseconds).