From d5140d4b319abfae7feaed06c3533f3940424dd6 Mon Sep 17 00:00:00 2001 From: stack Date: Thu, 20 Feb 2020 22:14:10 -0800 Subject: [PATCH] HBASE-23880 [Flakey Test] TestRegionReplicaFailover fails frequently Purge start of RS at end of test, refactor so not using deprecated APIs, cut down on work done. --- .../TestRegionReplicaFailover.java | 96 +++++++++---------- 1 file changed, 46 insertions(+), 50 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionReplicaFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionReplicaFailover.java index cd9e1a560c..53fc239be8 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionReplicaFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionReplicaFailover.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -27,7 +27,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.Waiter.Predicate; import org.apache.hadoop.hbase.client.Admin; @@ -36,6 +35,8 @@ import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.client.Consistency; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.client.TableDescriptorBuilder; import org.apache.hadoop.hbase.replication.regionserver.TestRegionReplicaReplicationEndpoint; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.Bytes; @@ -68,6 +69,7 @@ public class TestRegionReplicaFailover { private static final HBaseTestingUtility HTU = new HBaseTestingUtility(); private static final int NB_SERVERS = 3; + private static final int ROWS = 100; protected final byte[][] families = new byte[][] {HBaseTestingUtility.fam1, HBaseTestingUtility.fam2, HBaseTestingUtility.fam3}; @@ -79,12 +81,12 @@ public class TestRegionReplicaFailover { @Rule public TestName name = new TestName(); - private HTableDescriptor htd; + private TableDescriptor htd; @Before public void before() throws Exception { Configuration conf = HTU.getConfiguration(); - // Up the handlers; this test needs more than usual. + // Up the handlers; this test needs more than usual. conf.setInt(HConstants.REGION_SERVER_HIGH_PRIORITY_HANDLER_COUNT, 10); conf.setBoolean(ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_CONF_KEY, true); conf.setBoolean(ServerRegionReplicaUtil.REGION_REPLICA_WAIT_FOR_PRIMARY_FLUSH_CONF_KEY, true); @@ -92,15 +94,20 @@ public class TestRegionReplicaFailover { conf.setBoolean("hbase.tests.use.shortcircuit.reads", false); HTU.startMiniCluster(NB_SERVERS); - htd = HTU.createTableDescriptor( - name.getMethodName().substring(0, name.getMethodName().length()-3)); - htd.setRegionReplication(3); + // Set replication. Have to do it this awkward way of building a TD with HTU so it adds all + // the column families, and then creating a new one so we can set region replication. + TableDescriptor tableDescriptor = + HTU.createTableDescriptor(TableName.valueOf(name.getMethodName(). + substring(0, name.getMethodName().length() - 3))); + this.htd = TableDescriptorBuilder.newBuilder(tableDescriptor).setRegionReplication(3).build(); HTU.getAdmin().createTable(htd); } @After public void after() throws Exception { - HTU.deleteTableIfAny(htd.getTableName()); + if (this.htd != null) { + HTU.deleteTableIfAny(htd.getTableName()); + } HTU.shutdownMiniCluster(); } @@ -125,7 +132,6 @@ public class TestRegionReplicaFailover { /** * Tests the case where if there is some data in the primary region, reopening the region replicas * (enable/disable table, etc) makes the region replicas readable. - * @throws IOException */ @Test public void testSecondaryRegionWithNonEmptyRegion() throws IOException { @@ -133,13 +139,10 @@ public class TestRegionReplicaFailover { // than disable and enable the table again and verify the data from secondary try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration()); Table table = connection.getTable(htd.getTableName())) { - - HTU.loadNumericRows(table, fam, 0, 1000); - + HTU.loadNumericRows(table, fam, 0, ROWS); HTU.getAdmin().disableTable(htd.getTableName()); HTU.getAdmin().enableTable(htd.getTableName()); - - HTU.verifyNumericRows(table, fam, 0, 1000, 1); + HTU.verifyNumericRows(table, fam, 0, ROWS, 1); } } @@ -151,11 +154,11 @@ public class TestRegionReplicaFailover { try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration()); Table table = connection.getTable(htd.getTableName())) { - HTU.loadNumericRows(table, fam, 0, 1000); + HTU.loadNumericRows(table, fam, 0, ROWS); // wal replication is async, we have to wait until the replication catches up, or we timeout - verifyNumericRowsWithTimeout(table, fam, 0, 1000, 1, 30000); - verifyNumericRowsWithTimeout(table, fam, 0, 1000, 2, 30000); + verifyNumericRowsWithTimeout(table, fam, 0, ROWS, 1, 30000); + verifyNumericRowsWithTimeout(table, fam, 0, ROWS, 2, 30000); // we should not have flushed files now, but data in memstores of primary and secondary // kill the primary region replica now, and ensure that when it comes back up, we can still @@ -174,13 +177,10 @@ public class TestRegionReplicaFailover { assertTrue(aborted); // wal replication is async, we have to wait until the replication catches up, or we timeout - verifyNumericRowsWithTimeout(table, fam, 0, 1000, 0, 30000); - verifyNumericRowsWithTimeout(table, fam, 0, 1000, 1, 30000); - verifyNumericRowsWithTimeout(table, fam, 0, 1000, 2, 30000); + verifyNumericRowsWithTimeout(table, fam, 0, ROWS, 0, 30000); + verifyNumericRowsWithTimeout(table, fam, 0, ROWS, 1, 30000); + verifyNumericRowsWithTimeout(table, fam, 0, ROWS, 2, 30000); } - - // restart the region server - HTU.getMiniHBaseCluster().startRegionServer(); } /** wal replication is async, we have to wait until the replication catches up, or we timeout @@ -213,11 +213,11 @@ public class TestRegionReplicaFailover { public void testSecondaryRegionKill() throws Exception { try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration()); Table table = connection.getTable(htd.getTableName())) { - HTU.loadNumericRows(table, fam, 0, 1000); + HTU.loadNumericRows(table, fam, 0, ROWS); // wait for some time to ensure that async wal replication does it's magic - verifyNumericRowsWithTimeout(table, fam, 0, 1000, 1, 30000); - verifyNumericRowsWithTimeout(table, fam, 0, 1000, 2, 30000); + verifyNumericRowsWithTimeout(table, fam, 0, ROWS, 1, 30000); + verifyNumericRowsWithTimeout(table, fam, 0, ROWS, 2, 30000); // we should not have flushed files now, but data in memstores of primary and secondary // kill the secondary region replica now, and ensure that when it comes back up, we can still @@ -237,12 +237,9 @@ public class TestRegionReplicaFailover { Threads.sleep(5000); - HTU.verifyNumericRows(table, fam, 0, 1000, 1); - HTU.verifyNumericRows(table, fam, 0, 1000, 2); + HTU.verifyNumericRows(table, fam, 0, ROWS, 1); + HTU.verifyNumericRows(table, fam, 0, ROWS, 2); } - - // restart the region server - HTU.getMiniHBaseCluster().startRegionServer(); } /** @@ -256,9 +253,9 @@ public class TestRegionReplicaFailover { Table table = connection.getTable(htd.getTableName()); Admin admin = connection.getAdmin()) { // start a thread to do the loading of primary - HTU.loadNumericRows(table, fam, 0, 1000); // start with some base + HTU.loadNumericRows(table, fam, 0, ROWS); // start with some base admin.flush(table.getName()); - HTU.loadNumericRows(table, fam, 1000, 2000); + HTU.loadNumericRows(table, fam, ROWS, 2000); final AtomicReference ex = new AtomicReference<>(null); final AtomicBoolean done = new AtomicBoolean(false); @@ -269,8 +266,8 @@ public class TestRegionReplicaFailover { public void run() { while (!done.get()) { try { - HTU.loadNumericRows(table, fam, key.get(), key.get()+1000); - key.addAndGet(1000); + HTU.loadNumericRows(table, fam, key.get(), key.get()+ROWS); + key.addAndGet(ROWS); } catch (Throwable e) { ex.compareAndSet(null, e); } @@ -307,15 +304,12 @@ public class TestRegionReplicaFailover { assertNull(ex.get()); - assertTrue(key.get() > 1000); // assert that the test is working as designed + assertTrue(key.get() > ROWS); // assert that the test is working as designed LOG.info("Loaded up to key :" + key.get()); verifyNumericRowsWithTimeout(table, fam, 0, key.get(), 0, 30000); verifyNumericRowsWithTimeout(table, fam, 0, key.get(), 1, 30000); verifyNumericRowsWithTimeout(table, fam, 0, key.get(), 2, 30000); } - - // restart the region server - HTU.getMiniHBaseCluster().startRegionServer(); } /** @@ -324,21 +318,25 @@ public class TestRegionReplicaFailover { */ @Test public void testLotsOfRegionReplicas() throws IOException { - int numRegions = NB_SERVERS * 20; - int regionReplication = 10; - String tableName = htd.getTableName().getNameAsString() + "2"; - htd = HTU.createTableDescriptor(tableName); - htd.setRegionReplication(regionReplication); + int numRegions = NB_SERVERS * 10; + int regionReplication = 5; + // Different table name. Different replication. Have to do it this awkward way of building + // a TD with HTU so it adds all the column families, and then creating a new one so we can + // set region replication. + TableName tableName = TableName.valueOf(htd.getTableName().getNameAsString() + "2"); + TableDescriptor tableDescriptor = HTU.createTableDescriptor(tableName); + this.htd = TableDescriptorBuilder.newBuilder(tableDescriptor). + setRegionReplication(regionReplication).build(); + TableDescriptor td = TableDescriptorBuilder.newBuilder(this.htd). + setRegionReplication(regionReplication).build(); // dont care about splits themselves too much byte[] startKey = Bytes.toBytes("aaa"); byte[] endKey = Bytes.toBytes("zzz"); byte[][] splits = HTU.getRegionSplitStartKeys(startKey, endKey, numRegions); - HTU.getAdmin().createTable(htd, startKey, endKey, numRegions); - + HTU.getAdmin().createTable(td, startKey, endKey, numRegions); try (Connection connection = ConnectionFactory.createConnection(HTU.getConfiguration()); - Table table = connection.getTable(htd.getTableName())) { - + Table table = connection.getTable(td.getTableName())) { for (int i = 1; i < splits.length; i++) { for (int j = 0; j < regionReplication; j++) { Get get = new Get(splits[i]); @@ -348,7 +346,5 @@ public class TestRegionReplicaFailover { } } } - - HTU.deleteTableIfAny(TableName.valueOf(tableName)); } } -- 2.19.1