From 14a4887d007d0b04067d9962a5c8626cc752bc40 Mon Sep 17 00:00:00 2001
From: haxiaolin
Date: Wed, 11 Apr 2018 18:08:08 +0800
Subject: [PATCH] HBASE-20368 Fix RIT stuck when a rsgroup has no online servers but AM's pendingAssignQueue is cleared

When a rsgroup has no candidate server for its regions,
RSGroupBasedLoadBalancer now returns those regions under
LoadBalancer.BOGUS_SERVER_NAME instead of leaving them out of the plan.
AssignmentManager removes the BOGUS_SERVER_NAME entry from the plan and
hands its regions to addToPendingAssignment before processing the
remaining entries.

TestAssignmentOnRSGroupCrash covers the scenario: stop the only
regionserver of a group, check that the table regions are offline, move
another server into the group and check that the regions are online again.
The group-creation helper of TestRSGroupsBase is moved to
RSGroupTestingUtil so both tests can share it.

---
 .../hbase/rsgroup/RSGroupBasedLoadBalancer.java    |  10 ++
 .../hadoop/hbase/rsgroup/RSGroupTestingUtil.java   |  42 +++++++
 .../rsgroup/TestAssignmentOnRSGroupCrash.java      | 128 +++++++++++++++++++++
 .../hadoop/hbase/rsgroup/TestRSGroupsBase.java     |  18 +--
 .../hbase/master/assignment/AssignmentManager.java |   5 +
 5 files changed, 186 insertions(+), 17 deletions(-)
 create mode 100644 hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/RSGroupTestingUtil.java
 create mode 100644 hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestAssignmentOnRSGroupCrash.java

diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java
index 3182a61a16..dee798cb51 100644
--- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java
+++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java
@@ -212,6 +212,16 @@ public class RSGroupBasedLoadBalancer implements RSGroupableBalancer {
         if(candidateList.size() > 0) {
           assignments.putAll(this.internalBalancer.retainAssignment(
               currentAssignmentMap, candidateList));
+        } else{
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("No available server to assign regions: " + regionList.toString());
+          }
+          for(RegionInfo region : regionList) {
+            if (!assignments.containsKey(LoadBalancer.BOGUS_SERVER_NAME)) {
+              assignments.put(LoadBalancer.BOGUS_SERVER_NAME, new ArrayList<>());
+            }
+            assignments.get(LoadBalancer.BOGUS_SERVER_NAME).add(region);
+          }
         }
       }
 
diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/RSGroupTestingUtil.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/RSGroupTestingUtil.java
new file mode 100644
index 0000000000..c612a540cd
--- /dev/null
+++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/RSGroupTestingUtil.java
@@ -0,0 +1,42 @@
+package org.apache.hadoop.hbase.rsgroup;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+import org.apache.hadoop.hbase.net.Address;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.apache.yetus.audience.InterfaceStability;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class RSGroupTestingUtil {
+  private static final Logger LOG = LoggerFactory.getLogger(RSGroupTestingUtil.class);
+
+  private RSGroupTestingUtil() {
+  }
+
+  public static RSGroupInfo addRSGroup(final RSGroupAdmin rsGroupAdmin, String groupName,
+      int groupRSCount) throws IOException {
+    RSGroupInfo defaultInfo = rsGroupAdmin.getRSGroupInfo(RSGroupInfo.DEFAULT_GROUP);
+    assertTrue(defaultInfo != null);
+    assertTrue(defaultInfo.getServers().size() >= groupRSCount);
+    rsGroupAdmin.addRSGroup(groupName);
+
+    Set<Address> set = new HashSet<>();
+    for(Address server: defaultInfo.getServers()) {
+      if(set.size() == groupRSCount) {
+        break;
+      }
+      set.add(server);
+    }
+    rsGroupAdmin.moveServers(set, groupName);
+    RSGroupInfo result = rsGroupAdmin.getRSGroupInfo(groupName);
+    assertTrue(result.getServers().size() >= groupRSCount);
+    return result;
+  }
+
+}
diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestAssignmentOnRSGroupCrash.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestAssignmentOnRSGroupCrash.java
new file mode 100644
index 0000000000..d29237f32d
--- /dev/null
+++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestAssignmentOnRSGroupCrash.java
@@ -0,0 +1,128 @@
+package org.apache.hadoop.hbase.rsgroup;
+
+import static org.apache.hadoop.hbase.util.Threads.sleep;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
+import org.apache.hadoop.hbase.net.Address;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({ MasterTests.class, LargeTests.class })
+public class TestAssignmentOnRSGroupCrash {
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestAssignmentOnRSGroupCrash.class);
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestAssignmentOnRSGroupCrash.class);
+
+  private static final TableName TEST_TABLE = TableName.valueOf("testb");
+  private static final String FAMILY_STR = "f";
+  private static final byte[] FAMILY = Bytes.toBytes(FAMILY_STR);
+  private static final int NUM_RS = 3;
+
+  private HBaseTestingUtility UTIL;
+
+  private static RSGroupAdmin rsGroupAdmin;
+
+  private static void setupConf(Configuration conf) {
+    conf.set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY, RSGroupAdminEndpoint.class.getName());
+    conf.set(HConstants.HBASE_MASTER_LOADBALANCER_CLASS, RSGroupBasedLoadBalancer.class.getName());
+  }
+
+  @Before
+  public void setup() throws Exception {
+    UTIL = new HBaseTestingUtility();
+
+    setupConf(UTIL.getConfiguration());
+    UTIL.startMiniCluster(NUM_RS);
+
+    UTIL.createTable(TEST_TABLE, new byte[][] { FAMILY },
+      new byte[][] { Bytes.toBytes("B"), Bytes.toBytes("D"), Bytes.toBytes("F"),
+          Bytes.toBytes("L") });
+    rsGroupAdmin = new VerifyingRSGroupAdminClient(new RSGroupAdminClient(UTIL.getConnection()),
+        UTIL.getConfiguration());
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testKillAllRSInGroupAndThenStart() throws Exception {
+    // create a rsgroup and move one regionserver to it
+    String groupName = "my_group";
+    int groupRSCount = 1;
+    RSGroupTestingUtil.addRSGroup(rsGroupAdmin, groupName, groupRSCount);
+    Set<TableName> toAddTables = new HashSet<>();
+    toAddTables.add(TEST_TABLE);
+    rsGroupAdmin.moveTables(toAddTables, groupName);
+    RSGroupInfo rsGroupInfo = rsGroupAdmin.getRSGroupInfo(groupName);
+    LOG.debug("my_group: " + rsGroupInfo.toString());
+    Set<Address> servers = rsGroupInfo.getServers();
+    ServerName myGroupRS = null;
+    for (int i = 0; i < NUM_RS; ++i) {
+      ServerName sn = UTIL.getMiniHBaseCluster().getRegionServer(i).getServerName();
+      if (servers.contains(sn.getAddress())) {
+        myGroupRS = sn;
+        break;
+      }
+    }
+    assertNotNull(myGroupRS);
+    checkRegionsOnline(TEST_TABLE, true);
+
+    // stop regionserver in the rsgroup, and table regions will be offline
+    UTIL.getMiniHBaseCluster().stopRegionServer(myGroupRS);
+    // better wait for a while for region reassign
+    sleep(10000);
+    assertEquals(UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size(),
+        NUM_RS - servers.size());
+    checkRegionsOnline(TEST_TABLE, false);
+
+    // move another regionserver to the rsgroup
+    // in this case, moving another region server can be replaced by restarting the regionserver
+    // mentioned before
+    RSGroupInfo defaultInfo = rsGroupAdmin.getRSGroupInfo(RSGroupInfo.DEFAULT_GROUP);
+    Set<Address> set = new HashSet<>();
+    for (Address server : defaultInfo.getServers()) {
+      if (set.size() == groupRSCount) {
+        break;
+      }
+      set.add(server);
+    }
+    rsGroupAdmin.moveServers(set, groupName);
+
+    // wait and check if table regions are online
+    sleep(10000);
+    checkRegionsOnline(TEST_TABLE, true);
+  }
+
+  private void checkRegionsOnline(TableName tableName, boolean isOnline) throws IOException {
+    for (RegionInfo hri : UTIL.getHBaseAdmin().getTableRegions(tableName)) {
+      assertTrue(UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager()
+          .getRegionStates().isRegionOnline(hri) == isOnline);
+    }
+  }
+}
diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java
index 9422bf8b61..99cd7b332d 100644
--- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java
+++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/rsgroup/TestRSGroupsBase.java
@@ -27,7 +27,6 @@ import java.io.IOException;
 import java.security.SecureRandom;
 import java.util.ArrayList;
 import java.util.EnumSet;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
@@ -99,22 +98,7 @@ public abstract class TestRSGroupsBase {
 
   protected RSGroupInfo addGroup(String groupName, int serverCount)
       throws IOException, InterruptedException {
-    RSGroupInfo defaultInfo = rsGroupAdmin.getRSGroupInfo(RSGroupInfo.DEFAULT_GROUP);
-    assertTrue(defaultInfo != null);
-    assertTrue(defaultInfo.getServers().size() >= serverCount);
-    rsGroupAdmin.addRSGroup(groupName);
-
-    Set<Address> set = new HashSet<>();
-    for(Address server: defaultInfo.getServers()) {
-      if(set.size() == serverCount) {
-        break;
-      }
-      set.add(server);
-    }
-    rsGroupAdmin.moveServers(set, groupName);
-    RSGroupInfo result = rsGroupAdmin.getRSGroupInfo(groupName);
-    assertTrue(result.getServers().size() >= serverCount);
-    return result;
+    return RSGroupTestingUtil.addRSGroup(rsGroupAdmin, groupName, serverCount);
   }
 
   void removeGroup(String groupName) throws IOException {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 0e47065446..1fadc3f052 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1828,6 +1828,11 @@ public class AssignmentManager implements ServerListener {
 
     if (plan.isEmpty()) return;
 
+    List<RegionInfo> bogusRegions = plan.remove(LoadBalancer.BOGUS_SERVER_NAME);
+    if (bogusRegions != null && !bogusRegions.isEmpty()) {
+      addToPendingAssignment(regions, bogusRegions);
+    }
+
     int evcount = 0;
     for (Map.Entry<ServerName, List<RegionInfo>> entry: plan.entrySet()) {
       final ServerName server = entry.getKey();
-- 
2.14.1