diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 4dff74e..c7ac189 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -1693,6 +1693,10 @@ public class AssignmentManager extends ZooKeeperListener { getLong("hbase.regionserver.rpc.startup.waittime", 60000); for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) { try { + // Stop retrying once regionOpenInfos is empty, i.e. all regions are in failedToOpenRegions + if (regionOpenInfos.isEmpty()) { + break; + } List regionOpeningStateList = serverManager .sendRegionOpen(destination, regionOpenInfos); if (regionOpeningStateList == null) { @@ -1956,8 +1960,12 @@ public class AssignmentManager extends ZooKeeperListener { if (useZKForAssignment && regionStates.isServerDeadAndNotProcessed(sn) && wasRegionOnDeadServerByMeta(region, sn)) { + if (!regionStates.isRegionInTransition(region)) { + LOG.info("Updating the state to " + State.OFFLINE + " to allow to be reassigned by SSH"); + regionStates.updateRegionState(region, State.OFFLINE); + } LOG.info("Skip assigning " + region.getRegionNameAsString() - + ", it is on a dead but not processed yet server: " + sn); + + ", it is on a dead but not processed yet server: " + sn); return null; } case CLOSED: diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 9ae374b..9bf3ce2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -176,6 +176,9 @@ public class ServerManager { /** Listeners that are called on server events.
*/ private List listeners = new CopyOnWriteArrayList(); + + /* For tests only: also remove a server from onlineServers when its expiry is delayed */ + static volatile boolean TEST_REMOVE_FROM_ONLINE_SERVERS = false; /** * Constructor. @@ -505,6 +508,9 @@ public class ServerManager { LOG.info("Master doesn't enable ServerShutdownHandler during initialization, " + "delay expiring server " + serverName); this.queuedDeadServers.add(serverName); + if (TEST_REMOVE_FROM_ONLINE_SERVERS) { + this.onlineServers.remove(serverName); + } return; } if (this.deadservers.isDeadServer(serverName)) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java index d83269d..ca13ed2 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java @@ -73,6 +73,7 @@ import org.junit.BeforeClass; import org.junit.Test; import org.junit.experimental.categories.Category; + /** * This tests AssignmentManager with a testing cluster. */ @@ -131,7 +132,7 @@ public class TestAssignmentManagerOnCluster { RegionStates regionStates = am.getRegionStates(); ServerName serverName = regionStates.getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); // Region is assigned now. Let's assign it again. // Master should not abort, and region should be assigned.
@@ -145,7 +146,59 @@ public class TestAssignmentManagerOnCluster { TEST_UTIL.deleteTable(Bytes.toBytes(table)); } } - + + // Simulate AssignCallable and SSH (ServerShutdownHandler) both trying to assign the same region + @Test (timeout=60000) + public void testAssignRegionBySSH() throws Exception { + if (!conf.getBoolean("hbase.assignment.usezk", true)) { + return; + } + String table = "testAssignRegionBySSH"; + try { + HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(table)); + desc.addFamily(new HColumnDescriptor(FAMILY)); + admin.createTable(desc); + + HTable meta = new HTable(conf, TableName.META_TABLE_NAME); + HRegionInfo hri = new HRegionInfo( + desc.getTableName(), Bytes.toBytes("A"), Bytes.toBytes("Z")); + MetaEditor.addRegionToMeta(meta, hri); + // Set a dummy server (example.org:1234) as the region's location in meta + MetaEditor.updateRegionLocation(TEST_UTIL.getHBaseCluster().getMaster().getCatalogTracker(), hri, + ServerName.valueOf("example.org", 1234, System.currentTimeMillis()), 0); + + ServerName controlledServer = TEST_UTIL.getHBaseCluster().getRegionServer(0).getServerName(); + // Move meta to region server 1 so it is not on the controlled server, which is killed below + TEST_UTIL.getHBaseAdmin().move(HRegionInfo.FIRST_META_REGIONINFO.getEncodedNameAsBytes(), + Bytes.toBytes(TEST_UTIL.getHBaseCluster().getRegionServer(1).getServerName().getServerName())); + Thread.sleep(3000); + MyMaster master = (MyMaster) TEST_UTIL.getHBaseCluster().getMaster(); + master.enableSSH(false); + ServerManager.TEST_REMOVE_FROM_ONLINE_SERVERS = true; + TEST_UTIL.getHBaseCluster().killRegionServer(controlledServer); + TEST_UTIL.getHBaseCluster().waitForRegionServerToStop(controlledServer, -1); + AssignmentManager am = master.getAssignmentManager(); + RegionStates regionStates = am.getRegionStates(); + + // Simulate the AssignCallable trying to assign the region.
Put the region in PENDING_OPEN + // on the dead 'controlledServer' + regionStates.createRegionState(hri, State.PENDING_OPEN, controlledServer); + am.assign(hri, true, true); + // Region should be OFFLINE and still in transition + assertEquals(State.OFFLINE, regionStates.getRegionState(hri).getState()); + assertTrue(regionStates.isRegionInTransition(hri)); + + master.enableSSH(true); + assertTrue(am.waitForAssignment(hri)); + assertTrue(regionStates.getRegionState(hri).isOpened()); + ServerName serverName = regionStates.getRegionServerOfRegion(hri); + TEST_UTIL.assertRegionOnlyOnServer(hri, serverName, 6000); + } finally { + ServerManager.TEST_REMOVE_FROM_ONLINE_SERVERS = false; + TEST_UTIL.deleteTable(Bytes.toBytes(table)); + } + } + /** * This tests region assignment on a simulated restarted server */ @@ -220,7 +273,7 @@ public class TestAssignmentManagerOnCluster { RegionStates regionStates = TEST_UTIL.getHBaseCluster(). getMaster().getAssignmentManager().getRegionStates(); ServerName serverName = regionStates.getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); admin.offline(hri.getRegionName()); long timeoutTime = System.currentTimeMillis() + 800; @@ -272,7 +325,7 @@ public class TestAssignmentManagerOnCluster { while (true) { ServerName sn = regionStates.getRegionServerOfRegion(hri); if (sn != null && sn.equals(destServerName)) { - TEST_UTIL.assertRegionOnServer(hri, sn, 200); + TEST_UTIL.assertRegionOnServer(hri, sn, 6000); break; } long now = System.currentTimeMillis(); @@ -384,7 +437,7 @@ public class TestAssignmentManagerOnCluster { assertTrue(am.waitForAssignment(hri)); ServerName sn = am.getRegionStates().getRegionServerOfRegion(hri); TEST_UTIL.assertRegionOnServer(hri, sn, 6000); - + MyRegionObserver.preCloseEnabled.set(true); am.unassign(hri); RegionState state = am.getRegionStates().getRegionState(hri); @@ -450,7 +503,7 @@ public class TestAssignmentManagerOnCluster {
assertTrue(am.waitForAssignment(hri)); ServerName serverName = master.getAssignmentManager(). getRegionStates().getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); } finally { MyRegionObserver.preCloseEnabled.set(false); TEST_UTIL.deleteTable(Bytes.toBytes(table)); @@ -491,7 +544,7 @@ public class TestAssignmentManagerOnCluster { ServerName serverName = master.getAssignmentManager(). getRegionStates().getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); } finally { MyLoadBalancer.controledRegion = null; TEST_UTIL.deleteTable(Bytes.toBytes(table)); @@ -541,7 +594,7 @@ public class TestAssignmentManagerOnCluster { ServerName serverName = master.getAssignmentManager(). getRegionStates().getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); } finally { TEST_UTIL.deleteTable(table); } @@ -573,7 +626,7 @@ public class TestAssignmentManagerOnCluster { if (ConfigUtil.useZKForAssignment(conf)) { ZKAssign.createNodeOffline(zkw, hri, destServerName); ZKAssign.transitionNodeOpening(zkw, hri, destServerName); - + // Wait till the event is processed and the region is in transition long timeoutTime = System.currentTimeMillis() + 20000; while (!am.getRegionStates().isRegionInTransition(hri)) { @@ -644,7 +697,7 @@ public class TestAssignmentManagerOnCluster { assertTrue(am.waitForAssignment(hri)); ServerName serverName = master.getAssignmentManager(). 
getRegionStates().getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); } finally { MyRegionObserver.postCloseEnabled.set(false); TEST_UTIL.deleteTable(Bytes.toBytes(table)); @@ -836,7 +889,7 @@ public class TestAssignmentManagerOnCluster { TEST_UTIL.deleteTable(Bytes.toBytes(table)); } } - + /** * Test that region state transition call is idempotent */ @@ -859,7 +912,7 @@ public class TestAssignmentManagerOnCluster { RegionStates regionStates = am.getRegionStates(); ServerName serverName = regionStates.getRegionServerOfRegion(hri); // Assert the the region is actually open on the server - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); // Closing region should just work fine admin.disableTable(TableName.valueOf(table)); assertTrue(regionStates.isRegionOffline(hri)); @@ -965,7 +1018,7 @@ public class TestAssignmentManagerOnCluster { } } } - + public static class MyRegionServer extends MiniHBaseClusterRegionServer { static volatile ServerName abortedServer = null; static volatile boolean simulateRetry; @@ -993,7 +1046,6 @@ public class TestAssignmentManagerOnCluster { } } - public static class MyRegionObserver extends BaseRegionObserver { // If enabled, fail all preClose calls static AtomicBoolean preCloseEnabled = new AtomicBoolean(false);