diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 0880134..7ac3d58 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -1659,6 +1659,10 @@ public class AssignmentManager extends ZooKeeperListener { getLong("hbase.regionserver.rpc.startup.waittime", 60000); for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) { try { + // regionOpenInfos is empty if all regions are in failedToOpenRegions list + if (regionOpenInfos.isEmpty()) { + break; + } List regionOpeningStateList = serverManager .sendRegionOpen(destination, regionOpenInfos); if (regionOpeningStateList == null) { @@ -1920,8 +1924,12 @@ public class AssignmentManager extends ZooKeeperListener { if (useZKForAssignment && regionStates.isServerDeadAndNotProcessed(sn) && wasRegionOnDeadServerByMeta(region, sn)) { + if (!regionStates.isRegionInTransition(region)) { + LOG.info("Updating the state to " + State.OFFLINE + " to allow to be reassigned by SSH"); + regionStates.updateRegionState(region, State.OFFLINE); + } LOG.info("Skip assigning " + region.getRegionNameAsString() - + ", it is on a dead but not processed yet server: " + sn); + + ", it is on a dead but not processed yet server: " + sn); return null; } case CLOSED: diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java index a9399b3..d882106 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java @@ -243,7 +243,7 @@ public class ServerShutdownHandler extends EventHandler { Lock lock = am.acquireRegionLock(encodedName); try { RegionState rit = regionStates.getRegionTransitionState(hri); - if (processDeadRegion(hri, am)) { + if (processDeadRegion(hri, am)) { ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri); if (addressFromAM != null && !addressFromAM.equals(this.serverName)) { // If this region is in transition on the dead server, it must be @@ -273,7 +273,7 @@ public class ServerShutdownHandler extends EventHandler { } toAssignRegions.add(hri); } else if (rit != null) { - if (rit.isPendingCloseOrClosing() + if ((rit.isPendingCloseOrClosing() || rit.isOffline()) && am.getTableStateManager().isTableState(hri.getTable(), ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) { // If the table was partially disabled and the RS went down, we should clear the RIT diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java index 349ce90..1c538e2 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java @@ -81,6 +81,7 @@ import org.junit.BeforeClass; import org.junit.Test; import org.junit.experimental.categories.Category; + /** * This tests AssignmentManager with a testing cluster. */ @@ -188,7 +189,7 @@ public class TestAssignmentManagerOnCluster { RegionStates regionStates = am.getRegionStates(); ServerName serverName = regionStates.getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); // Region is assigned now. Let's assign it again. // Master should not abort, and region should be assigned. @@ -203,6 +204,58 @@ public class TestAssignmentManagerOnCluster { } } + // Simulate a scenario where the AssignCallable and SSH are trying to assign a region + @Test (timeout=60000) + public void testAssignRegionBySSH() throws Exception { + if (!conf.getBoolean("hbase.assignment.usezk", true)) { + return; + } + String table = "testAssignRegionBySSH"; + MyMaster master = (MyMaster) TEST_UTIL.getHBaseCluster().getMaster(); + try { + HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(table)); + desc.addFamily(new HColumnDescriptor(FAMILY)); + admin.createTable(desc); + + HTable meta = new HTable(conf, TableName.META_TABLE_NAME); + HRegionInfo hri = new HRegionInfo( + desc.getTableName(), Bytes.toBytes("A"), Bytes.toBytes("Z")); + MetaTableAccessor.addRegionToMeta(meta, hri); + // Add some dummy server for the region entry + MetaTableAccessor.updateRegionLocation(TEST_UTIL.getHBaseCluster().getMaster().getShortCircuitConnection(), hri, + ServerName.valueOf("example.org", 1234, System.currentTimeMillis()), 0); + RegionStates regionStates = master.getAssignmentManager().getRegionStates(); + int i = TEST_UTIL.getHBaseCluster().getServerWithMeta(); + HRegionServer rs = TEST_UTIL.getHBaseCluster().getRegionServer(i == 0 ? 1 : 0); + // Choose a server other than meta to kill + ServerName controlledServer = rs.getServerName(); + master.enableSSH(false); + TEST_UTIL.getHBaseCluster().killRegionServer(controlledServer); + TEST_UTIL.getHBaseCluster().waitForRegionServerToStop(controlledServer, -1); + AssignmentManager am = master.getAssignmentManager(); + + // Simulate the AssignCallable trying to assign the region. Have the region in OFFLINE state, + // but not in transition and the server is the dead 'controlledServer' + regionStates.createRegionState(hri, State.OFFLINE, controlledServer, null); + am.assign(hri, true, true); + // Region should remain OFFLINE and go to transition + assertEquals(State.OFFLINE, regionStates.getRegionState(hri).getState()); + assertTrue (regionStates.isRegionInTransition(hri)); + + master.enableSSH(true); + am.waitForAssignment(hri); + assertTrue (regionStates.getRegionState(hri).isOpened()); + ServerName serverName = regionStates.getRegionServerOfRegion(hri); + TEST_UTIL.assertRegionOnlyOnServer(hri, serverName, 6000); + } finally { + if (master != null) { + master.enableSSH(true); + } + TEST_UTIL.deleteTable(Bytes.toBytes(table)); + TEST_UTIL.getHBaseCluster().startRegionServer(); + } + } + /** * This tests region assignment on a simulated restarted server */ @@ -277,7 +330,7 @@ public class TestAssignmentManagerOnCluster { RegionStates regionStates = TEST_UTIL.getHBaseCluster(). getMaster().getAssignmentManager().getRegionStates(); ServerName serverName = regionStates.getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); admin.offline(hri.getRegionName()); long timeoutTime = System.currentTimeMillis() + 800; @@ -333,7 +386,7 @@ public class TestAssignmentManagerOnCluster { while (true) { ServerName sn = regionStates.getRegionServerOfRegion(hri); if (sn != null && sn.equals(destServerName)) { - TEST_UTIL.assertRegionOnServer(hri, sn, 200); + TEST_UTIL.assertRegionOnServer(hri, sn, 6000); break; } long now = System.currentTimeMillis(); @@ -511,7 +564,7 @@ public class TestAssignmentManagerOnCluster { assertTrue(am.waitForAssignment(hri)); ServerName serverName = master.getAssignmentManager(). getRegionStates().getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); } finally { MyRegionObserver.preCloseEnabled.set(false); TEST_UTIL.deleteTable(Bytes.toBytes(table)); @@ -552,7 +605,7 @@ public class TestAssignmentManagerOnCluster { ServerName serverName = master.getAssignmentManager(). getRegionStates().getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); } finally { MyLoadBalancer.controledRegion = null; TEST_UTIL.deleteTable(Bytes.toBytes(table)); @@ -602,7 +655,7 @@ public class TestAssignmentManagerOnCluster { ServerName serverName = master.getAssignmentManager(). getRegionStates().getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); } finally { TEST_UTIL.deleteTable(table); } @@ -634,7 +687,7 @@ public class TestAssignmentManagerOnCluster { if (ConfigUtil.useZKForAssignment(conf)) { ZKAssign.createNodeOffline(zkw, hri, destServerName); ZKAssign.transitionNodeOpening(zkw, hri, destServerName); - + // Wait till the event is processed and the region is in transition long timeoutTime = System.currentTimeMillis() + 20000; while (!am.getRegionStates().isRegionInTransition(hri)) { @@ -705,7 +758,7 @@ public class TestAssignmentManagerOnCluster { assertTrue(am.waitForAssignment(hri)); ServerName serverName = master.getAssignmentManager(). getRegionStates().getRegionServerOfRegion(hri); - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); } finally { MyRegionObserver.postCloseEnabled.set(false); TEST_UTIL.deleteTable(Bytes.toBytes(table)); @@ -1106,7 +1159,7 @@ public class TestAssignmentManagerOnCluster { cluster.startRegionServer(); } } - + /** * Test that region state transition call is idempotent */ @@ -1129,7 +1182,7 @@ public class TestAssignmentManagerOnCluster { RegionStates regionStates = am.getRegionStates(); ServerName serverName = regionStates.getRegionServerOfRegion(hri); // Assert the the region is actually open on the server - TEST_UTIL.assertRegionOnServer(hri, serverName, 200); + TEST_UTIL.assertRegionOnServer(hri, serverName, 6000); // Closing region should just work fine admin.disableTable(TableName.valueOf(table)); assertTrue(regionStates.isRegionOffline(hri));