commit f678b8cfb8b45f1227d542932c22241fa52c226b Author: Virag Kothari Date: Mon Jan 12 02:54:24 2015 -0800 HBASE-12480 Regions in FAILED_OPEN/FAILED_CLOSE should be processed on master failover diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index f0fe635..6d10327 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -569,8 +569,9 @@ public class AssignmentManager extends ZooKeeperListener { if (!regionsInTransition.isEmpty()) { Set onlineServers = serverManager.getOnlineServers().keySet(); for (RegionState regionState : regionsInTransition.values()) { + ServerName serverName = regionState.getServerName(); if (!regionState.getRegion().isMetaRegion() - && onlineServers.contains(regionState.getServerName())) { + && serverName != null && onlineServers.contains(serverName)) { LOG.debug("Found " + regionState + " in RITs"); failover = true; break; @@ -2995,14 +2996,21 @@ public class AssignmentManager extends ZooKeeperListener { // to the region if the master dies right after the RPC call is out. Map rits = regionStates.getRegionsInTransition(); for (RegionState regionState : rits.values()) { - if (!serverManager.isServerOnline(regionState.getServerName())) { - continue; // SSH will handle it + LOG.info("Processing " + regionState); + ServerName serverName = regionState.getServerName(); + // Server could be null in case of FAILED_OPEN when master cannot find a region plan. In that + // case, try assigning it here. + if (serverName != null + && !serverManager.getOnlineServers().containsKey(serverName)) { + LOG.info("Server " + serverName + " isn't online. SSH will handle this"); + continue; } + HRegionInfo regionInfo = regionState.getRegion(); State state = regionState.getState(); - LOG.info("Processing " + regionState); + switch (state) { case CLOSED: - invokeAssign(regionState.getRegion()); + invokeAssign(regionInfo); break; case PENDING_OPEN: retrySendRegionOpen(regionState); @@ -3010,6 +3018,10 @@ public class AssignmentManager extends ZooKeeperListener { case PENDING_CLOSE: retrySendRegionClose(regionState); break; + case FAILED_CLOSE: + case FAILED_OPEN: + invokeUnassign(regionInfo); + break; default: // No process for other states } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java index 0a0e3d9..83ad29d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java @@ -1053,8 +1053,8 @@ public class TestMasterFailover { RegionState newState = regionStates.getRegionState(hri); assertTrue(newState.isOpened()); } - - /** + + /** * Simple test of master failover. *

* Starts with three masters. Kills a backup master. Then kills the active @@ -1165,7 +1165,7 @@ public class TestMasterFailover { } /** - * Test region in pending_open/close when master failover + * Test region in pending_open/close and failed_open/close when master failover */ @Test (timeout=180000) public void testPendingOpenOrCloseWhenMasterFailover() throws Exception { @@ -1230,6 +1230,37 @@ public class TestMasterFailover { oldState = new RegionState(hriOffline, State.OFFLINE); newState = new RegionState(hriOffline, State.PENDING_OPEN, newState.getServerName()); stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + HRegionInfo failedClose = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedClose, rootdir, conf, offlineTable); + MetaEditor.addRegionToMeta(master.getCatalogTracker(), failedClose); + + oldState = new RegionState(failedClose, State.PENDING_CLOSE); + newState = new RegionState(failedClose, State.FAILED_CLOSE, newState.getServerName()); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + + HRegionInfo failedOpen = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedOpen, rootdir, conf, offlineTable); + MetaEditor.addRegionToMeta(master.getCatalogTracker(), failedOpen); + + // Simulate a region transitioning to failed open when the region server reports the + // transition as FAILED_OPEN + oldState = new RegionState(failedOpen, State.PENDING_OPEN); + newState = new RegionState(failedOpen, State.FAILED_OPEN, newState.getServerName()); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + HRegionInfo failedOpenNullServer = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedOpenNullServer, rootdir, conf, offlineTable); + MetaEditor.addRegionToMeta(master.getCatalogTracker(), failedOpenNullServer); + + // Simulate a region transitioning to failed open when the master couldn't find a plan for + // the region + oldState = new RegionState(failedOpenNullServer, State.OFFLINE); + newState = new RegionState(failedOpenNullServer, State.FAILED_OPEN, null); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + // Stop the master log("Aborting master"); @@ -1253,7 +1284,10 @@ public class TestMasterFailover { // Both pending_open (RPC sent/not yet) regions should be online assertTrue(regionStates.isRegionOnline(hriOffline)); assertTrue(regionStates.isRegionOnline(hriOnline)); - + assertTrue(regionStates.isRegionOnline(failedClose)); + assertTrue(regionStates.isRegionOnline(failedOpenNullServer)); + assertTrue(regionStates.isRegionOnline(failedOpen)); + log("Done with verification, shutting down cluster"); // Done, shutdown the cluster