commit 24a5e92dcbfa2c3aca2ab9c3be1c0ac3af57dd1b Author: Virag Kothari Date: Mon Jan 12 02:40:04 2015 -0800 HBASE-12480 Regions in FAILED_OPEN/FAILED_CLOSE should be processed on master failover diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 2f6679f..b17561a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -450,8 +450,9 @@ public class AssignmentManager { Map regionsInTransition = regionStates.getRegionsInTransition(); if (!regionsInTransition.isEmpty()) { for (RegionState regionState: regionsInTransition.values()) { + ServerName serverName = regionState.getServerName(); if (!regionState.getRegion().isMetaRegion() - && onlineServers.contains(regionState.getServerName())) { + && serverName != null && onlineServers.contains(serverName)) { LOG.debug("Found " + regionState + " in RITs"); failover = true; break; @@ -1694,18 +1695,23 @@ public class AssignmentManager { /** * Processes list of regions in transition at startup */ - void processRegionsInTransition(Collection regionStates) { + void processRegionsInTransition(Collection regionsInTransition) { // We need to send RPC call again for PENDING_OPEN/PENDING_CLOSE regions // in case the RPC call is not sent out yet before the master was shut down // since we update the state before we send the RPC call. We can't update // the state after the RPC call. Otherwise, we don't know what's happened // to the region if the master dies right after the RPC call is out. - for (RegionState regionState: regionStates) { - if (!serverManager.isServerOnline(regionState.getServerName())) { + for (RegionState regionState: regionsInTransition) { + LOG.info("Processing " + regionState); + ServerName serverName = regionState.getServerName(); + // Server could be null in case of FAILED_OPEN when master cannot find a region plan. In that + // case, try assigning it here. + if (serverName != null && !serverManager.getOnlineServers().containsKey(serverName)) { + LOG.info("Server " + serverName + " isn't online. SSH will handle this"); continue; // SSH will handle it } + HRegionInfo regionInfo = regionState.getRegion(); RegionState.State state = regionState.getState(); - LOG.info("Processing " + regionState); switch (state) { case CLOSED: invokeAssign(regionState.getRegion()); @@ -1716,6 +1722,10 @@ public class AssignmentManager { case PENDING_CLOSE: retrySendRegionClose(regionState); break; + case FAILED_CLOSE: + case FAILED_OPEN: + invokeUnAssign(regionInfo); + break; default: // No process for other states } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java index f211754..cae1258 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java @@ -217,7 +217,7 @@ public class TestMasterFailover { HMaster master = masterThreads.get(0).getMaster(); assertTrue(master.isActiveMaster()); assertTrue(master.isInitialized()); - + // Create a table with a region online Table onlineTable = TEST_UTIL.createTable(TableName.valueOf("onlineTable"), "family"); onlineTable.close(); @@ -260,7 +260,36 @@ public class TestMasterFailover { oldState = new RegionState(hriOffline, State.OFFLINE); newState = new RegionState(hriOffline, State.PENDING_OPEN, newState.getServerName()); stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); - + + HRegionInfo failedClose = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedClose, rootdir, conf, offlineTable); + MetaTableAccessor.addRegionToMeta(master.getConnection(), failedClose); + + oldState = new RegionState(failedClose, State.PENDING_CLOSE); + newState = new RegionState(failedClose, State.FAILED_CLOSE, newState.getServerName()); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + HRegionInfo failedOpen = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedOpen, rootdir, conf, offlineTable); + MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpen); + + // Simulate a region transitioning to failed open when the region server reports the + // transition as FAILED_OPEN + oldState = new RegionState(failedOpen, State.PENDING_OPEN); + newState = new RegionState(failedOpen, State.FAILED_OPEN, newState.getServerName()); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + HRegionInfo failedOpenNullServer = new HRegionInfo(offlineTable.getTableName(), null, null); + LOG.info("Failed open NUll server " + failedOpenNullServer.getEncodedName()); + createRegion(failedOpenNullServer, rootdir, conf, offlineTable); + MetaTableAccessor.addRegionToMeta(master.getConnection(), failedOpenNullServer); + + // Simulate a region transitioning to failed open when the master couldn't find a plan for + // the region + oldState = new RegionState(failedOpenNullServer, State.OFFLINE); + newState = new RegionState(failedOpenNullServer, State.FAILED_OPEN, null); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + // Stop the master log("Aborting master"); cluster.abortMaster(0); @@ -283,6 +312,9 @@ public class TestMasterFailover { // Both pending_open (RPC sent/not yet) regions should be online assertTrue(regionStates.isRegionOnline(hriOffline)); assertTrue(regionStates.isRegionOnline(hriOnline)); + assertTrue(regionStates.isRegionOnline(failedClose)); + assertTrue(regionStates.isRegionOnline(failedOpenNullServer)); + assertTrue(regionStates.isRegionOnline(failedOpen)); log("Done with verification, shutting down cluster");