commit c7b4bfdad80fc3e1e9393a46a86368b832827f1c Author: Virag Kothari Date: Fri Nov 14 17:48:16 2014 -0800 HBASE-12480 Regions in FAILED_OPEN/FAILED_CLOSE should be processed on master failover diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 7c7f0b6..a9dc530 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -451,8 +451,9 @@ public class AssignmentManager { Map regionsInTransition = regionStates.getRegionsInTransition(); if (!regionsInTransition.isEmpty()) { for (RegionState regionState: regionsInTransition.values()) { + ServerName serverName = regionState.getServerName(); if (!regionState.getRegion().isMetaRegion() - && onlineServers.contains(regionState.getServerName())) { + && serverName != null && onlineServers.contains(serverName)) { LOG.debug("Found " + regionState + " in RITs"); failover = true; break; @@ -1664,18 +1665,23 @@ public class AssignmentManager { /** * Processes list of regions in transition at startup */ - void processRegionsInTransition(Collection regionStates) { + void processRegionsInTransition(Collection regionsInTransition) { // We need to send RPC call again for PENDING_OPEN/PENDING_CLOSE regions // in case the RPC call is not sent out yet before the master was shut down // since we update the state before we send the RPC call. We can't update // the state after the RPC call. Otherwise, we don't know what's happened // to the region if the master dies right after the RPC call is out. 
- for (RegionState regionState: regionStates) { - if (!serverManager.isServerOnline(regionState.getServerName())) { + for (RegionState regionState: regionsInTransition) { + LOG.info("Processing " + regionState); + ServerName serverName = regionState.getServerName(); + // Server could be null in case of FAILED_OPEN when master cannot find a region plan. In that + // case, try assigning it here. + if (serverName != null && !serverManager.getOnlineServers().containsKey(serverName)) { + LOG.info("Server " + serverName + " isn't online. SSH will handle this"); continue; // SSH will handle it } + HRegionInfo regionInfo = regionState.getRegion(); RegionState.State state = regionState.getState(); - LOG.info("Processing " + regionState); switch (state) { case CLOSED: invokeAssign(regionState.getRegion()); @@ -1686,6 +1692,18 @@ public class AssignmentManager { case PENDING_CLOSE: retrySendRegionClose(regionState); break; + case FAILED_CLOSE: + case FAILED_OPEN: + unassign(regionInfo, regionState.getServerName(), null); + regionState = regionStates.getRegionState(regionInfo); + if (regionState.isFailedClose()) { + // If we can't close the region, we can't re-assign + // it so as to avoid possible double assignment/data loss. 
+ LOG.info("Skip assigning " + regionInfo + ", we couldn't close it: " + regionState); + continue; + } + assign(regionState, false); + break; default: // No process for other states } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java index 6c17686..b4044d1 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java @@ -257,7 +257,36 @@ public class TestMasterFailover { oldState = new RegionState(hriOffline, State.OFFLINE); newState = new RegionState(hriOffline, State.PENDING_OPEN, newState.getServerName()); stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); - + + HRegionInfo failedClose = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedClose, rootdir, conf, offlineTable); + MetaTableAccessor.addRegionToMeta(master.getShortCircuitConnection(), failedClose); + + oldState = new RegionState(failedClose, State.PENDING_CLOSE); + newState = new RegionState(failedClose, State.FAILED_CLOSE, newState.getServerName()); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + HRegionInfo failedOpen = new HRegionInfo(offlineTable.getTableName(), null, null); + createRegion(failedOpen, rootdir, conf, offlineTable); + MetaTableAccessor.addRegionToMeta(master.getShortCircuitConnection(), failedOpen); + + // Simulate a region transitioning to failed open when the region server reports the + // transition as FAILED_OPEN + oldState = new RegionState(failedOpen, State.PENDING_OPEN); + newState = new RegionState(failedOpen, State.FAILED_OPEN, newState.getServerName()); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + + HRegionInfo failedOpenNullServer = new HRegionInfo(offlineTable.getTableName(), null, null); + LOG.info("Failed open NUll 
server " + failedOpenNullServer.getEncodedName()); + createRegion(failedOpenNullServer, rootdir, conf, offlineTable); + MetaTableAccessor.addRegionToMeta(master.getShortCircuitConnection(), failedOpenNullServer); + + // Simulate a region transitioning to failed open when the master couldn't find a plan for + // the region + oldState = new RegionState(failedOpenNullServer, State.OFFLINE); + newState = new RegionState(failedOpenNullServer, State.FAILED_OPEN, null); + stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState); + // Stop the master log("Aborting master"); cluster.abortMaster(0); @@ -280,6 +309,9 @@ public class TestMasterFailover { // Both pending_open (RPC sent/not yet) regions should be online assertTrue(regionStates.isRegionOnline(hriOffline)); assertTrue(regionStates.isRegionOnline(hriOnline)); + assertTrue(regionStates.isRegionOnline(failedClose)); + assertTrue(regionStates.isRegionOnline(failedOpenNullServer)); + assertTrue(regionStates.isRegionOnline(failedOpen)); log("Done with verification, shutting down cluster");