Index: src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (revision 1185442) +++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (working copy) @@ -314,9 +314,7 @@ // Returns servers who have not checked in (assumed dead) and their regions Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers = rebuildUserRegions(); - // Process list of dead servers; note this will add regions to the RIT. - // processRegionsInTransition will read them and assign them out. - processDeadServers(deadServers); + // Check existing regions in transition processRegionsInTransition(deadServers); @@ -374,11 +372,10 @@ // If we found user regions out on cluster, its a failover. if (regionsToProcess) { LOG.info("Found regions out on cluster or in RIT; failover"); - if (!nodes.isEmpty()) { - for (String encodedRegionName: nodes) { - processRegionInTransition(encodedRegionName, null, deadServers); - } - } + // Process list of dead servers and regions in RIT. + // See HBASE-4580 for more information. + processDeadServersAndRecoverLostRegions(deadServers, nodes); + } else { // Fresh cluster startup. LOG.info("Clean cluster startup. Assigning userregions"); @@ -2222,25 +2219,28 @@ } /** - * Processes list of dead servers from result of META scan. + * Processes list of dead servers from result of META scan and regions in RIT. * <p>

- * This is used as part of failover to handle RegionServers which failed - * while there was no active master. + * This is used for failover to recover the lost regions that belonged to + * RegionServers which failed while there was no active master, as well as + * regions that were in transition (RIT). * <p>

- * Method stubs in-memory data to be as expected by the normal server shutdown - * handler. - * + * * @param deadServers + * The list of dead servers which failed while there was no active + * master. + * @param nodes + * The regions in RIT * @throws IOException * @throws KeeperException */ - private void processDeadServers( - Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers) + private void processDeadServersAndRecoverLostRegions( + Map<ServerName, List<Pair<HRegionInfo, Result>>> deadServers, List<String> nodes) throws IOException, KeeperException { for (Map.Entry<ServerName, List<Pair<HRegionInfo, Result>>> deadServer: deadServers.entrySet()) { - List<Pair<HRegionInfo, Result>> regions = deadServer.getValue(); - for (Pair<HRegionInfo, Result> region : regions) { + List<Pair<HRegionInfo, Result>> regions = deadServer.getValue(); + for (Pair<HRegionInfo, Result> region : regions) { HRegionInfo regionInfo = region.getFirst(); Result result = region.getSecond(); // If region was in transition (was in zk) force it offline for reassign @@ -2252,24 +2252,32 @@ // we consider that this region is being handled. // So we should skip it and process it in processRegionsInTransition. if (data != null && data.getOrigin() != null && - serverManager.isServerOnline(data.getOrigin())) { + serverManager.isServerOnline(data.getOrigin())) { LOG.info("The region " + regionInfo.getEncodedName() + "is being handled on " + data.getOrigin()); continue; } // Process with existing RS shutdown code - boolean assign = - ServerShutdownHandler.processDeadRegion(regionInfo, result, this, - this.catalogTracker); + boolean assign = ServerShutdownHandler.processDeadRegion(regionInfo, + result, this, this.catalogTracker); if (assign) { ZKAssign.createOrForceNodeOffline(watcher, regionInfo, - master.getServerName()); + master.getServerName()); + if (!nodes.contains(regionInfo.getEncodedName())) { + nodes.add(regionInfo.getEncodedName()); + } } } catch (KeeperException.NoNodeException nne) { // This is fine } } } + + if (!nodes.isEmpty()) { + for (String encodedRegionName : nodes) { + processRegionInTransition(encodedRegionName, null, deadServers); + } + } } /*