Index: src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (revision 1245318) +++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (working copy) @@ -2702,12 +2702,15 @@ return matchAM; } + /** * Process shutdown server removing any assignments. * @param sn Server that went down. - * @return list of regions in transition on this server + * @return the regions in transition on this server plus the regions whose region plans had this server as destination */ - public List processServerShutdown(final ServerName sn) { + public RegionsOnDeadServer processServerShutdown(final ServerName sn) { + RegionsOnDeadServer regionsOnDeadServer = new RegionsOnDeadServer(); + Set regionsFromRegionPlansForServer = new HashSet(); // Clean out any existing assignment plans for this server synchronized (this.regionPlans) { for (Iterator > i = @@ -2716,11 +2719,16 @@ ServerName otherSn = e.getValue().getDestination(); // The name will be null if the region is planned for a random assign. if (otherSn != null && otherSn.equals(sn)) { + // Remember the region of every plan destined for this server before the plan is removed. + regionsFromRegionPlansForServer.add(e.getValue().getRegionInfo()); // Use iterator's remove else we'll get CME i.remove(); } } } + + regionsOnDeadServer + .setRegionsFromRegionPlansForServer(regionsFromRegionPlansForServer); // TODO: Do we want to sync on RIT here? // Remove this server from map of servers to regions, and remove all regions // of this server from online map of regions. 
@@ -2729,8 +2737,9 @@ synchronized (this.regions) { Set assignedRegions = this.servers.remove(sn); if (assignedRegions == null || assignedRegions.isEmpty()) { + regionsOnDeadServer.setRegionsInTransition(rits); // No regions on this server, we are done, return empty list of RITs - return rits; + return regionsOnDeadServer; } deadRegions = new TreeSet(assignedRegions); for (HRegionInfo region : deadRegions) { @@ -2747,7 +2756,8 @@ } } } - return rits; + regionsOnDeadServer.setRegionsInTransition(rits); + return regionsOnDeadServer; } /** @@ -2986,6 +2996,27 @@ this.timeoutMonitor.interrupt(); } + public boolean isRegionOnline(HRegionInfo hri) { /* true only if hri maps to a server that is online; a mapping to a server that is not online is removed as a side effect */ + ServerName serverName = null; + synchronized (this.regions) { + serverName = this.regions.get(hri); + if (serverName == null) { + return false; + } + if (this.isServerOnline(serverName)) { + return true; + } else { + // The mapped server is not online: remove the stale assignment mapping for hri. + Set hriSet = this.servers.get(serverName); + if (hriSet != null) { + hriSet.remove(hri); + } + this.regions.remove(hri); + return false; + } + } + } + /** * Check whether the RegionServer is online. * @param serverName @@ -2994,6 +3025,7 @@ public boolean isServerOnline(ServerName serverName) { return this.serverManager.isServerOnline(serverName); } + /** * Shutdown the threadpool executor service */ @@ -3002,4 +3034,30 @@ this.threadPoolExecutorService.shutdown(); } } + + /** + * Holds the regions related to a dead server; returned by processServerShutdown. + */ + public static class RegionsOnDeadServer { + // Regions taken from the region plans that had the dead server as destination, and the regions in transition. 
+ private Set regionsFromRegionPlansForServer = null; + private List regionsInTransition = null; + + public Set getRegionsFromRegionPlansForServer() { + return regionsFromRegionPlansForServer; + } + + public void setRegionsFromRegionPlansForServer( + Set regionsFromRegionPlansForServer) { + this.regionsFromRegionPlansForServer = regionsFromRegionPlansForServer; + } + + public List getRegionsInTransition() { + return regionsInTransition; + } + + public void setRegionsInTransition(List regionsInTransition) { + this.regionsInTransition = regionsInTransition; + } + } } Index: src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (revision 1245318) +++ src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (working copy) @@ -20,9 +20,12 @@ package org.apache.hadoop.hbase.master.handler; import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.NavigableMap; +import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -37,6 +40,7 @@ import org.apache.hadoop.hbase.executor.EventHandler; import org.apache.hadoop.hbase.master.AssignmentManager; import org.apache.hadoop.hbase.master.AssignmentManager.RegionState; +import org.apache.hadoop.hbase.master.AssignmentManager.RegionsOnDeadServer; import org.apache.hadoop.hbase.master.DeadServer; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.ServerManager; @@ -227,11 +231,14 @@ // doing after log splitting. Could do some states before -- OPENING? // OFFLINE? -- and then others after like CLOSING that depend on log // splitting. 
- List regionsInTransition = - this.services.getAssignmentManager() - .processServerShutdown(this.serverName); + Set regionsFromRegionPlansForServer = new HashSet(); /* NOTE(review): placeholder value; reassigned below from processServerShutdown's result */ + List regionsInTransition = new ArrayList(); /* NOTE(review): placeholder value; reassigned below from processServerShutdown's result */ + RegionsOnDeadServer regionsOnDeadServer = this.services + .getAssignmentManager().processServerShutdown(this.serverName); + regionsFromRegionPlansForServer = regionsOnDeadServer + .getRegionsFromRegionPlansForServer(); + regionsInTransition = regionsOnDeadServer.getRegionsInTransition(); - // Wait on meta to come online; we need it to progress. // TODO: Best way to hold strictly here? We should build this retry logic // into the MetaReader operations themselves. @@ -289,7 +296,8 @@ RegionState rit = this.services.getAssignmentManager().isRegionInTransition(e.getKey()); ServerName addressFromAM = this.services.getAssignmentManager() .getRegionServerOfRegion(e.getKey()); - if (rit != null && !rit.isClosing() && !rit.isPendingClose()) { + if (rit != null && !rit.isClosing() && !rit.isPendingClose() + && !regionsFromRegionPlansForServer.contains(rit.getRegion())) { // Skip regions that were in transition unless CLOSING or // PENDING_CLOSE LOG.info("Skip assigning region " + rit.toString()); @@ -301,10 +309,21 @@ + addressFromAM.getServerName()); } else { this.services.getAssignmentManager().assign(e.getKey(), true); + regionsFromRegionPlansForServer.remove(e.getKey()); /* assigned just above, so drop it from the set that the loop after this one walks */ } } } } + + int reassignedPlans = 0; /* number of plan regions found not online and assigned by the loop below */ + for (HRegionInfo hri : regionsFromRegionPlansForServer) { + if (!this.services.getAssignmentManager().isRegionOnline(hri)) { + this.services.getAssignmentManager().assign(hri, true); + reassignedPlans++; + } + } + LOG.info(reassignedPlans + " regions which planned to open on " + + this.serverName + " be re-assigned."); } finally { this.deadServers.finish(serverName); }