Index: src/main/java/org/apache/hadoop/hbase/master/HMaster.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/HMaster.java (revision 1231810) +++ src/main/java/org/apache/hadoop/hbase/master/HMaster.java (working copy) @@ -26,15 +26,16 @@ import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -69,7 +70,6 @@ import org.apache.hadoop.hbase.ipc.HMasterRegionInterface; import org.apache.hadoop.hbase.ipc.ProtocolSignature; import org.apache.hadoop.hbase.ipc.RpcServer; -import org.apache.hadoop.hbase.master.CatalogJanitor.SplitParentFirstComparator; import org.apache.hadoop.hbase.master.handler.CreateTableHandler; import org.apache.hadoop.hbase.master.handler.DeleteTableHandler; import org.apache.hadoop.hbase.master.handler.DisableTableHandler; @@ -510,10 +510,17 @@ } } + Set knownServers = new HashSet(); + knownServers.addAll(serverManager.getOnlineServers().keySet()); + if (this.serverManager.areDeadServersInProgress()) { + // Dead servers are under processing, their logs would be split by + // ServerShutdownHandler + knownServers.addAll(serverManager.getDeadServersBeingProcessed()); + } + // TODO: Should do this in background rather than block master startup status.setStatus("Splitting logs after master startup"); - this.fileSystemManager. 
- splitLogAfterStartup(this.serverManager.getOnlineServers().keySet()); + this.fileSystemManager.splitLogAfterStartup(knownServers); // Make sure root and meta assigned before proceeding. assignRootAndMeta(status); @@ -526,7 +533,7 @@ // Fixup assignment manager status status.setStatus("Starting assignment manager"); - this.assignmentManager.joinCluster(); + this.assignmentManager.joinCluster(knownServers); this.balancer.setClusterStatus(getClusterStatus()); this.balancer.setMasterServices(this); @@ -584,8 +591,12 @@ // root so don't do it here. expiredServer = currentRootServer; } else { - // Root was not on an online server when we failed verification - this.assignmentManager.assignRoot(); + if (currentRootServer == null + || !this.serverManager.isDeadServerBeingProcessed(currentRootServer)) { + // Root was not on an online server when we failed verification and + // also was not being processed as dead server by SSH. + this.assignmentManager.assignRoot(); + } } this.catalogTracker.waitForRoot(); //This guarantees that the transition has completed @@ -611,7 +622,10 @@ // The expiration processing will take care of reassigning meta. expireIfOnline(currentMetaServer); } else { - this.assignmentManager.assignMeta(); + if (currentMetaServer == null + || !this.serverManager.isDeadServerBeingProcessed(currentMetaServer)) { + this.assignmentManager.assignMeta(); + } } this.catalogTracker.waitForMeta(); // Above check waits for general meta availability but this does not Index: src/main/java/org/apache/hadoop/hbase/master/DeadServer.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/DeadServer.java (revision 1231810) +++ src/main/java/org/apache/hadoop/hbase/master/DeadServer.java (working copy) @@ -28,7 +28,8 @@ import org.apache.hadoop.hbase.ServerName; /** - * Class to hold dead servers list and utility querying dead server list. 
+ * Class to hold the dead servers list, utilities for querying it, and the set of + * dead servers currently being processed by the ServerShutdownHandler. */ public class DeadServer implements Set { /** @@ -39,6 +40,10 @@ * because by then, its regions have probably been reassigned. */ private final Set deadServers = new HashSet(); + /** + * Set of dead servers under processing by the ServerShutdownHandler. + */ + private final Set deadServersUnderProcessing = new HashSet(); /** Number of dead servers currently being processed */ private int numProcessing; @@ -101,13 +106,22 @@ return clone; } + synchronized Set getDeadServersBeingProcessed() { + Set clone = new HashSet( + this.deadServersUnderProcessing.size()); + clone.addAll(this.deadServersUnderProcessing); + return clone; + } + public synchronized boolean add(ServerName e) { this.numProcessing++; + deadServersUnderProcessing.add(e); return deadServers.add(e); } public synchronized void finish(ServerName e) { this.numProcessing--; + deadServersUnderProcessing.remove(e); } public synchronized int size() { Index: src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (revision 1231810) +++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (working copy) @@ -301,13 +301,15 @@ } /** - * Called on startup. - * Figures whether a fresh cluster start of we are joining extant running cluster. + * Called on startup. Figures out whether this is a fresh cluster start or + * whether we are joining an extant running cluster. 
+ * @param onlineServers servers known at master startup (online, or dead and in processing) * @throws IOException * @throws KeeperException * @throws InterruptedException */ - void joinCluster() throws IOException, KeeperException, InterruptedException { + void joinCluster(final Set onlineServers) throws IOException, + KeeperException, InterruptedException { // Concurrency note: In the below the accesses on regionsInTransition are // outside of a synchronization block where usually all accesses to RIT are // synchronized. The presumption is that in this case it is safe since this @@ -318,8 +320,8 @@ // Scan META to build list of existing regions, servers, and assignment // Returns servers who have not checked in (assumed dead) and their regions - Map>> deadServers = - rebuildUserRegions(); + Map>> deadServers = + rebuildUserRegions(onlineServers); processDeadServersAndRegionsInTransition(deadServers); @@ -2189,11 +2191,14 @@ *

* Returns a map of servers that are not found to be online and the regions * they were hosting. - * @return map of servers not online to their assigned regions, as stored - * in META + * @param onlineServers servers treated as online; a region located on one of + * them is not counted as belonging to an offline server + * @return map of servers not online to their assigned regions, as stored in + * META * @throws IOException */ - Map>> rebuildUserRegions() + Map>> rebuildUserRegions( + final Set onlineServers) throws IOException, KeeperException { // Region assignment from META List results = MetaReader.fullScan(this.catalogTracker); @@ -2225,7 +2230,7 @@ } addTheTablesInPartialState(this.disablingTables, this.enablingTables, regionInfo, tableName); - } else if (!this.serverManager.isServerOnline(regionLocation)) { + } else if (!onlineServers.contains(regionLocation)) { // Region is located on a server that isn't online List> offlineRegions = offlineServers.get(regionLocation); Index: src/main/java/org/apache/hadoop/hbase/master/ServerManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (revision 1231810) +++ src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (working copy) @@ -45,10 +45,10 @@ import org.apache.hadoop.hbase.client.HConnectionManager; import org.apache.hadoop.hbase.client.RetriesExhaustedException; import org.apache.hadoop.hbase.ipc.HRegionInterface; -import org.apache.hadoop.hbase.regionserver.RegionOpeningState; import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler; import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler; import org.apache.hadoop.hbase.monitoring.MonitoredTask; +import org.apache.hadoop.hbase.regionserver.RegionOpeningState; /** * The ServerManager class manages info about region servers. @@ -295,11 +295,30 @@ } } + /** + * @return Set of known dead servers. 
+ */ public Set getDeadServers() { return this.deadservers.clone(); } /** + * @return Set of dead servers which are being processed by the + * ServerShutdownHandler. + */ + Set getDeadServersBeingProcessed() { + return this.deadservers.getDeadServersBeingProcessed(); + } + + public boolean isDeadServerBeingProcessed(ServerName server) { + if (!this.deadservers.areDeadServersInProgress()) { + return false; + } + return this.deadservers.getDeadServersBeingProcessed().contains(server); + + } + + /** * Checks if any dead servers are currently in progress. * @return true if any RS are being processed as dead, false if not */ @@ -629,4 +648,5 @@ } } } + }