Index: src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java (revision 1230405) +++ src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java (working copy) @@ -225,7 +225,23 @@ public HServerAddress getMetaLocation() { return this.metaLocation; } + + /** + * Method used by master on startup trying to figure state of cluster. + * Returns the current meta location unless it is null. In this latter case, + * it has not yet been set so go check what's up in -ROOT- and + * return that. + * @return {@link HServerAddress} for server hosting .META. or if null, + * we'll read the location that is up in -ROOT- table (which + * could be null or just plain stale). + * @throws IOException + */ + public HServerAddress getMetaLocationOrReadLocationFromRoot() throws IOException { + HServerAddress sn = getMetaLocation(); + return sn != null? sn: MetaReader.getMetaRegionLocation(this); + } + /** * Waits indefinitely for availability of -ROOT-. Used during * cluster startup. Index: src/main/java/org/apache/hadoop/hbase/catalog/MetaReader.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/catalog/MetaReader.java (revision 1230405) +++ src/main/java/org/apache/hadoop/hbase/catalog/MetaReader.java (working copy) @@ -276,6 +276,19 @@ } /** + * Gets the location of .META. region by reading content of + * -ROOT-. + * @param ct catalog tracker used to connect to the server hosting -ROOT- + * @return location of .META. region as a {@link HServerAddress} or + * null if not found + * @throws IOException + */ + static HServerAddress getMetaRegionLocation(final CatalogTracker ct) + throws IOException { + return MetaReader.readMetaLocation(ct.waitForRootServerConnectionDefault()); + } + + /** * Reads the location of META from ROOT. 
* @param metaServer connection to server hosting ROOT * @return location of META in ROOT, null if not available Index: src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (revision 1230405) +++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (working copy) @@ -193,11 +193,13 @@ /** * Handle failover. Restore state from META and ZK. Handle any regions in * transition. Presumes .META. and -ROOT- deployed. + * @param onlineServers servers known when the master started (online, or dead but being processed) * @throws KeeperException * @throws IOException * @throws InterruptedException */ - void processFailover() throws KeeperException, IOException, InterruptedException { + void processFailover(final Set onlineServers) + throws KeeperException, IOException, InterruptedException { // Concurrency note: In the below the accesses on regionsInTransition are // outside of a synchronization block where usually all accesses to RIT are // synchronized. The presumption is that in this case it is safe since this @@ -218,7 +220,7 @@ // Scan META to build list of existing regions, servers, and assignment // Returns servers who have not checked in (assumed dead) and their regions Map>> deadServers = - rebuildUserRegions(); + rebuildUserRegions(onlineServers); // Process list of dead servers; note this will add regions to the RIT. // processRegionsInTransition will read them and assign them out. processDeadServers(deadServers); @@ -1560,12 +1562,15 @@ *

* Returns a map of servers that are not found to be online and the regions * they were hosting. + * @param onlineServers if one region's location belongs to onlineServers, it + * doesn't need to be assigned * @return map of servers not online to their assigned regions, as stored * in META * @throws IOException * @throws KeeperException */ - private Map>> rebuildUserRegions() + private Map>> rebuildUserRegions( + final Set onlineServers) throws IOException, KeeperException { // Region assignment from META List results = MetaReader.fullScanOfResults(catalogTracker); @@ -1592,7 +1597,7 @@ if (checkIfRegionBelongsToDisabling(regionInfo)) { disablingTables.add(disablingTableName); } - } else if (!serverManager.isServerOnline(regionLocation.getServerName())) { + } else if (!onlineServers.contains(regionLocation.getServerName())) { // Region is located on a server that isn't online List> offlineRegions = offlineServers.get(regionLocation.getServerName()); Index: src/main/java/org/apache/hadoop/hbase/master/DeadServer.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/DeadServer.java (revision 1230405) +++ src/main/java/org/apache/hadoop/hbase/master/DeadServer.java (working copy) @@ -22,15 +22,14 @@ import java.util.Collection; import java.util.HashSet; import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; import java.util.Set; import org.apache.commons.lang.NotImplementedException; import org.apache.hadoop.hbase.HServerInfo; /** - * Class to hold dead servers list and utility querying dead server list. + * Class to hold dead servers list, utility querying dead server list and the + * dead servers being processed by the ServerShutdownHandler. */ public class DeadServer implements Set { /** @@ -41,7 +40,11 @@ * because by then, its regions have probably been reassigned. 
*/ private final Set deadServers = new HashSet(); - + /** + * Set of dead servers currently being processed by the ServerShutdownHandler. + */ + private final Set deadServersUnderProcessing = new HashSet(); + /** Maximum number of dead servers to keep track of */ private final int maxDeadServers; @@ -111,13 +114,22 @@ return clone; } + synchronized Set getDeadServersBeingProcessed() { + Set clone = new HashSet( + this.deadServersUnderProcessing.size()); + clone.addAll(this.deadServersUnderProcessing); + return clone; + } + public synchronized boolean add(String e) { this.numProcessing++; + deadServersUnderProcessing.add(e); return deadServers.add(e); } public synchronized void finish(String e) { this.numProcessing--; + deadServersUnderProcessing.remove(e); } public synchronized int size() { Index: src/main/java/org/apache/hadoop/hbase/master/HMaster.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/HMaster.java (revision 1230405) +++ src/main/java/org/apache/hadoop/hbase/master/HMaster.java (working copy) @@ -25,8 +25,10 @@ import java.net.InetSocketAddress; import java.net.UnknownHostException; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.logging.Log; @@ -51,7 +53,6 @@ import org.apache.hadoop.hbase.catalog.CatalogTracker; import org.apache.hadoop.hbase.catalog.MetaEditor; import org.apache.hadoop.hbase.catalog.MetaReader; -import org.apache.hadoop.hbase.client.HConnectionManager; import org.apache.hadoop.hbase.client.MetaScanner; import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor; import org.apache.hadoop.hbase.client.Result; @@ -374,10 +375,16 @@ // Wait for region servers to report in. Returns count of regions. 
int regionCount = this.serverManager.waitForRegionServers(); - + + Set knownServers = new HashSet(); + knownServers.addAll(serverManager.getOnlineServers().keySet()); + if (this.serverManager.areDeadServersInProgress()) { + // Dead servers are still being processed; their logs will be split by + // ServerShutdownHandler + knownServers.addAll(serverManager.getDeadServersBeingProcessed()); + } // TODO: Should do this in background rather than block master startup - this.fileSystemManager. - splitLogAfterStartup(this.serverManager.getOnlineServers()); + this.fileSystemManager.splitLogAfterStartup(knownServers); // Make sure root and meta assigned before proceeding. assignRootAndMeta(); @@ -393,7 +400,7 @@ this.assignmentManager.assignAllUserRegions(); } else { LOG.info("Master startup proceeding: master failover"); - this.assignmentManager.processFailover(); + this.assignmentManager.processFailover(knownServers); } // Start balancer and meta catalog janitor after meta and regions have @@ -406,6 +413,15 @@ initialized = true; } + boolean isProcessingServer(HServerAddress address) { + if (serverManager.getDeadServersBeingProcessed() == null) { + return false; + } + String serverName = HServerInfo.getServerName(address, 1); + return HServerInfo.isServer(serverManager.getDeadServersBeingProcessed(), + serverName, true); + } + /** * Check -ROOT- and .META. are assigned. If not, * assign them. @@ -418,14 +434,19 @@ throws InterruptedException, IOException, KeeperException { int assigned = 0; long timeout = this.conf.getLong("hbase.catalog.verification.timeout", 1000); - + // Work on ROOT region. Is it in zk in transition? - boolean rit = this.assignmentManager. 
- processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.ROOT_REGIONINFO); + boolean rit = this.assignmentManager + .processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.ROOT_REGIONINFO); if (!catalogTracker.verifyRootRegionLocation(timeout)) { - this.assignmentManager.assignRoot(); - this.catalogTracker.waitForRoot(); - //This guarantees that the transition has completed + + HServerAddress address = catalogTracker.getRootLocation(); + if (address == null || !isProcessingServer(address)) { + this.assignmentManager.assignRoot(); + this.catalogTracker.waitForRoot(); + } + + // This guarantees that the transition has completed this.assignmentManager.waitForAssignment(HRegionInfo.ROOT_REGIONINFO); assigned++; } @@ -433,14 +454,21 @@ ", location=" + catalogTracker.getRootLocation()); // Work on meta region - rit = this.assignmentManager. - processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO); + rit = this.assignmentManager + .processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO); if (!this.catalogTracker.verifyMetaRegionLocation(timeout)) { - this.assignmentManager.assignMeta(); - this.catalogTracker.waitForMeta(); + + HServerAddress address = catalogTracker + .getMetaLocationOrReadLocationFromRoot(); + if (address == null || !isProcessingServer(address)) { + this.assignmentManager.assignMeta(); + this.catalogTracker.waitForMeta(); + } + // Above check waits for general meta availability but this does not // guarantee that the transition has completed - this.assignmentManager.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO); + this.assignmentManager + .waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO); assigned++; } LOG.info(".META. 
assigned=" + assigned + ", rit=" + rit + Index: src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (revision 1230405) +++ src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (working copy) @@ -20,7 +20,7 @@ package org.apache.hadoop.hbase.master; import java.io.IOException; -import java.util.Map; +import java.util.Set; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; @@ -155,7 +155,7 @@ * @param onlineServers Map of online servers keyed by * {@link HServerInfo#getServerName()} */ - void splitLogAfterStartup(final Map onlineServers) { + void splitLogAfterStartup(final Set onlineServers) { Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME); try { if (!this.fs.exists(logsDirPath)) { @@ -176,7 +176,7 @@ } for (FileStatus status : logFolders) { String serverName = status.getPath().getName(); - if (onlineServers.get(serverName) == null) { + if (!onlineServers.contains(serverName)) { LOG.info("Log folder " + status.getPath() + " doesn't belong " + "to a known region server, splitting"); splitLog(serverName); Index: src/main/java/org/apache/hadoop/hbase/master/ServerManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (revision 1230405) +++ src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (working copy) @@ -49,7 +49,6 @@ import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler; import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler; import org.apache.hadoop.hbase.master.metrics.MasterMetrics; -import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException; /** * The ServerManager class manages info about region servers - HServerInfo, @@ -420,11 +419,22 @@ } } + /** + * @return Set of known dead 
servers. + */ public Set getDeadServers() { return this.deadservers.clone(); } /** + * @return Set of dead servers which are being processed by the + * ServerShutdownHandler. + */ + public Set getDeadServersBeingProcessed() { + return this.deadservers.getDeadServersBeingProcessed(); + } + + /** * Checks if any dead servers are currently in progress. * @return true if any RS are being processed as dead, false if not */