### Eclipse Workspace Patch 1.0 #P hbase Index: src/main/java/org/apache/hadoop/hbase/master/ServerManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (revision 1338528) +++ src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (working copy) @@ -200,7 +200,10 @@ existingServer + " looks stale, new server:" + serverName); expireServer(existingServer); } - throw new PleaseHoldException(message); + if (services.isServerShutdownHandlerEnabled()) { + // master has completed the initialization + throw new PleaseHoldException(message); + } } } Index: src/main/java/org/apache/hadoop/hbase/master/HMaster.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/HMaster.java (revision 1338528) +++ src/main/java/org/apache/hadoop/hbase/master/HMaster.java (working copy) @@ -577,11 +577,9 @@ } this.assignmentManager.startTimeOutMonitor(); - Set onlineServers = new HashSet(serverManager - .getOnlineServers().keySet()); // TODO: Should do this in background rather than block master startup status.setStatus("Splitting logs after master startup"); - splitLogAfterStartup(this.fileSystemManager, onlineServers); + splitLogAfterStartup(this.fileSystemManager); // Make sure root and meta assigned before proceeding. 
if (!assignRootAndMeta(status)) return; @@ -598,7 +596,7 @@ this.balancer.setMasterServices(this); // Fixup assignment manager status status.setStatus("Starting assignment manager"); - this.assignmentManager.joinCluster(onlineServers); + this.assignmentManager.joinCluster(); this.balancer.setClusterStatus(getClusterStatus()); @@ -642,9 +640,8 @@ * @param mfs * @param onlineServers */ - protected void splitLogAfterStartup(final MasterFileSystem mfs, - Set onlineServers) { - mfs.splitLogAfterStartup(onlineServers); + protected void splitLogAfterStartup(final MasterFileSystem mfs) { + mfs.splitLogAfterStartup(); } /** Index: src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (revision 1338528) +++ src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (working copy) @@ -188,7 +188,7 @@ * @param onlineServers Set of online servers keyed by * {@link ServerName} */ - void splitLogAfterStartup(final Set onlineServers) { + void splitLogAfterStartup() { boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors", HLog.SPLIT_SKIP_ERRORS_DEFAULT); Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME); @@ -197,6 +197,10 @@ try { if (!this.fs.exists(logsDirPath)) return; FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null); + // Get online servers after getting log folders to avoid log folder deletion of newly + // checked in region servers.
See HBASE-5916 + Set onlineServers = ((HMaster) master).getServerManager().getOnlineServers() + .keySet(); if (logFolders == null || logFolders.length == 0) { LOG.debug("No log files to split, proceeding..."); Index: src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (revision 1338528) +++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (working copy) @@ -343,7 +343,7 @@ * @throws KeeperException * @throws InterruptedException */ - void joinCluster(final Set onlineServers) throws IOException, + void joinCluster() throws IOException, KeeperException, InterruptedException { // Concurrency note: In the below the accesses on regionsInTransition are // outside of a synchronization block where usually all accesses to RIT are @@ -355,7 +355,7 @@ // Scan META to build list of existing regions, servers, and assignment // Returns servers who have not checked in (assumed dead) and their regions - Map>> deadServers = rebuildUserRegions(onlineServers); + Map>> deadServers = rebuildUserRegions(); // This method will assign all user regions if a clean server startup or // it will reconstitute master state and cleanup any leftovers from @@ -369,16 +369,6 @@ } /** - * Only used for tests - * @throws IOException - * @throws KeeperException - * @throws InterruptedException - */ - void joinCluster() throws IOException, KeeperException, InterruptedException { - joinCluster(serverManager.getOnlineServers().keySet()); - } - - /** * Process all regions that are in transition up in zookeeper. Used by * master joining an already running cluster. 
* @throws KeeperException @@ -2499,11 +2489,12 @@ * in META * @throws IOException */ - Map>> rebuildUserRegions( - final Set onlineServers) + Map>> rebuildUserRegions() throws IOException, KeeperException { // Region assignment from META List results = MetaReader.fullScan(this.catalogTracker); + // Fetch the online servers after the META scan to include new region servers that were slow to check in + Set onlineServers = serverManager.getOnlineServers().keySet(); // Map of offline servers and their regions to be returned Map>> offlineServers = new TreeMap>>(); @@ -2712,7 +2703,13 @@ final List nodes) throws KeeperException, IOException { if (deadServers == null) return; + Set actualDeadServers = this.serverManager.getDeadServers(); for (Map.Entry>> deadServer: deadServers.entrySet()) { + // Skip servers the server manager already lists as dead: SSH (ServerShutdownHandler) will process their regions during RS expiration. + // See HBASE-5916. + if(actualDeadServers.contains(deadServer.getKey())){ + continue; + } List> regions = deadServer.getValue(); for (Pair region : regions) { HRegionInfo regionInfo = region.getFirst(); Index: src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java (revision 1338528) +++ src/test/java/org/apache/hadoop/hbase/regionserver/TestRSKilledWhenMasterInitializing.java (working copy) @@ -49,6 +49,8 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; import org.apache.hadoop.hbase.util.Threads; +import org.apache.hadoop.hbase.zookeeper.ZKAssign; +import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.hadoop.hbase.LargeTests; import org.apache.zookeeper.KeeperException; import org.junit.AfterClass; @@ -98,9 +100,8 @@ } @Override - protected void splitLogAfterStartup(MasterFileSystem mfs, - Set onlineServers) { - super.splitLogAfterStartup(mfs, 
onlineServers); + protected void splitLogAfterStartup(MasterFileSystem mfs) { + super.splitLogAfterStartup(mfs); logSplit = true; // If "TestingMaster.sleep" is set, sleep after log split. if (getConfiguration().getBoolean("TestingMaster.sleep", false)) { @@ -215,6 +216,10 @@ while (serverManager.areDeadServersInProgress()) { Thread.sleep(100); } + // Create a ZKW to use in the test + ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TESTUTIL); + ZKAssign.blockUntilNoRIT(zkw); + table = new HTable(TESTUTIL.getConfiguration(), TABLENAME); resultScanner = table.getScanner(new Scan()); count = 0;