Index: src/main/java/org/apache/hadoop/hbase/master/DeadServer.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/DeadServer.java (revision 1465359)
+++ src/main/java/org/apache/hadoop/hbase/master/DeadServer.java (working copy)
@@ -40,6 +40,13 @@
    */
   private final Set deadServers = new HashSet();
 
+  /**
+   * Set of dead servers that are still being processed. A server is added
+   * before its ServerShutdownHandler is started and is removed once that
+   * ServerShutdownHandler has finished.
+   */
+  private final Set<ServerName> processingDeadServers = new HashSet<ServerName>();
+
   /** Number of dead servers currently being processed */
   private int numProcessing;
 
@@ -103,11 +110,13 @@
 
   public synchronized boolean add(ServerName e) {
     this.numProcessing++;
+    this.processingDeadServers.add(e);
     return deadServers.add(e);
   }
 
   public synchronized void finish(ServerName e) {
     this.numProcessing--;
+    this.processingDeadServers.remove(e);
   }
 
   public synchronized int size() {
@@ -122,6 +131,15 @@
     return deadServers.contains(o);
   }
 
+  /**
+   * Check whether a dead server is still being processed.
+   * @param sn ServerName to check.
+   * @return true if sn has been added and its processing has not finished yet.
+   */
+  public synchronized boolean isProcessingDeadServer(ServerName sn) {
+    return processingDeadServers.contains(sn);
+  }
+
   public Iterator iterator() {
     return this.deadServers.iterator();
   }
Index: src/main/java/org/apache/hadoop/hbase/master/HMaster.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/HMaster.java (revision 1465359)
+++ src/main/java/org/apache/hadoop/hbase/master/HMaster.java (working copy)
@@ -585,7 +585,6 @@
 
     // Make sure root and meta assigned before proceeding.
     assignRootAndMeta(status);
-    enableServerShutdownHandler();
 
     // Update meta with new HRI if required. i.e migrate all HRI with HTD to
     // HRI with out HTD in meta and update the status in ROOT. This must happen
@@ -704,23 +703,36 @@
     LOG.info("-ROOT- assigned=" + assigned + ", rit=" + rit +
       ", location=" + catalogTracker.getRootLocation());
 
+    // Enable ServerShutdownHandler before assigning META in case the region
+    // server hosting ROOT has gone. See HBASE-8251.
+    this.enableServerShutdownHandler();
     // Work on meta region
     status.setStatus("Assigning META region");
     rit = this.assignmentManager.
       processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO);
     boolean metaRegionLocation = this.catalogTracker.verifyMetaRegionLocation(timeout);
     if (!rit && !metaRegionLocation) {
-      ServerName currentMetaServer =
-        this.catalogTracker.getMetaLocationOrReadLocationFromRoot();
-      if (currentMetaServer != null
-          && !currentMetaServer.equals(currentRootServer)) {
-        splitLogAndExpireIfOnline(currentMetaServer);
+      ServerName currentMetaServer = this.catalogTracker
+          .getMetaLocationOrReadLocationFromRoot();
+      // Leave META assignment to ServerShutdownHandler if the current META
+      // server is an online server or a dead server still being processed.
+      boolean needToAssign = !(currentMetaServer != null && this.serverManager
+          .isOnlineOrProcessingDeadServer(currentMetaServer));
+      if (needToAssign) {
+        if (currentMetaServer != null
+            && !currentMetaServer.equals(currentRootServer)) {
+          splitLogAndExpireIfOnline(currentMetaServer);
+        }
+        assignmentManager.assignMeta();
+      } else {
+        LOG.info("Skip assign META since ServerShutdownHandler will take it over.");
       }
-      assignmentManager.assignMeta();
-      enableSSHandWaitForMeta();
-      assigned++;
+      waitForMetaAssignment();
+      if (needToAssign) {
+        assigned++;
+      }
     } else if (rit && !metaRegionLocation) {
-      enableSSHandWaitForMeta();
+      waitForMetaAssignment();
       assigned++;
     } else {
       // Region already assigned. We didnt' assign it. Add to in-memory state.
@@ -734,9 +746,8 @@
     return assigned;
   }
 
-  private void enableSSHandWaitForMeta() throws IOException,
+  private void waitForMetaAssignment() throws IOException,
       InterruptedException {
-    enableServerShutdownHandler();
     this.catalogTracker.waitForMeta();
     // Above check waits for general meta availability but this does not
     // guarantee that the transition has completed
Index: src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (revision 1465359)
+++ src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (working copy)
@@ -709,4 +709,16 @@
     }
   }
 
+  /**
+   * Check whether a server is an online server or a processing dead server.
+   * @param sn ServerName to check; null yields false.
+   * @return true if sn is online or is a dead server still being processed.
+   */
+  public synchronized boolean isOnlineOrProcessingDeadServer(ServerName sn) {
+    if (sn == null) {
+      return false;
+    }
+    return this.onlineServers.containsKey(sn)
+        || this.deadservers.isProcessingDeadServer(sn);
+  }
 }
Index: src/test/java/org/apache/hadoop/hbase/master/TestMasterStartup.java
===================================================================
--- src/test/java/org/apache/hadoop/hbase/master/TestMasterStartup.java (revision 0)
+++ src/test/java/org/apache/hadoop/hbase/master/TestMasterStartup.java (working copy)
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.LargeTests;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.util.JVMClusterUtil;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Tests of the master startup sequence.
+ */
+@Category(LargeTests.class)
+public class TestMasterStartup {
+  private static final Log LOG = LogFactory.getLog(TestMasterStartup.class);
+  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  /** Pause between two polls of the master state, in milliseconds. */
+  private static final long POLL_INTERVAL_MS = 10;
+  /** Maximum number of polls before a wait is given up. */
+  private static final int RETRY_LIMIT = 1000;
+
+  /**
+   * Test enabling ServerShutdownHandler before assigning META: the region
+   * server hosting ROOT is killed as soon as ROOT is assigned, and META is
+   * still expected to get assigned. See HBASE-8251.
+   *
+   * @throws Exception if the mini cluster cannot be started or shut down
+   */
+  @Test
+  public void testEnableSSHBeforeAssignMeta() throws Exception {
+    TEST_UTIL.startMiniDFSCluster(2);
+    TEST_UTIL.startMiniZKCluster();
+    Thread rootMonitor = null;
+    try {
+      // We need to manage the cluster startup ourselves.
+      final MiniHBaseCluster hbaseCluster = new MiniHBaseCluster(
+          TEST_UTIL.getConfiguration(), 0, 0);
+      final JVMClusterUtil.MasterThread master = hbaseCluster.startMaster();
+      hbaseCluster.startRegionServer();
+      hbaseCluster.startRegionServer();
+      final CountDownLatch finishSignal = new CountDownLatch(1);
+      final AtomicBoolean metaAssigned = new AtomicBoolean(false);
+
+      rootMonitor = new Thread() {
+        private AssignmentManager am = null;
+
+        @Override
+        public void run() {
+          try {
+            if (!waitUntilAMInitialized()
+                || !waitUntilAssigned(HRegionInfo.ROOT_REGIONINFO, "ROOT")) {
+              return;
+            }
+            LOG.info("ROOT has been opened successfully.");
+            ServerName sn = am.getRegionServerOfRegion(HRegionInfo.ROOT_REGIONINFO);
+            try {
+              LOG.info("Killing the regionserver hosts ROOT.");
+              hbaseCluster.killRegionServer(sn);
+            } catch (IOException e) {
+              LOG.warn(
+                "IOE happened when killing RegionServer : " + sn.getServerName(),
+                e);
+            }
+            metaAssigned.set(
+                waitUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO, "META"));
+          } finally {
+            // Release the test thread on success as well as on any failure.
+            finishSignal.countDown();
+          }
+        }
+
+        /**
+         * Polls until the master has created its AssignmentManager.
+         * @return false if the wait was interrupted or the retries ran out
+         */
+        private boolean waitUntilAMInitialized() {
+          for (int retried = 0; retried < RETRY_LIMIT; retried++) {
+            am = master.getMaster().getAssignmentManager();
+            if (am != null) {
+              return true;
+            }
+            if (!pause()) {
+              return false;
+            }
+          }
+          LOG.warn("Retry exhaust while waiting for AssignmentManager initialize.");
+          return false;
+        }
+
+        /**
+         * Polls until the given region is assigned.
+         * @param region region to wait for
+         * @param name region name used in log messages
+         * @return false if the wait was interrupted or the retries ran out
+         */
+        private boolean waitUntilAssigned(HRegionInfo region, String name) {
+          for (int retried = 0; retried < RETRY_LIMIT; retried++) {
+            if (am.isRegionAssigned(region)) {
+              return true;
+            }
+            LOG.info(name + " has not been assigned.");
+            if (!pause()) {
+              return false;
+            }
+          }
+          LOG.warn("Retry exhaust while waiting for " + name + " to be assigned.");
+          return false;
+        }
+
+        /**
+         * Sleeps for one poll interval.
+         * @return false if the thread got interrupted while sleeping
+         */
+        private boolean pause() {
+          try {
+            Thread.sleep(POLL_INTERVAL_MS);
+            return true;
+          } catch (InterruptedException e) {
+            LOG.warn("Got interrupted.", e);
+            // Keep the interrupt status so that the caller stops waiting.
+            Thread.currentThread().interrupt();
+            return false;
+          }
+        }
+      };
+      // Do not let a stuck monitor keep the JVM alive.
+      rootMonitor.setDaemon(true);
+      rootMonitor.start();
+      finishSignal.await(60, TimeUnit.SECONDS);
+      assertTrue("META region failed to open.", metaAssigned.get());
+    } finally {
+      if (rootMonitor != null) {
+        // Stop a monitor that is still polling before tearing down.
+        rootMonitor.interrupt();
+      }
+      TEST_UTIL.shutdownMiniCluster();
+    }
+  }
+}