From 7c3a580c9d658a78e7ab91d5252c6e5a67aeedbc Mon Sep 17 00:00:00 2001 From: Nick Dimiduk Date: Wed, 12 Nov 2014 15:24:21 -0800 Subject: [PATCH] HBASE-12467 Master joins cluster but never completes initialization --- .../org/apache/hadoop/hbase/master/HMaster.java | 68 +++++++++++++++++++--- 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 3437f34..d8f6fd2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -33,6 +33,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import javax.servlet.ServletException; @@ -109,15 +110,7 @@ import org.apache.hadoop.hbase.regionserver.RSRpcServices; import org.apache.hadoop.hbase.regionserver.RegionSplitPolicy; import org.apache.hadoop.hbase.replication.regionserver.Replication; import org.apache.hadoop.hbase.security.UserProvider; -import org.apache.hadoop.hbase.util.Addressing; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.CompressionTest; -import org.apache.hadoop.hbase.util.FSUtils; -import org.apache.hadoop.hbase.util.HFileArchiveUtil; -import org.apache.hadoop.hbase.util.Pair; -import org.apache.hadoop.hbase.util.Threads; -import org.apache.hadoop.hbase.util.VersionInfo; -import org.apache.hadoop.hbase.util.ZKDataMigrator; +import org.apache.hadoop.hbase.util.*; import org.apache.hadoop.hbase.zookeeper.DrainingServerTracker; import org.apache.hadoop.hbase.zookeeper.LoadBalancerTracker; import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker; @@ -158,6 +151,59 @@ import com.google.protobuf.Service; public class HMaster extends HRegionServer implements MasterServices, Server { 
private static final Log LOG = LogFactory.getLog(HMaster.class.getName());
 
+  /**
+   * Protection against zombie master. Started once Master accepts active responsibility and
+   * starts taking over responsibilities. Allows a finite time window before giving up ownership.
+   */
+  private static class InitializationMonitor extends HasThread {
+    /** The amount of time in milliseconds to sleep before checking initialization status. */
+    public static final String TIMEOUT_KEY = "hbase.master.initializationmonitor.timeout";
+    public static final long TIMEOUT_DEFAULT = TimeUnit.MILLISECONDS.convert(15, TimeUnit.MINUTES);
+
+    /**
+     * When the timeout expires and initialization has not completed, call
+     * {@link System#exit(int)} when true, do nothing otherwise.
+     */
+    public static final String HALT_KEY = "hbase.master.initializationmonitor.haltontimeout";
+    public static final boolean HALT_DEFAULT = false;
+
+    private final HMaster master;
+    private final long timeout;
+    private final boolean haltOnTimeout;
+
+    /** Creates a daemon monitor that watches the {@link HMaster#isInitialized()} state. */
+    InitializationMonitor(HMaster master) {
+      super("MasterInitializationMonitor");
+      this.master = master;
+      this.timeout = master.getConfiguration().getLong(TIMEOUT_KEY, TIMEOUT_DEFAULT);
+      this.haltOnTimeout = master.getConfiguration().getBoolean(HALT_KEY, HALT_DEFAULT);
+      this.setDaemon(true);
+    }
+
+    @Override
+    public void run() {
+      try {
+        while (!master.isStopped() && master.isActiveMaster()) {
+          Thread.sleep(timeout);
+          if (master.isInitialized()) {
+            LOG.debug("Initialization completed within allotted tolerance. Monitor exiting.");
+            return; // initialization finished; nothing left to watch
+          }
+          LOG.error("Master failed to complete initialization after " + timeout + "ms. Please"
+              + " consider submitting a bug report including a thread dump of this process.");
+          if (haltOnTimeout) {
+            LOG.error("Zombie Master exiting.");
+            System.exit(-1);
+          } else {
+            LOG.error("Zombie Master?");
+          }
+        }
+      } catch (InterruptedException ie) {
+        LOG.trace("InitMonitor thread interrupted. Exiting.");
+      }
+    }
+  }
+
   // MASTER is name of the webapp and the attribute name used stuffing this
   //instance into web context.
   public static final String MASTER = "master";
@@ -511,6 +557,8 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
       throws IOException, InterruptedException, KeeperException, CoordinatedStateException {
 
     isActiveMaster = true;
+    InitializationMonitor zombieDetector = new InitializationMonitor(this);
+    zombieDetector.start();
 
     /*
      * We are active master now... go initialize components we need to run.
@@ -680,6 +728,8 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
         LOG.error("Coprocessor postStartMaster() hook failed", ioe);
       }
     }
+
+    zombieDetector.interrupt();
   }
 
   /**
-- 
1.9.3 (Apple Git-50)