Index: src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java (revision 1376369) +++ src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java (working copy) @@ -39,6 +39,7 @@ import java.util.NavigableSet; import java.util.Random; import java.util.UUID; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -71,6 +72,7 @@ import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.JVMClusterUtil; import org.apache.hadoop.hbase.util.RegionSplitter; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.Writables; @@ -84,6 +86,7 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MiniMRCluster; import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.KeeperException.NodeExistsException; import org.apache.zookeeper.ZooKeeper; @@ -1342,7 +1345,7 @@ */ public void expireMasterSession() throws Exception { HMaster master = hbaseCluster.getMaster(); - expireSession(master.getZooKeeper(), master); + expireSession(master.getZooKeeper(), false); } /** @@ -1352,16 +1355,22 @@ */ public void expireRegionServerSession(int index) throws Exception { HRegionServer rs = hbaseCluster.getRegionServer(index); - expireSession(rs.getZooKeeper(), rs); + expireSession(rs.getZooKeeper(), false); } - public void expireSession(ZooKeeperWatcher nodeZK, Server server) + /** + * Expire a ZooKeeper session as recommended in ZooKeeper documentation + * http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A4 + * There are issues when doing this: + * [1] http://www.mail-archive.com/dev@zookeeper.apache.org/msg01942.html + * [2] 
https://issues.apache.org/jira/browse/ZOOKEEPER-1105 + * + * @param nodeZK - the ZK watcher whose session should be expired + * @param checkStatus - true to check that we can create an HTable with the + * current configuration. + */ + public void expireSession(ZooKeeperWatcher nodeZK, boolean checkStatus) throws Exception { - expireSession(nodeZK, server, false); - } - - public void expireSession(ZooKeeperWatcher nodeZK, Server server, - boolean checkStatus) throws Exception { Configuration c = new Configuration(this.conf); String quorumServers = ZKConfig.getZKQuorumServersString(c); int sessionTimeout = 500; @@ -1369,14 +1378,29 @@ byte[] password = zk.getSessionPasswd(); long sessionID = zk.getSessionId(); + // Expiry seems to be asynchronous (see comment from P. Hunt in [1]), + // so we create a first watcher to be sure that the + // event was sent. We expect that if our watcher receives the event + // other watchers on the same machine will get it as well. + // When we ask to close the connection, ZK does not close it before + // we receive all the events, so we don't have to capture the event, just + // closing the connection should be enough. + ZooKeeper monitor = new ZooKeeper(quorumServers, + 1000, new org.apache.zookeeper.Watcher(){ + @Override + public void process(WatchedEvent watchedEvent) { + LOG.info("Monitor ZKW received event="+watchedEvent); + } + } , sessionID, password); + + // Making it expire ZooKeeper newZK = new ZooKeeper(quorumServers, sessionTimeout, EmptyWatcher.instance, sessionID, password); newZK.close(); - final long sleep = 7000; // 7s seems enough to manage the timeout - LOG.info("ZK Closed Session 0x" + Long.toHexString(sessionID) + - "; sleeping=" + sleep); + LOG.info("ZK Closed Session 0x" + Long.toHexString(sessionID)); - Thread.sleep(sleep); + // Now closing & waiting to be sure that the clients get it. 
+ monitor.close(); if (checkStatus) { new HTable(new Configuration(conf), HConstants.META_TABLE_NAME).close(); @@ -1545,7 +1569,7 @@ * Make sure that at least the specified number of region servers * are running * @param num minimum number of region servers that should be running - * @return True if we started some servers + * @return true if we started some servers * @throws IOException */ public boolean ensureSomeRegionServersAvailable(final int num) @@ -1561,8 +1585,33 @@ } + /** + * Make sure that at least the specified number of region servers + * are running. We don't count the ones that are currently stopping or are + * stopped. + * @param num minimum number of region servers that should be running + * @return true if we started some servers + * @throws IOException + */ + public boolean ensureSomeNonStoppedRegionServersAvailable(final int num) + throws IOException { + boolean startedServer = ensureSomeRegionServersAvailable(num); + for (JVMClusterUtil.RegionServerThread rst : + hbaseCluster.getRegionServerThreads()) { + HRegionServer hrs = rst.getRegionServer(); + if (hrs.isStopping() || hrs.isStopped()) { + LOG.info("A region server is stopped or stopping:"+hrs); + LOG.info("Started new server=" + hbaseCluster.startRegionServer()); + startedServer = true; + } + } + + return startedServer; + } + + /** * This method clones the passed c configuration setting a new * user into the clone. Use it getting new instances of FileSystem. 
Only Index: src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java (revision 1376369) +++ src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java (working copy) @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.regionserver; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotSame; import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertFalse; @@ -83,7 +84,7 @@ } @Before public void setup() throws IOException { - TESTING_UTIL.ensureSomeRegionServersAvailable(NB_SERVERS); + TESTING_UTIL.ensureSomeNonStoppedRegionServersAvailable(NB_SERVERS); this.admin = new HBaseAdmin(TESTING_UTIL.getConfiguration()); this.cluster = TESTING_UTIL.getMiniHBaseCluster(); } @@ -651,7 +652,10 @@ HRegionServer tableRegionServer = cluster.getRegionServer(tableRegionIndex); if (metaRegionServer.getServerName().equals(tableRegionServer.getServerName())) { HRegionServer hrs = getOtherRegionServer(cluster, metaRegionServer); - LOG.info("Moving " + hri.getRegionNameAsString() + " to " + + assertNotNull(hrs); + assertNotNull(hri); + LOG. 
+ info("Moving " + hri.getRegionNameAsString() + " to " + hrs.getServerName() + "; metaServerIndex=" + metaServerIndex); admin.move(hri.getEncodedNameAsBytes(), Bytes.toBytes(hrs.getServerName().toString())); Index: src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java (revision 1376369) +++ src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java (working copy) @@ -26,6 +26,8 @@ import static org.junit.Assert.fail; import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; import java.util.concurrent.ExecutorService; import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -40,6 +42,7 @@ import org.apache.hadoop.hbase.client.HConnectionManager; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.zookeeper.ZKConfig; @@ -97,6 +100,15 @@ TEST_UTIL.ensureSomeRegionServersAvailable(2); } + private ZooKeeperWatcher getZooKeeperWatcher(HConnection c) throws + NoSuchMethodException, InvocationTargetException, IllegalAccessException { + + Method getterZK = c.getClass().getMethod("getKeepAliveZooKeeperWatcher"); + getterZK.setAccessible(true); + + return (ZooKeeperWatcher) getterZK.invoke(c); + } + /** * See HBASE-1232 and http://wiki.apache.org/hadoop/ZooKeeper/FAQ#4. 
* @throws IOException @@ -111,7 +123,7 @@ new HTable(c, HConstants.META_TABLE_NAME).close(); HConnection connection = HConnectionManager.getConnection(c); ZooKeeperWatcher connectionZK = connection.getZooKeeperWatcher(); - TEST_UTIL.expireSession(connectionZK, null); + TEST_UTIL.expireSession(connectionZK, false); // provoke session expiration by doing something with ZK ZKUtil.dump(connectionZK); @@ -350,6 +362,21 @@ ZKUtil.getChildDataAndWatchForNewChildren(zkw, "/wrongNode"); } + /** + * Master recovery when the znode already exists. Internally, this + * test differs from {@link #testMasterSessionExpired} because here + * the master znode will exist in ZK. + */ + @Test(timeout=20000) + public void testMasterZKSessionRecoveryFailure() throws Exception { + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + HMaster m = cluster.getMaster(); + m.abort("Test recovery from zk session expired", + new KeeperException.SessionExpiredException()); + assertFalse(m.isStopped()); + testSanity(); + } + @org.junit.Rule public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu = new org.apache.hadoop.hbase.ResourceCheckerJUnitRule(); Index: src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java (revision 1376369) +++ src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java (working copy) @@ -87,6 +87,8 @@ LOG.info("Starting cluster"); conf = HBaseConfiguration.create(); conf.getLong("hbase.splitlog.max.resubmit", 0); + // Make the failure test faster + conf.setInt("zookeeper.recovery.retry", 0); TEST_UTIL = new HBaseTestingUtility(conf); TEST_UTIL.startMiniCluster(NUM_MASTERS, num_rs); cluster = TEST_UTIL.getHBaseCluster(); @@ -245,7 +247,7 @@ slm.enqueueSplitTask(logfiles[0].getPath().toString(), batch); //waitForCounter but for one of the 2 counters long curt = 
System.currentTimeMillis(); - long waitTime = 30000; + long waitTime = 80000; long endt = curt + waitTime; while (curt < endt) { if ((tot_wkr_task_resigned.get() + tot_wkr_task_err.get() + Index: src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java (revision 1376369) +++ src/test/java/org/apache/hadoop/hbase/master/TestMasterZKSessionRecovery.java (working copy) @@ -80,40 +80,6 @@ } /** - * Negative test of master recovery from zk session expiry. - *

- * Starts with one master. Fakes the master zk session expired. - * The master should be able to come up if he is able to create - * the node as active master. - * @throws Exception - */ - @Test(timeout=10000) - public void testMasterZKSessionRecoveryFailure() throws Exception { - MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); - HMaster m = cluster.getMaster(); - m.abort("Test recovery from zk session expired", - new KeeperException.SessionExpiredException()); - assertFalse(m.isStopped()); - } - - /** - * Positive test of master recovery from zk session expiry. - *

- * Starts with one master. Closes the master zk session. - * Ensures the master can recover the expired zk session. - * @throws Exception - */ - @Test(timeout=60000) - public void testMasterZKSessionRecoverySuccess() throws Exception { - MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); - HMaster m = cluster.getMaster(); - m.getZooKeeperWatcher().close(); - m.abort("Test recovery from zk session expired", - new KeeperException.SessionExpiredException()); - assertFalse(m.isStopped()); - } - - /** * Tests that the master does not call retainAssignment after recovery from * expired zookeeper session. Without the HBASE-6046 fix master always tries * to assign all the user regions by calling retainAssignment. Index: src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java (revision 1376369) +++ src/test/java/org/apache/hadoop/hbase/replication/TestReplication.java (working copy) @@ -100,6 +100,8 @@ conf1.setLong("replication.source.sleepforretries", 100); conf1.setInt("hbase.regionserver.maxlogs", 10); conf1.setLong("hbase.master.logcleaner.ttl", 10); + conf1.setInt("zookeeper.recovery.retry", 1); + conf1.setInt("zookeeper.recovery.retry.intervalmill", 10); conf1.setBoolean(HConstants.REPLICATION_ENABLE_KEY, true); conf1.setBoolean("dfs.support.append", true); conf1.setLong(HConstants.THREAD_WAKE_FREQUENCY, 100); @@ -757,9 +759,11 @@ int lastCount = 0; + final long start = System.currentTimeMillis(); for (int i = 0; i < NB_RETRIES; i++) { if (i==NB_RETRIES-1) { - fail("Waited too much time for queueFailover replication"); + fail("Waited too much time for queueFailover replication. 
" + + "Waited "+(System.currentTimeMillis() - start)+"ms."); } Scan scan2 = new Scan(); ResultScanner scanner2 = htable2.getScanner(scan2); Index: src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java (revision 1376369) +++ src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java (working copy) @@ -69,7 +69,9 @@ // An identifier of this process in the cluster private final String identifier; private final byte[] id; - private int retryIntervalMillis; + private Watcher watcher; + private int sessionTimeout; + private String quorumServers; // The metadata attached to each piece of data has the // format: @@ -84,20 +86,33 @@ private static final int ID_LENGTH_OFFSET = MAGIC_SIZE; private static final int ID_LENGTH_SIZE = Bytes.SIZEOF_INT; - public RecoverableZooKeeper(String quorumServers, int seesionTimeout, + public RecoverableZooKeeper(String quorumServers, int sessionTimeout, Watcher watcher, int maxRetries, int retryIntervalMillis) throws IOException { - this.zk = new ZooKeeper(quorumServers, seesionTimeout, watcher); + this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher); this.retryCounterFactory = new RetryCounterFactory(maxRetries, retryIntervalMillis); - this.retryIntervalMillis = retryIntervalMillis; // the identifier = processID@hostName this.identifier = ManagementFactory.getRuntimeMXBean().getName(); LOG.info("The identifier of this process is " + identifier); this.id = Bytes.toBytes(identifier); + this.watcher = watcher; + this.sessionTimeout = sessionTimeout; + this.quorumServers = quorumServers; } + public void reconnectAfterExpiration() + throws IOException, InterruptedException { + LOG.info("Closing dead ZooKeeper connection, session" + + " was: 0x"+Long.toHexString(zk.getSessionId())); + zk.close(); + this.zk = new ZooKeeper(this.quorumServers, + 
this.sessionTimeout, this.watcher); + LOG.info("Recreated a ZooKeeper, session" + + " is: 0x"+Long.toHexString(zk.getSessionId())); + } + /** * delete is an idempotent operation. Retry before throwing exception. * This function will not throw NoNodeException if the path does not @@ -124,6 +139,7 @@ throw e; case CONNECTIONLOSS: + case SESSIONEXPIRED: case OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "delete"); break; @@ -151,6 +167,7 @@ } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: + case SESSIONEXPIRED: case OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "exists"); break; @@ -177,6 +194,7 @@ } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: + case SESSIONEXPIRED: case OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "exists"); break; @@ -213,6 +231,7 @@ } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: + case SESSIONEXPIRED: case OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "getChildren"); break; @@ -239,6 +258,7 @@ } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: + case SESSIONEXPIRED: case OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "getChildren"); break; @@ -266,6 +286,7 @@ } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: + case SESSIONEXPIRED: case OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "getData"); break; @@ -293,6 +314,7 @@ } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: + case SESSIONEXPIRED: case OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "getData"); break; @@ -322,6 +344,7 @@ } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: + case SESSIONEXPIRED: case OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "setData"); break; @@ -418,6 +441,7 @@ throw e; case CONNECTIONLOSS: + case SESSIONEXPIRED: case OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "create"); break; @@ -452,6 +476,7 @@ } catch (KeeperException e) { switch (e.code()) { case CONNECTIONLOSS: + case SESSIONEXPIRED: case 
OPERATIONTIMEOUT: retryOrThrow(retryCounter, e, "create"); break; Index: src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (revision 1376369) +++ src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (working copy) @@ -243,6 +243,10 @@ return recoverableZooKeeper; } + public void reconnectAfterExpiration() throws IOException, InterruptedException { + recoverableZooKeeper.reconnectAfterExpiration(); + } + /** * Get the quorum address of this instance. * @return quorum string of this zookeeper connection instance Index: src/main/java/org/apache/hadoop/hbase/master/HMaster.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/HMaster.java (revision 1376369) +++ src/main/java/org/apache/hadoop/hbase/master/HMaster.java (working copy) @@ -1464,8 +1464,7 @@ private boolean tryRecoveringExpiredZKSession() throws InterruptedException, IOException, KeeperException, ExecutionException { - this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" - + this.serverName.getPort(), this, true); + this.zooKeeper.reconnectAfterExpiration(); Callable callable = new Callable () { public Boolean call() throws InterruptedException, Index: src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java (revision 1376369) +++ src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java (working copy) @@ -122,97 +122,94 @@ * * This also makes sure that we are watching the master znode so will be * notified if another master dies. 
- * @param startupStatus + * @param startupStatus * @return True if no issue becoming active master else false if another * master was running or if some other problem (zookeeper, stop flag has been * set on this Master) */ boolean blockUntilBecomingActiveMaster(MonitoredTask startupStatus, - ClusterStatusTracker clusterStatusTracker) { - startupStatus.setStatus("Trying to register in ZK as active master"); - boolean cleanSetOfActiveMaster = true; - // Try to become the active master, watch if there is another master. - // Write out our ServerName as versioned bytes. - try { - String backupZNode = ZKUtil.joinZNode( + ClusterStatusTracker clusterStatusTracker) { + while (true) { + startupStatus.setStatus("Trying to register in ZK as active master"); + // Try to become the active master, watch if there is another master. + // Write out our ServerName as versioned bytes. + try { + String backupZNode = ZKUtil.joinZNode( this.watcher.backupMasterAddressesZNode, this.sn.toString()); - if (ZKUtil.createEphemeralNodeAndWatch(this.watcher, + if (ZKUtil.createEphemeralNodeAndWatch(this.watcher, this.watcher.masterAddressZNode, this.sn.getVersionedBytes())) { - // If we were a backup master before, delete our ZNode from the backup - // master directory since we are the active now - LOG.info("Deleting ZNode for " + backupZNode + - " from backup master directory"); - ZKUtil.deleteNodeFailSilent(this.watcher, backupZNode); + // If we were a backup master before, delete our ZNode from the backup + // master directory since we are the active now + LOG.info("Deleting ZNode for " + backupZNode + + " from backup master directory"); + ZKUtil.deleteNodeFailSilent(this.watcher, backupZNode); - // We are the master, return - startupStatus.setStatus("Successfully registered as active master."); + // We are the master, return + startupStatus.setStatus("Successfully registered as active master."); + this.clusterHasActiveMaster.set(true); + LOG.info("Master=" + this.sn); + return true; + } + 
+ // There is another active master running elsewhere or this is a restart + // and the master ephemeral node has not expired yet. this.clusterHasActiveMaster.set(true); - LOG.info("Master=" + this.sn); - return cleanSetOfActiveMaster; - } - cleanSetOfActiveMaster = false; - // There is another active master running elsewhere or this is a restart - // and the master ephemeral node has not expired yet. - this.clusterHasActiveMaster.set(true); - - /* - * Add a ZNode for ourselves in the backup master directory since we are - * not the active master. - * - * If we become the active master later, ActiveMasterManager will delete - * this node explicitly. If we crash before then, ZooKeeper will delete - * this node for us since it is ephemeral. - */ - LOG.info("Adding ZNode for " + backupZNode + - " in backup master directory"); - ZKUtil.createEphemeralNodeAndWatch(this.watcher, backupZNode, + /* + * Add a ZNode for ourselves in the backup master directory since we are + * not the active master. + * + * If we become the active master later, ActiveMasterManager will delete + * this node explicitly. If we crash before then, ZooKeeper will delete + * this node for us since it is ephemeral. + */ + LOG.info("Adding ZNode for " + backupZNode + + " in backup master directory"); + ZKUtil.createEphemeralNodeAndWatch(this.watcher, backupZNode, this.sn.getVersionedBytes()); - String msg; - byte [] bytes = - ZKUtil.getDataAndWatch(this.watcher, this.watcher.masterAddressZNode); - if (bytes == null) { - msg = ("A master was detected, but went down before its address " + - "could be read. Attempting to become the next active master"); - } else { - ServerName currentMaster = ServerName.parseVersionedServerName(bytes); - if (ServerName.isSameHostnameAndPort(currentMaster, this.sn)) { - msg = ("Current master has this master's address, " + - currentMaster + "; master was restarted? Waiting on znode " + - "to expire..."); - // Hurry along the expiration of the znode. 
- ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode); + String msg; + byte [] bytes = + ZKUtil.getDataAndWatch(this.watcher, this.watcher.masterAddressZNode); + if (bytes == null) { + msg = ("A master was detected, but went down before its address " + + "could be read. Attempting to become the next active master"); } else { - msg = "Another master is the active master, " + currentMaster + - "; waiting to become the next active master"; + ServerName currentMaster = ServerName.parseVersionedServerName(bytes); + if (ServerName.isSameHostnameAndPort(currentMaster, this.sn)) { + msg = ("Current master has this master's address, " + + currentMaster + "; master was restarted? Deleting node."); + // Hurry along the expiration of the znode. + ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode); + } else { + msg = "Another master is the active master, " + currentMaster + + "; waiting to become the next active master"; + } } + LOG.info(msg); + startupStatus.setStatus(msg); + } catch (KeeperException ke) { + master.abort("Received an unexpected KeeperException, aborting", ke); + return false; } - LOG.info(msg); - startupStatus.setStatus(msg); - } catch (KeeperException ke) { - master.abort("Received an unexpected KeeperException, aborting", ke); - return false; - } - synchronized (this.clusterHasActiveMaster) { - while (this.clusterHasActiveMaster.get() && !this.master.isStopped()) { - try { - this.clusterHasActiveMaster.wait(); - } catch (InterruptedException e) { - // We expect to be interrupted when a master dies, will fall out if so - LOG.debug("Interrupted waiting for master to die", e); + synchronized (this.clusterHasActiveMaster) { + while (this.clusterHasActiveMaster.get() && !this.master.isStopped()) { + try { + this.clusterHasActiveMaster.wait(); + } catch (InterruptedException e) { + // We expect to be interrupted when a master dies, will fall out if so + LOG.debug("Interrupted waiting for master to die", e); + } } + if 
(!clusterStatusTracker.isClusterUp()) { + this.master.stop("Cluster went down before this master became active"); + } + if (this.master.isStopped()) { + return false; + } + // Try to become active master again now that there is no active master } - if (!clusterStatusTracker.isClusterUp()) { - this.master.stop("Cluster went down before this master became active"); - } - if (this.master.isStopped()) { - return cleanSetOfActiveMaster; - } - // Try to become active master again now that there is no active master - cleanSetOfActiveMaster = blockUntilBecomingActiveMaster(startupStatus,clusterStatusTracker); } - return cleanSetOfActiveMaster; } /**