diff --git src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java index f317524..81e6916 100644 --- src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java +++ src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java @@ -1756,12 +1756,15 @@ public class HConnectionManager { } public void close() { + final String zooKeeperName = + ( zooKeeper != null ? this.zooKeeper.toString() : "null ZooKeeper"); + if (managed) { HConnectionManager.deleteConnection((HConnection)this, stopProxy, false); } else { close(true); } - LOG.debug("The connection to " + this.zooKeeper + " has been closed."); + LOG.debug("The connection to " + zooKeeperName + " has been closed."); } /** @@ -1779,8 +1782,10 @@ public class HConnectionManager { protected void finalize() throws Throwable { // Pretend as if we are about to release the last remaining reference refCount = 1; + final String zooKeeperName = + ( zooKeeper != null ? this.zooKeeper.toString() : "null ZooKeeper"); close(); - LOG.debug("The connection to " + this.zooKeeper + LOG.debug("The connection to " + zooKeeperName + " was closed by the finalize method."); } diff --git src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index f7ef653..936913e 100644 --- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -410,7 +410,8 @@ public class AssignmentManager extends ZooKeeperListener { synchronized(this.regionsInTransition) { while (!this.master.isStopped() && this.regionsInTransition.containsKey(hri.getEncodedName())) { - this.regionsInTransition.wait(); + // We expect a notify, but by security we set a timout + this.regionsInTransition.wait(100); } } return intransistion; @@ -1825,7 +1826,10 @@ public class AssignmentManager extends ZooKeeperListener { throws InterruptedException { synchronized(regions) { while(!regions.containsKey(regionInfo)) { - regions.wait(); + // We should receive a notification, but it's + // better to have a timeout to recheck the condition here: + // it lowers the impact of a race condition if any + regions.wait(100); } } } diff --git src/main/java/org/apache/hadoop/hbase/master/HMaster.java src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 73a6343..4c4866e 100644 --- src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -72,7 +72,6 @@ import org.apache.hadoop.hbase.master.handler.TableAddFamilyHandler; import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler; import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler; import org.apache.hadoop.hbase.master.metrics.MasterMetrics; -import org.apache.hadoop.hbase.master.RegionPlan; import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer; import org.apache.hadoop.hbase.monitoring.MonitoredTask; import org.apache.hadoop.hbase.monitoring.TaskMonitor; @@ -392,11 +391,12 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { ", cluster-up flag was=" + wasUp); } + + // Check if we should stop every second. + private Sleeper stopSleeper = new Sleeper(1000, this); private void loop() { - // Check if we should stop every second. - Sleeper sleeper = new Sleeper(1000, this); while (!this.stopped) { - sleeper.sleep(); + stopSleeper.sleep(); } } @@ -1363,6 +1363,10 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server { public void stop(final String why) { LOG.info(why); this.stopped = true; + + // We wake up the stopSleeper to stop immediately + stopSleeper.skipSleepCycle(); + // If we are a backup master, we need to interrupt wait if (this.activeMasterManager != null) { synchronized (this.activeMasterManager.clusterHasActiveMaster) { diff --git src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 1d9e48d..a8f38b3 100644 --- src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -954,6 +954,8 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, stop("Failed initialization"); throw convertThrowableToIOE(cleanup(e, "Failed init"), "Region server startup failed"); + } finally { + sleeper.skipSleepCycle(); } } @@ -1539,9 +1541,14 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, public void stop(final String msg) { this.stopped = true; LOG.info("STOPPED: " + msg); - synchronized (this) { - // Wakes run() if it is sleeping - notifyAll(); // FindBugs NN_NAKED_NOTIFY + // Wakes run() if it is sleeping + //sleeper.skipSleepCycle(); + //will be uncommented later, see discussion in jira 4798 + } + + public void waitForServerOnline(){ + while (!isOnline() && !isStopped()){ + sleeper.sleep(); } } @@ -1707,10 +1714,17 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, */ private ServerName getMaster() { ServerName masterServerName = null; + long previousLogTime = 0; while ((masterServerName = this.masterAddressManager.getMasterAddress()) == null) { if (!keepLooping()) return null; - LOG.debug("No master found; retry"); - sleeper.sleep(); + if (System.currentTimeMillis() > (previousLogTime+1000)){ + LOG.debug("No master found; retry"); + previousLogTime = System.currentTimeMillis(); + } + try { + Thread.sleep(100); + } catch (InterruptedException ignored) { + } } InetSocketAddress isa = new InetSocketAddress(masterServerName.getHostname(), masterServerName.getPort()); @@ -1729,11 +1743,20 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, e = e instanceof RemoteException ? ((RemoteException)e).unwrapRemoteException() : e; if (e instanceof ServerNotRunningYetException) { - LOG.info("Master isn't available yet, retrying"); + if (System.currentTimeMillis() > (previousLogTime+1000)){ + LOG.info("Master isn't available yet, retrying"); + previousLogTime = System.currentTimeMillis(); + } } else { - LOG.warn("Unable to connect to master. Retrying. Error was:", e); + if (System.currentTimeMillis() > (previousLogTime + 1000)) { + LOG.warn("Unable to connect to master. Retrying. Error was:", e); + previousLogTime = System.currentTimeMillis(); + } + } + try { + Thread.sleep(200); + } catch (InterruptedException ignored) { } - sleeper.sleep(); } } LOG.info("Connected to master at " + isa); diff --git src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java index c7d4c6b..6056f73 100644 --- src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java +++ src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java @@ -62,14 +62,7 @@ public class JVMClusterUtil { // the HRS#run method. HRS#init can fail for whatever region. In those // cases, we'll jump out of the run without setting online flag. Check // stopRequested so we don't wait here a flag that will never be flipped. - while (!this.regionServer.isOnline() && - !this.regionServer.isStopped()) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - // continue waiting - } - } + regionServer.waitForServerOnline(); } } @@ -118,22 +111,6 @@ public class JVMClusterUtil { public HMaster getMaster() { return this.master; } - - /** - * Block until the master has come online, indicating it is ready - * to be used. - */ - public void waitForServerOnline() { - // The server is marked online after init begins but before race to become - // the active master. - while (!this.master.isMasterRunning() && !this.master.isStopped()) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - // continue waiting - } - } - } } /** @@ -165,20 +142,49 @@ public class JVMClusterUtil { return new JVMClusterUtil.MasterThread(server, index); } + private static JVMClusterUtil.MasterThread findActiveMaster( + List masters) { + for (JVMClusterUtil.MasterThread t : masters) { + if (t.master.isActiveMaster()) { + return t; + } + } + + return null; + } + /** - * Start the cluster. Waits until there is a primary master and returns its - * address. + * Start the cluster. Waits until there is a primary master initialized + * and returns its address. * @param masters * @param regionservers * @return Address to use contacting primary master. */ public static String startup(final List masters, final List regionservers) throws IOException { - if (masters != null) { - for (JVMClusterUtil.MasterThread t : masters) { - t.start(); + + if (masters == null || masters.isEmpty()) { + return null; + } + + for (JVMClusterUtil.MasterThread t : masters) { + t.start(); + } + + // Wait for an active master + // having an active master before starting the region threads allows + // then to succeed on their connection to master + long startTime = System.currentTimeMillis(); + while (findActiveMaster(masters) == null) { + try { + Thread.sleep(100); + } catch (InterruptedException ignored) { + } + if (System.currentTimeMillis() > startTime + 30000) { + throw new RuntimeException("Master not active after 30 seconds"); } } + if (regionservers != null) { for (JVMClusterUtil.RegionServerThread t: regionservers) { HRegionServer hrs = t.getRegionServer(); @@ -187,19 +193,21 @@ public class JVMClusterUtil { t.start(); } } - if (masters == null || masters.isEmpty()) { - return null; - } - // Wait for an active master + + // Wait for an active master to be initialized (implies being master) + // with this, when we return the cluster is complete + startTime = System.currentTimeMillis(); while (true) { - for (JVMClusterUtil.MasterThread t : masters) { - if (t.master.isActiveMaster()) { - return t.master.getServerName().toString(); - } + JVMClusterUtil.MasterThread t = findActiveMaster(masters); + if (t != null && t.master.isInitialized()) { + return t.master.getServerName().toString(); + } + if (System.currentTimeMillis() > startTime + 200000) { + throw new RuntimeException("Master not initialized after 200 seconds"); } try { - Thread.sleep(1000); - } catch(InterruptedException e) { + Thread.sleep(100); + } catch (InterruptedException ignored) { // Keep waiting } } diff --git src/main/java/org/apache/hadoop/hbase/util/Sleeper.java src/main/java/org/apache/hadoop/hbase/util/Sleeper.java index 011dcbe..f153f7f 100644 --- src/main/java/org/apache/hadoop/hbase/util/Sleeper.java +++ src/main/java/org/apache/hadoop/hbase/util/Sleeper.java @@ -62,7 +62,7 @@ public class Sleeper { public void skipSleepCycle() { synchronized (sleepLock) { triggerWake = true; - sleepLock.notify(); + sleepLock.notifyAll(); } } diff --git src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java index 19dcffe..c6e607e 100644 --- src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java +++ src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java @@ -127,11 +127,9 @@ public abstract class ZooKeeperNodeTracker extends ZooKeeperListener { } } while (!this.stopped && (notimeout || remaining > 0) && this.data == null) { - if (notimeout) { - wait(); - continue; - } - wait(remaining); + // We expect a notification; but we wait with a + // a timeout to lower the impact of a race condition if any + wait(100); remaining = timeout - (System.currentTimeMillis() - startTime); } return this.data; diff --git src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java index d1b7647..42eefe1 100644 --- src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java +++ src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java @@ -1241,11 +1241,12 @@ public class HBaseTestingUtility { expireSession(nodeZK, server, false); } + public static final int zooKeeperTimeoutForTimeoutTest = 1000; public void expireSession(ZooKeeperWatcher nodeZK, Server server, boolean checkStatus) throws Exception { Configuration c = new Configuration(this.conf); String quorumServers = ZKConfig.getZKQuorumServersString(c); - int sessionTimeout = 5 * 1000; // 5 seconds + int sessionTimeout = c.getInt("zookeeper.session.timeout", 5000); ZooKeeper zk = nodeZK.getRecoverableZooKeeper().getZooKeeper(); byte[] password = zk.getSessionPasswd(); long sessionID = zk.getSessionId(); diff --git src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 56b1818..b36d12c 100644 --- src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -293,7 +293,6 @@ public class MiniHBaseCluster { try { t = hbaseCluster.addMaster(c, hbaseCluster.getMasters().size(), user); t.start(); - t.waitForServerOnline(); } catch (InterruptedException ie) { throw new IOException("Interrupted adding master to cluster", ie); } @@ -382,7 +381,7 @@ public class MiniHBaseCluster { return true; } } - Thread.sleep(200); + Thread.sleep(100); } return false; } diff --git src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java index 317336f..7f7d20d 100644 --- src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java +++ src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java @@ -212,7 +212,7 @@ public class TestRegionRebalancing { // while (!cluster.getMaster().allRegionsAssigned()) { LOG.debug("Waiting for there to be 22 regions, but there are " + getRegionCount() + " right now."); try { - Thread.sleep(1000); + Thread.sleep(200); } catch (InterruptedException e) {} } } diff --git src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java index 3f84744..ec29ff6 100644 --- src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java +++ src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java @@ -66,6 +66,10 @@ public class TestZooKeeper { public static void setUpBeforeClass() throws Exception { // Test we can first start the ZK cluster by itself TEST_UTIL.startMiniZKCluster(); + TEST_UTIL.getConfiguration().setInt( "zookeeper.session.timeout", + HBaseTestingUtility.zooKeeperTimeoutForTimeoutTest); + TEST_UTIL.getConfiguration().setInt("hbase.zookeeper.property.tickTime", + HBaseTestingUtility.zooKeeperTimeoutForTimeoutTest/10); TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true); TEST_UTIL.startMiniCluster(2); } @@ -98,7 +102,7 @@ public class TestZooKeeper { Configuration c = new Configuration(TEST_UTIL.getConfiguration()); new HTable(c, HConstants.META_TABLE_NAME); String quorumServers = ZKConfig.getZKQuorumServersString(c); - int sessionTimeout = 5 * 1000; // 5 seconds + int sessionTimeout = 1 * 1000; HConnection connection = HConnectionManager.getConnection(c); ZooKeeperWatcher connectionZK = connection.getZooKeeperWatcher(); long sessionID = connectionZK.getRecoverableZooKeeper().getSessionId(); diff --git src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTrackerOnCluster.java src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTrackerOnCluster.java index 31e8fde..4952519 100644 --- src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTrackerOnCluster.java +++ src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTrackerOnCluster.java @@ -23,6 +23,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.*; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; +import org.apache.zookeeper.KeeperException; import org.junit.Test; import org.junit.experimental.categories.Category; @@ -37,7 +38,7 @@ public class TestCatalogTrackerOnCluster { /** * @throws Exception - * @see https://issues.apache.org/jira/browse/HBASE-3445 + * @see {https://issues.apache.org/jira/browse/HBASE-3445} */ @Test public void testBadOriginalRootLocation() throws Exception { UTIL.getConfiguration().setInt("ipc.socket.timeout", 3000); @@ -61,9 +62,15 @@ public class TestCatalogTrackerOnCluster { ServerName nonsense = new ServerName("example.org", 1234, System.currentTimeMillis()); RootLocationEditor.setRootLocation(zookeeper, nonsense); + // Bring back up the hbase cluster. See if it can deal with nonsense root - // location. + // location. The cluster should start and be fully available. UTIL.startMiniHBaseCluster(1, 1); + + // if we can create a table, it's a good sign that it's working + UTIL.createTable( + getClass().getSimpleName().getBytes(), "family".getBytes()); + UTIL.shutdownMiniCluster(); } } diff --git src/test/java/org/apache/hadoop/hbase/master/TestRestartCluster.java src/test/java/org/apache/hadoop/hbase/master/TestRestartCluster.java index 30e59b6..9185c1e 100644 --- src/test/java/org/apache/hadoop/hbase/master/TestRestartCluster.java +++ src/test/java/org/apache/hadoop/hbase/master/TestRestartCluster.java @@ -94,8 +94,11 @@ public class TestRestartCluster { LOG.info("\n\nCreating tables"); for(byte [] TABLE : TABLES) { UTIL.createTable(TABLE, FAMILY); + } + for(byte [] TABLE : TABLES) { UTIL.waitTableAvailable(TABLE, 30000); } + List allRegions = MetaScanner.listAllRegions(UTIL.getConfiguration()); assertEquals(3, allRegions.size()); diff --git src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java index 09d1b0b..d6b88d4 100644 --- src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java +++ src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java @@ -127,7 +127,7 @@ public class TestHLog { } @AfterClass public static void tearDownAfterClass() throws Exception { - TEST_UTIL.shutdownMiniDFSCluster(); + TEST_UTIL.shutdownMiniCluster(); } private static String getName() { diff --git src/test/java/org/apache/hadoop/hbase/replication/TestReplicationPeer.java src/test/java/org/apache/hadoop/hbase/replication/TestReplicationPeer.java index 25a9b25..9685710 100644 --- src/test/java/org/apache/hadoop/hbase/replication/TestReplicationPeer.java +++ src/test/java/org/apache/hadoop/hbase/replication/TestReplicationPeer.java @@ -41,6 +41,10 @@ public class TestReplicationPeer { conf = HBaseConfiguration.create(); utility = new HBaseTestingUtility(conf); conf = utility.getConfiguration(); + conf.setInt( "zookeeper.session.timeout", + HBaseTestingUtility.zooKeeperTimeoutForTimeoutTest); + conf.setInt("hbase.zookeeper.property.tickTime", + HBaseTestingUtility.zooKeeperTimeoutForTimeoutTest/10); utility.startMiniZKCluster(); rp = new ReplicationPeer(conf, "clusterKey", "clusterId"); diff --git src/test/java/org/apache/hadoop/hbase/util/TestFSTableDescriptors.java src/test/java/org/apache/hadoop/hbase/util/TestFSTableDescriptors.java index f7e4431..caf3d19 100644 --- src/test/java/org/apache/hadoop/hbase/util/TestFSTableDescriptors.java +++ src/test/java/org/apache/hadoop/hbase/util/TestFSTableDescriptors.java @@ -59,7 +59,7 @@ public class TestFSTableDescriptors { assertTrue(FSTableDescriptors.createTableDescriptor(fs, testdir, htd)); assertFalse(FSTableDescriptors.createTableDescriptor(fs, testdir, htd)); FileStatus [] statuses = fs.listStatus(testdir); - assertTrue(statuses.length == 1); + assertTrue("statuses.length="+statuses.length, statuses.length == 1); for (int i = 0; i < 10; i++) { FSTableDescriptors.updateHTableDescriptor(fs, testdir, htd); }