Patch summary (free text before the first "Index:" line is ignored by patch tools):
  * ZooKeeperWatcher: the first znode creation is retried on ConnectionLossException
    until a deadline read from 'hbase.zookeeper.startup.connectionloss.wait'
    (default 10000 ms); the last ConnectionLossException is rethrown if time runs out.
    The debug message reports the time remaining (finished - now).
  * HRegionServer: initialization is wrapped in try/catch; on any Throwable the
    failure is logged and the server is stopped so the process does not linger.

Index: src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (revision 1003377)
+++ src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (working copy)
@@ -104,7 +104,27 @@
     try {
       // Create all the necessary "directories" of znodes
       // TODO: Move this to an init method somewhere so not everyone calls it?
-      ZKUtil.createAndFailSilent(this, baseZNode);
+
+      // The first job on zk can fail with connection loss. Seems common.
+      // Apparently this is recoverable. Retry a while.
+      // See http://wiki.apache.org/hadoop/ZooKeeper/ErrorHandling
+      long wait = conf.getLong("hbase.zookeeper.startup.connectionloss.wait", 10000);
+      long finished = System.currentTimeMillis() + wait;
+      KeeperException ke = null;
+      do {
+        try {
+          ZKUtil.createAndFailSilent(this, baseZNode);
+          ke = null;
+          break;
+        } catch (KeeperException.ConnectionLossException e) {
+          LOG.debug("Retrying for another " + (finished -
+            System.currentTimeMillis()) +
+            "ms (Set 'hbase.zookeeper.startup.connectionloss.wait' to change " +
+            "wait time); " + e.getMessage());
+          ke = e;
+        }
+      } while (System.currentTimeMillis() < finished);
+      if (ke != null) throw ke;
       ZKUtil.createAndFailSilent(this, assignmentZNode);
       ZKUtil.createAndFailSilent(this, rsZNode);
       ZKUtil.createAndFailSilent(this, tableZNode);
Index: src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (revision 1003377)
+++ src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (working copy)
@@ -417,11 +417,18 @@
       throw new NullPointerException("Server address cannot be null; " +
         "hbase-958 debugging");
     }
-    initializeZooKeeper();
-    initializeThreads();
-    int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
-    for (int i = 0; i < nbBlocks; i++) {
-      reservedSpace.add(new byte[HConstants.DEFAULT_SIZE_RESERVATION_BLOCK]);
+    try {
+      initializeZooKeeper();
+      initializeThreads();
+      int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
+      for (int i = 0; i < nbBlocks; i++) {
+        reservedSpace.add(new byte[HConstants.DEFAULT_SIZE_RESERVATION_BLOCK]);
+      }
+    } catch (Throwable t) {
+      // Call stop if error or process will stick around for ever since server
+      // puts up non-daemon threads.
+      LOG.error("Stopping region server because initialization failed", t);
+      this.server.stop();
     }
   }
 