Index: src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java	(revision 1006040)
+++ src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java	(working copy)
@@ -104,7 +104,30 @@
     try {
       // Create all the necessary "directories" of znodes
       // TODO: Move this to an init method somewhere so not everyone calls it?
-      ZKUtil.createAndFailSilent(this, baseZNode);
+
+      // The first call against zk can fail with connection loss. Seems common.
+      // Apparently this is recoverable. Retry until the deadline passes.
+      // See http://wiki.apache.org/hadoop/ZooKeeper/ErrorHandling
+      // TODO: Generalize out in ZKUtil.
+      long wait = conf.getLong("hbase.zookeeper.recoverable.waittime", 10000);
+      long finished = System.currentTimeMillis() + wait;
+      KeeperException ke = null;
+      do {
+        try {
+          ZKUtil.createAndFailSilent(this, baseZNode);
+          ke = null;
+          break;
+        } catch (KeeperException.ConnectionLossException e) {
+          if (LOG.isDebugEnabled() && !isFinishedRetryingRecoverable(finished)) {
+            LOG.debug("Retrying zk create for another " +
+              (finished - System.currentTimeMillis()) +
+              "ms; set 'hbase.zookeeper.recoverable.waittime' to change " +
+              "wait time); " + e.getMessage());
+          }
+          ke = e;
+        }
+      } while (!isFinishedRetryingRecoverable(finished));
+      if (ke != null) throw ke;
       ZKUtil.createAndFailSilent(this, assignmentZNode);
       ZKUtil.createAndFailSilent(this, rsZNode);
       ZKUtil.createAndFailSilent(this, tableZNode);
@@ -114,6 +137,16 @@
     }
   }
 
+  /**
+   * @param finished Time, as per System.currentTimeMillis, at which to give
+   * up retrying a recoverable zookeeper error.
+   * @return True if <code>finished</code> has been reached and callers should
+   * stop retrying.
+   */
+  private boolean isFinishedRetryingRecoverable(final long finished) {
+    return System.currentTimeMillis() >= finished;
+  }
+
   @Override
   public String toString() {
     return this.identifier;
Index: src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java	(revision 1006040)
+++ src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java	(working copy)
@@ -402,11 +402,19 @@
    * @throws InterruptedException
    */
   private void initialize() throws IOException, InterruptedException {
-    initializeZooKeeper();
-    initializeThreads();
-    int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
-    for (int i = 0; i < nbBlocks; i++) {
-      reservedSpace.add(new byte[HConstants.DEFAULT_SIZE_RESERVATION_BLOCK]);
+    try {
+      initializeZooKeeper();
+      initializeThreads();
+      int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
+      for (int i = 0; i < nbBlocks; i++) {
+        reservedSpace.add(new byte[HConstants.DEFAULT_SIZE_RESERVATION_BLOCK]);
+      }
+    } catch (Throwable t) {
+      // Call stop if error or process will stick around for ever since server
+      // puts up non-daemon threads. Log the cause first; it is not rethrown
+      // so this is the only record of why initialization failed.
+      LOG.error("Failed initialization; stopping server", t);
+      this.server.stop();
     }
   }
 
@@ -2465,4 +2473,4 @@
     new HRegionServerCommandLine(regionServerClass).doMain(args);
   }
 
-}
\ No newline at end of file
+}