Retry the first znode creation in ZooKeeperWatcher while ZooKeeper reports
ConnectionLoss at startup (bounded by 'hbase.zookeeper.startup.connectionloss.wait',
default 10000 ms), and call initializeZooKeeper() in HRegionServer.initialize()
before the RPC server is created instead of just before initializeThreads().
NOTE(review): leading indentation inside the hunks was reconstructed; if a hunk
fails to match on whitespace, apply with `patch -l` -- confirm against the tree.

Index: src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (revision 1003377)
+++ src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperWatcher.java (working copy)
@@ -104,7 +104,27 @@
     try {
       // Create all the necessary "directories" of znodes
       // TODO: Move this to an init method somewhere so not everyone calls it?
-      ZKUtil.createAndFailSilent(this, baseZNode);
+
+      // The first job on zk can fail with connection loss. Seems common.
+      // Apparently this is recoverable. Retry a while.
+      // See http://wiki.apache.org/hadoop/ZooKeeper/ErrorHandling
+      long wait = conf.getLong("hbase.zookeeper.startup.connectionloss.wait", 10000);
+      long finished = System.currentTimeMillis() + wait;
+      KeeperException ke = null;
+      do {
+        try {
+          ZKUtil.createAndFailSilent(this, baseZNode);
+          ke = null;
+          break;
+        } catch (KeeperException.ConnectionLossException e) {
+          LOG.debug("Retrying for another " + (finished -
+            System.currentTimeMillis()) +
+            "ms (Set 'hbase.zookeeper.startup.connectionloss.wait' to change " +
+            "wait time); " + e.getMessage());
+          ke = e;
+        }
+      } while (System.currentTimeMillis() < finished);
+      if (ke != null) throw ke;
       ZKUtil.createAndFailSilent(this, assignmentZNode);
       ZKUtil.createAndFailSilent(this, rsZNode);
       ZKUtil.createAndFailSilent(this, tableZNode);
Index: src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (revision 1003377)
+++ src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (working copy)
@@ -395,7 +395,7 @@
   private void initialize() throws IOException, InterruptedException {
     this.abortRequested = false;
     this.stopped = false;
-
+    initializeZooKeeper();
     // Server to handle client requests
     this.server = HBaseRPC.getServer(this,
       new Class[]{HRegionInterface.class, HBaseRPCErrorHandler.class,
@@ -417,7 +417,6 @@
       throw new NullPointerException("Server address cannot be null; " +
         "hbase-958 debugging");
     }
-    initializeZooKeeper();
     initializeThreads();
     int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
     for (int i = 0; i < nbBlocks; i++) {