Index: hbase-client/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java =================================================================== --- hbase-client/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java (revision 1486276) +++ hbase-client/src/main/java/org/apache/hadoop/hbase/client/HConnectionManager.java (working copy) @@ -614,7 +614,7 @@ } private void checkIfBaseNodeAvailable(ZooKeeperWatcher zkw) - throws MasterNotRunningException { + throws MasterNotRunningException, KeeperException { String errorMsg; try { if (ZKUtil.checkExists(zkw, zkw.baseZNode) == -1) { @@ -628,7 +628,7 @@ } catch (KeeperException e) { errorMsg = "Can't get connection to ZooKeeper: " + e.getMessage(); LOG.error(errorMsg); - throw new MasterNotRunningException(errorMsg, e); + throw e; } } @@ -1382,7 +1382,7 @@ exceptionCaught = e; } - if (exceptionCaught != null) + if (exceptionCaught != null) { // It failed. If it's not the last try, we're going to wait a little if (tries < numTries) { // tries at this point is 1 or more; decrement to start from 0. @@ -1398,12 +1398,33 @@ throw new RuntimeException( "Thread was interrupted while trying to connect to master.", e); } + + // Handle the cluster restart or failover case: while checking whether the znode + // "/hbase" exists, a KeeperException with code CONNECTIONLOSS can be thrown (seen + // during the 1st retry). If that is the exception we caught, reconnect with a new + // ZooKeeper session. 
+ if (exceptionCaught instanceof KeeperException) { + if (((KeeperException) exceptionCaught).code() == + KeeperException.Code.CONNECTIONLOSS) { + try { + getKeepAliveZooKeeperWatcher().reconnectAfterExpiration(); + } catch (IOException ioe) { + LOG.error("Encountered unexpected exception while trying " + + "to recover from ZooKeeper connection loss.", ioe); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new RuntimeException( + "Interrupted while trying to recover from ZooKeeper connection loss.",ie); + } + } + } } else { // Enough tries, we stop now LOG.info("getMaster attempt " + tries + " of " + numTries + " failed; no more retrying.", exceptionCaught); throw new MasterNotRunningException(exceptionCaught); } + } } if (stub == null) {