diff --git hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
index d9410d2..540fb4b 100644
--- hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
+++ hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
@@ -753,7 +753,7 @@ private boolean reEstablishSession() {
return success;
}
- private void createConnection() throws IOException, KeeperException {
+ protected void createConnection() throws IOException, KeeperException {
if (zkClient != null) {
try {
zkClient.close();
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 5176477..b6a095f 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -527,6 +527,16 @@ private static void addDeprecatedKeys() {
public static final int
DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0;
+ /** Number of times a ZooKeeper operation is retried in ActiveStandbyElector. */
+ public static final String RM_HA_FC_ELECTOR_ZK_OP_RETRIES_KEY = RM_HA_PREFIX
+ + "failover-controller.active-standby-elector.zk.op.retries";
+
+ /** Time interval, in milliseconds, between successive attempts by
+ * EmbeddedElectorService to connect to the ZooKeeper cluster.
+ */
+ public static final String RM_HA_ZK_CONNECT_RETRY_INTERVAL_MS = RM_HA_PREFIX
+ + "zk.connect.retry-interval.ms";
+ public static final long DEFAULT_RM_HA_ZK_CONNECT_RETRY_INTERVAL_MS = 5000;
////////////////////////////////
// RM state store configs
////////////////////////////////
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 7eb9b84..0aaf855 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -596,6 +596,21 @@
+ When automatic failover is enabled, the number of times a ZooKeeper
+ operation is retried in ActiveStandbyElector.
+ yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.op.retries
+
+
+
+
+ When automatic failover is enabled, the time interval in milliseconds between
+ successive attempts by EmbeddedElectorService to connect to the ZooKeeper cluster.
+
+ yarn.resourcemanager.ha.zk.connect.retry-interval.ms
+ 5000
+
+
+
The maximum number of completed applications RM keeps.
yarn.resourcemanager.max-completed-applications
10000
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
index 73bdca0..ebf018a 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.server.resourcemanager;
import com.google.protobuf.InvalidProtocolBufferException;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
@@ -86,11 +87,55 @@ protected void serviceInit(Configuration conf)
List zkAcls = RMZKUtils.getZKAcls(conf);
List zkAuths = RMZKUtils.getZKAuths(conf);
- int maxRetryNum = conf.getInt(
- CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
- CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
+ String tempMaxRetryNum =
+ conf.get(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_OP_RETRIES_KEY);
+ final int maxRetryNum =
+ tempMaxRetryNum == null || tempMaxRetryNum.isEmpty() ? conf.getInt(
+ CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
+ CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT)
+ : Integer.parseInt(tempMaxRetryNum);
+
+ final long retryInterval =
+ conf.getLong(YarnConfiguration.RM_HA_ZK_CONNECT_RETRY_INTERVAL_MS,
+ YarnConfiguration.DEFAULT_RM_HA_ZK_CONNECT_RETRY_INTERVAL_MS);
+
elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
- electionZNode, zkAcls, zkAuths, this, maxRetryNum);
+ electionZNode, zkAcls, zkAuths, this, maxRetryNum) {
+ /** Retries the parent's connection attempt up to maxRetryNum times,
+  * sleeping retryInterval ms between attempts. */
+ @Override
+ protected void createConnection() throws IOException, KeeperException {
+   Exception lastFailure = null; // kept so the final error reports its cause
+   for (int attempt = 1; attempt <= maxRetryNum; attempt++) {
+     LOG.debug("Establishing zookeeper connection for " + this);
+     try {
+       super.createConnection();
+       return; // connected; nothing more to do
+     } catch(IOException e) {
+       lastFailure = e;
+     } catch(KeeperException e) {
+       lastFailure = e;
+     }
+     LOG.warn(lastFailure);
+     if (attempt < maxRetryNum) { // no point sleeping after the last attempt
+       sleepFor(retryInterval);
+     }
+   }
+   throw new IOException(
+       "Can not establish Zookeeper Connection for " + this
+       + " after retry " + maxRetryNum + " times", lastFailure);
+ }
+
+ private void sleepFor(long retryInterval) { // pause between connection attempts
+   if (retryInterval > 0) { // non-positive interval means "do not wait"
+     try {
+       Thread.sleep(retryInterval);
+     } catch (InterruptedException e) {
+       Thread.currentThread().interrupt(); // restore the flag instead of swallowing it
+     }
+   }
+ }
+ };
elector.ensureParentZNode();
if (!isParentZnodeSafe(clusterId)) {