diff --git hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
index d9410d2..540fb4b 100644
--- hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
+++ hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
@@ -753,7 +753,7 @@ private boolean reEstablishSession() {
     return success;
   }
 
-  private void createConnection() throws IOException, KeeperException {
+  protected void createConnection() throws IOException, KeeperException {
     if (zkClient != null) {
       try {
         zkClient.close();
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 5176477..3b88662 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -527,6 +527,18 @@ private static void addDeprecatedKeys() {
   public static final int
       DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0;
 
+  /** Number of ZooKeeper operation retries in ActiveStandbyElector. */
+  public static final String RM_HA_FC_ELECTOR_ZK_OP_RETRIES_KEY = RM_HA_PREFIX
+      + "failover-controller.active-standby-elector.zk.op.retries";
+
+  /**
+   * Time interval, in milliseconds, between attempts by
+   * EmbeddedElectorService to connect to the ZooKeeper cluster.
+   */
+  public static final String RM_HA_ZK_CONNECT_RETRY_INTERVAL_MS = RM_HA_PREFIX
+      + "zk.connect.retry-interval.ms";
+  /** Default for {@link #RM_HA_ZK_CONNECT_RETRY_INTERVAL_MS}: 5000 ms. */
+  public static final long DEFAULT_RM_HA_ZK_CONNECT_RETRY_INTERVAL_MS = 5000;
   ////////////////////////////////
   // RM state store configs
   ////////////////////////////////
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 7eb9b84..0aaf855 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -596,6 +596,21 @@
+    When automatic failover is enabled, number of zookeeper
+    operation retry times in ActiveStandbyElector
+    yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.op.retries
+
+
+
+
+    When automatic failover is enabled, Time interval between each
+    attempt for EmbeddedElectorService to connect to Zookeeper Cluster.
+
+    yarn.resourcemanager.ha.zk.connect.retry-interval.ms
+    5000
+
+
+
     The maximum number of completed applications RM keeps.
yarn.resourcemanager.max-completed-applications 10000
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
index 73bdca0..ebf018a 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/EmbeddedElectorService.java
@@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager;
 
 import com.google.protobuf.InvalidProtocolBufferException;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
@@ -86,11 +87,79 @@ protected void serviceInit(Configuration conf)
     List zkAcls = RMZKUtils.getZKAcls(conf);
     List zkAuths = RMZKUtils.getZKAuths(conf);
 
-    int maxRetryNum = conf.getInt(
-        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
-        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
+    // A YARN-specific retry count, when set, takes precedence over the
+    // common HA failover-controller setting.
+    String tempMaxRetryNum =
+        conf.get(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_OP_RETRIES_KEY);
+    final int maxRetryNum =
+        tempMaxRetryNum == null || tempMaxRetryNum.isEmpty() ? conf.getInt(
+            CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
+            CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT)
+            : Integer.parseInt(tempMaxRetryNum);
+
+    final long retryInterval =
+        conf.getLong(YarnConfiguration.RM_HA_ZK_CONNECT_RETRY_INTERVAL_MS,
+            YarnConfiguration.DEFAULT_RM_HA_ZK_CONNECT_RETRY_INTERVAL_MS);
+
     elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
-        electionZNode, zkAcls, zkAuths, this, maxRetryNum);
+        electionZNode, zkAcls, zkAuths, this, maxRetryNum) {
+      /**
+       * Establishes the ZooKeeper connection, retrying on failure.
+       * Makes up to max(1, maxRetryNum) attempts and waits
+       * retryInterval milliseconds between consecutive attempts.
+       *
+       * @throws IOException if every attempt fails (the last failure
+       *         is attached as the cause) or if the thread is
+       *         interrupted while waiting to retry
+       */
+      @Override
+      protected void createConnection() throws IOException, KeeperException {
+        // Always try at least once, even when retries are configured as 0.
+        final int maxAttempts = Math.max(1, maxRetryNum);
+        Exception lastFailure = null;
+        for (int attempt = 1; attempt <= maxAttempts; ++attempt) {
+          LOG.debug("Establishing zookeeper connection for " + this);
+          try {
+            super.createConnection();
+            return;
+          } catch (IOException e) {
+            lastFailure = e;
+          } catch (KeeperException e) {
+            lastFailure = e;
+          }
+          LOG.warn("Attempt " + attempt + " of " + maxAttempts
+              + " to connect to Zookeeper failed", lastFailure);
+          // Sleeping after the final attempt would only delay the error.
+          if (attempt < maxAttempts) {
+            sleepFor(retryInterval);
+          }
+        }
+        throw new IOException(
+            "Can not establish Zookeeper Connection for " + this
+            + " after retry " + maxAttempts + " times", lastFailure);
+      }
+
+      /**
+       * Pauses the calling thread before the next connection attempt.
+       *
+       * @param retryInterval pause in milliseconds; non-positive values
+       *        mean no pause
+       * @throws IOException if the thread is interrupted while sleeping
+       */
+      private void sleepFor(long retryInterval) throws IOException {
+        if (retryInterval > 0) {
+          try {
+            Thread.sleep(retryInterval);
+          } catch (InterruptedException e) {
+            // Restore the interrupt flag and stop retrying so that a
+            // pending shutdown request is not silently ignored.
+            Thread.currentThread().interrupt();
+            throw new IOException(
+                "Interrupted while waiting to retry Zookeeper connection", e);
+          }
+        }
+      }
+    };
 
     elector.ensureParentZNode();
     if (!isParentZnodeSafe(clusterId)) {