diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 72ad08f..58edba3 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -315,8 +315,14 @@
public static final String RM_STORE = RM_PREFIX + "store.class";
/** URI for FileSystemRMStateStore */
- public static final String FS_RM_STATE_STORE_URI =
- RM_PREFIX + "fs.state-store.uri";
+ public static final String FS_RM_STATE_STORE_URI = RM_PREFIX
+ + "fs.state-store.uri";
+ public static final String FS_RM_STATE_STORE_RETRY_ENABLED = RM_PREFIX
+ + "fs.state-store.retry.enabled";
+ public static final boolean DEFAULT_FS_RM_STATE_STORE_RETRY_ENABLED = true;
+ public static final String FS_RM_STATE_STORE_RETRY_POLICY_SPEC = RM_PREFIX
+ + "fs.state-store.retry.policy.spec";
+ public static final String DEFAULT_FS_RM_STATE_STORE_RETRY_POLICY_SPEC = "10000,6,60000,10";
/**
* Comma separated host:port pairs, each corresponding to a ZK server for
* ZKRMStateStore
@@ -325,7 +331,11 @@
RM_PREFIX + "zk.state-store.";
public static final String ZK_RM_STATE_STORE_NUM_RETRIES =
ZK_STATE_STORE_PREFIX + "num-retries";
- public static final int DEFAULT_ZK_RM_STATE_STORE_NUM_RETRIES = 3;
+ public static final int DEFAULT_ZK_RM_STATE_STORE_NUM_RETRIES = 120;
+ /** Retry interval in milliseconds when connecting to ZooKeeper. */
+ public static final String ZK_RM_STATE_STORE_RETRY_INTERVAL_MS =
+ ZK_STATE_STORE_PREFIX + "retry.interval.ms";
+ public static final long DEFAULT_ZK_RM_STATE_STORE_RETRY_INTERVAL_MS = 10000;
public static final String ZK_RM_STATE_STORE_ADDRESS =
ZK_STATE_STORE_PREFIX + "address";
/** Timeout in millisec for ZK server connection for ZKRMStateStore */
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 7f6e050..9225423 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -294,7 +294,14 @@
org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore
as the value for yarn.resourcemanager.store.class
yarn.resourcemanager.zk.state-store.num-retries
- 3
+ 120
+
+
+
+ Retry interval in milliseconds when ZKRMStateStore tries to
+ connect to ZooKeeper.
+ yarn.resourcemanager.zk.state-store.retry.interval.ms
+ 10000
@@ -307,7 +314,11 @@
- Timeout when connecting to ZooKeeper.
+ ZooKeeper session timeout in milliseconds. Session expiration
+ is managed by the ZooKeeper cluster itself, not by the client. This value is
+ used by the cluster to determine when the client's session expires.
+ Expiration happens when the cluster does not hear from the client within
+ the specified session timeout period (i.e. no heartbeat).
This may be supplied when using
org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore
as the value for yarn.resourcemanager.store.class
@@ -360,6 +371,28 @@
+ Enable HDFS client retry. If true, the HDFS client will retry
+ in case of connection failures or a SafeMode exception from the NameNode.
+ This is used when using
+ org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
+ as the value for yarn.resourcemanager.store.class
+
+ yarn.resourcemanager.fs.state-store.retry.enabled
+ true
+
+
+
+ hdfs client retry policy specification. This is used when
+ yarn.resourcemanager.fs.state-store.retry.enabled is enabled.
+ Specified as pairs of sleep-time and number-of-retries, (t0, n0),
+ (t1, n1), ...: the first n0 retries sleep t0 milliseconds on average,
+ the following n1 retries sleep t1 milliseconds on average, and so on.
+
+ yarn.resourcemanager.fs.state-store.retry.policy.spec
+ 10000,6,60000,10
+
+
+
Enable RM high-availability. When enabled,
(1) The RM starts in the Standby mode by default, and transitions to
the Active mode when prompted to.
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java
index 34bcdf0..70c98cf 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java
@@ -90,7 +90,19 @@ protected synchronized void startInternal() throws Exception {
// create filesystem only now, as part of service-start. By this time, RM is
// authenticated with kerberos so we are good to create a file-system
// handle.
- fs = fsWorkingPath.getFileSystem(getConfig());
+ Configuration conf = new Configuration(getConfig());
+ boolean retryEnabled =
+ conf.getBoolean(YarnConfiguration.FS_RM_STATE_STORE_RETRY_ENABLED,
+ YarnConfiguration.DEFAULT_FS_RM_STATE_STORE_RETRY_ENABLED);
+ conf.setBoolean("dfs.client.retry.policy.enabled", retryEnabled);
+ if (retryEnabled) {
+ String retryPolicy =
+ conf.get(YarnConfiguration.FS_RM_STATE_STORE_RETRY_POLICY_SPEC,
+ YarnConfiguration.DEFAULT_FS_RM_STATE_STORE_RETRY_POLICY_SPEC);
+ conf.set("dfs.client.retry.policy.spec", retryPolicy);
+ }
+
+ fs = fsWorkingPath.getFileSystem(conf);
fs.mkdirs(rmDTSecretManagerRoot);
fs.mkdirs(rmAppRoot);
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
index a4d0c93..8521b67 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
@@ -77,6 +77,7 @@
private String zkHostPort = null;
private int zkSessionTimeout;
+ private long zkRetryInterval;
private List zkAcl;
private String zkRootNodePath;
private String rmDTSecretManagerRoot;
@@ -156,6 +157,9 @@ public synchronized void initInternal(Configuration conf) throws Exception {
zkSessionTimeout =
conf.getInt(YarnConfiguration.ZK_RM_STATE_STORE_TIMEOUT_MS,
YarnConfiguration.DEFAULT_ZK_RM_STATE_STORE_TIMEOUT_MS);
+ zkRetryInterval =
+ conf.getLong(YarnConfiguration.ZK_RM_STATE_STORE_RETRY_INTERVAL_MS,
+ YarnConfiguration.DEFAULT_ZK_RM_STATE_STORE_RETRY_INTERVAL_MS);
// Parse authentication from configuration.
String zkAclConf =
conf.get(YarnConfiguration.ZK_RM_STATE_STORE_ACL,
@@ -803,6 +807,10 @@ T runWithRetries() throws Exception {
}
} catch (KeeperException ke) {
if (shouldRetry(ke.code()) && ++retry < numRetries) {
+ // Pause for the configured retry interval before the next attempt.
+ LOG.info("Waiting for zookeeper to be connected, retry no. "
+ + retry);
+ Thread.sleep(zkRetryInterval);
continue;
}
throw ke;