diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 72ad08f..58edba3 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -315,8 +315,14 @@ public static final String RM_STORE = RM_PREFIX + "store.class"; /** URI for FileSystemRMStateStore */ - public static final String FS_RM_STATE_STORE_URI = - RM_PREFIX + "fs.state-store.uri"; + public static final String FS_RM_STATE_STORE_URI = RM_PREFIX + + "fs.state-store.uri"; + public static final String FS_RM_STATE_STORE_RETRY_ENABLED = RM_PREFIX + + "fs.state-store.retry.enabled"; + public static final boolean DEFAULT_FS_RM_STATE_STORE_RETRY_ENABLED = true; + public static final String FS_RM_STATE_STORE_RETRY_POLICY_SPEC = RM_PREFIX + + "fs.state-store.retry.policy.spec"; + public static final String DEFAULT_FS_RM_STATE_STORE_RETRY_POLICY_SPEC = "10000,6,60000,10"; /** * Comma separated host:port pairs, each corresponding to a ZK server for * ZKRMStateStore @@ -325,7 +331,11 @@ RM_PREFIX + "zk.state-store."; public static final String ZK_RM_STATE_STORE_NUM_RETRIES = ZK_STATE_STORE_PREFIX + "num-retries"; - public static final int DEFAULT_ZK_RM_STATE_STORE_NUM_RETRIES = 3; + public static final int DEFAULT_ZK_RM_STATE_STORE_NUM_RETRIES = 120; + /** retry interval when connecting to zookeeper*/ + public static final String ZK_RM_STATE_STORE_RETRY_INTERVAL_MS = + ZK_STATE_STORE_PREFIX + "retry.interval.ms"; + public static final long DEFAULT_ZK_RM_STATE_STORE_RETRY_INTERVAL_MS = 10000; public static final String ZK_RM_STATE_STORE_ADDRESS = ZK_STATE_STORE_PREFIX + "address"; /** Timeout in millisec for ZK server connection for ZKRMStateStore */ diff 
--git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 7f6e050..9225423 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -294,7 +294,14 @@ org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore as the value for yarn.resourcemanager.store.class yarn.resourcemanager.zk.state-store.num-retries - 3 + 120 + + + + Retry interval in milliseconds when ZKRMStateStore tries to + connect to ZooKeeper. + yarn.resourcemanager.zk.state-store.retry.interval.ms + 10000 @@ -307,7 +314,11 @@ - Timeout when connecting to ZooKeeper. + ZooKeeper session timeout in milliseconds. Session expiration + is managed by the ZooKeeper cluster itself, not by the client. This value is + used by the cluster to determine when the client's session expires. + Expiration happens when the cluster does not hear from the client within + the specified session timeout period (i.e. no heartbeat). This may be supplied when using org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore as the value for yarn.resourcemanager.store.class @@ -360,6 +371,28 @@ + Enable hdfs client retry. If true, hdfs client will retry + in case of connection failures or SafeMode exception of NameNode. + This is used when using + org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore + as the value for yarn.resourcemanager.store.class + + yarn.resourcemanager.fs.state-store.retry.enabled + true + + + + hdfs client retry policy specification. This is used when + yarn.resourcemanager.fs.state-store.retry.enabled is enabled.
+ Specified as pairs of sleep-time and number-of-retries, (t0, n0), + (t1, n1), ...: the first n0 retries sleep t0 milliseconds on average, + the following n1 retries sleep t1 milliseconds on average, and so on. + + yarn.resourcemanager.fs.state-store.retry.policy.spec + 10000,6,60000,10 + + + Enable RM high-availability. When enabled, (1) The RM starts in the Standby mode by default, and transitions to the Active mode when prompted to. diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java index 34bcdf0..70c98cf 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/FileSystemRMStateStore.java @@ -90,7 +90,19 @@ protected synchronized void startInternal() throws Exception { // create filesystem only now, as part of service-start. By this time, RM is // authenticated with kerberos so we are good to create a file-system // handle.
- fs = fsWorkingPath.getFileSystem(getConfig()); + Configuration conf = new Configuration(getConfig()); + boolean retryEnabled = + conf.getBoolean(YarnConfiguration.FS_RM_STATE_STORE_RETRY_ENABLED, + YarnConfiguration.DEFAULT_FS_RM_STATE_STORE_RETRY_ENABLED); + conf.setBoolean("dfs.client.retry.policy.enabled", retryEnabled); + if (retryEnabled) { + String retryPolicy = + conf.get(YarnConfiguration.FS_RM_STATE_STORE_RETRY_POLICY_SPEC, + YarnConfiguration.DEFAULT_FS_RM_STATE_STORE_RETRY_POLICY_SPEC); + conf.set("dfs.client.retry.policy.spec", retryPolicy); + } + + fs = fsWorkingPath.getFileSystem(conf); fs.mkdirs(rmDTSecretManagerRoot); fs.mkdirs(rmAppRoot); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java index a4d0c93..8521b67 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java @@ -77,6 +77,7 @@ private String zkHostPort = null; private int zkSessionTimeout; + private long zkRetryInterval; private List zkAcl; private String zkRootNodePath; private String rmDTSecretManagerRoot; @@ -156,6 +157,9 @@ public synchronized void initInternal(Configuration conf) throws Exception { zkSessionTimeout = conf.getInt(YarnConfiguration.ZK_RM_STATE_STORE_TIMEOUT_MS, YarnConfiguration.DEFAULT_ZK_RM_STATE_STORE_TIMEOUT_MS); + zkRetryInterval = + conf.getLong(YarnConfiguration.ZK_RM_STATE_STORE_RETRY_INTERVAL_MS, + 
YarnConfiguration.DEFAULT_ZK_RM_STATE_STORE_RETRY_INTERVAL_MS); // Parse authentication from configuration. String zkAclConf = conf.get(YarnConfiguration.ZK_RM_STATE_STORE_ACL, @@ -803,6 +807,9 @@ T runWithRetries() throws Exception { } } catch (KeeperException ke) { if (shouldRetry(ke.code()) && ++retry < numRetries) { + LOG.info("Waiting for zookeeper to be connected, retry no. " + + retry); + Thread.sleep(zkRetryInterval); continue; } throw ke;