diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java index f12ada7..3aaea36 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java @@ -318,7 +318,10 @@ public synchronized void startInternal() throws Exception { createRootDir(dtSequenceNumberPath); createRootDir(amrmTokenSecretManagerRoot); - syncInternal(zkRootNodePath); + boolean succeeded = syncInternal(zkRootNodePath); + if (!succeeded) { + throw new IOException("failing to sync operation at starting up RM"); + } } private void createRootDir(final String rootPath) throws Exception { @@ -952,8 +955,14 @@ private boolean syncInternal(String path) throws InterruptedException { } else { zkClient.sync(zkRootNodePath, cb, null); } + /** + * Waiting for zkSessionTimeout * 3 to prevent ZK's sync operation + * from succeeding after the timeout. + * We cannot use zkResyncWaitTime since its default value is + * zkRetryInterval * numRetries: and shorter than zkSessionTimeout. + */ boolean succeededToSync = cb.latch.await( - zkSessionTimeout, TimeUnit.MILLISECONDS); + zkSessionTimeout * 3, TimeUnit.MILLISECONDS); return succeededToSync; }