diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java index e9de20a94bb..07548694144 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java @@ -18,6 +18,12 @@ package org.apache.hadoop.yarn.server; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileUtil; @@ -27,22 +33,13 @@ import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.conf.YarnConfiguration; -import org.apache.hadoop.yarn.server.MiniYARNCluster; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; - import org.junit.AfterClass; +import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; - -import org.junit.Assert; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -127,8 +124,8 @@ public void testDirFailuresOnStartup() throws IOException { conf.set(YarnConfiguration.NM_LOCAL_DIRS, localDir1 + "," + localDir2); conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2); - prepareDirToFail(localDir1); - 
prepareDirToFail(logDir2); + prepareDirToFail(localDir1, 1); + prepareDirToFail(logDir2, 1); LocalDirsHandlerService dirSvc = new LocalDirsHandlerService(); dirSvc.init(conf); @@ -180,7 +177,7 @@ private void testDirsFailures(boolean localORLogDirs) throws IOException { // Make 1 nm-local-dir fail and verify if "the nodemanager can identify // the disk failure(s) and can update the list of good nm-local-dirs. - prepareDirToFail(dirs[2]); + prepareDirToFail(dirs[2], 1); expectedDirs = dirs[0] + "," + dirs[1] + "," + dirs[3]; verifyDisksHealth(localORLogDirs, expectedDirs, true); @@ -189,33 +186,49 @@ private void testDirsFailures(boolean localORLogDirs) throws IOException { // nodemanager can identify the disk failures and can update the list of // good nm-local-dirs/nm-log-dirs and can update the overall health status // of the node to unhealthy". - prepareDirToFail(dirs[0]); + prepareDirToFail(dirs[0], 1); expectedDirs = dirs[1] + "," + dirs[3]; verifyDisksHealth(localORLogDirs, expectedDirs, false); // Fail the remaining 2 local-dirs/log-dirs and verify if NM remains with // empty list of local-dirs/log-dirs and the overall health status is // unhealthy. - prepareDirToFail(dirs[1]); - prepareDirToFail(dirs[3]); + prepareDirToFail(dirs[1], 1); + prepareDirToFail(dirs[3], 1); expectedDirs = ""; verifyDisksHealth(localORLogDirs, expectedDirs, false); } /** - * Wait for the NodeManger to go for the disk-health-check at least once. + * Wait for the NodeManager to go for the disk-health-check at least twice. A
+ * single health check may not be enough if it happens to run while the test + * is still performing its file operations. */ - private void waitForDiskHealthCheck() { - long lastDisksCheckTime = dirsHandler.getLastDisksCheckTime(); - long time = lastDisksCheckTime; - for (int i = 0; i < 10 && (time <= lastDisksCheckTime); i++) { + private void waitForTwoDiskHealthCheck() { + long lastDisksCheckTimeBeforeWait = dirsHandler.getLastDisksCheckTime(); + int checkCounter = 0; + int requiredCheckNumber = 2; + int maxTimeoutIteration = 10; + for (int i = 0; i <= maxTimeoutIteration; i++) { try { - Thread.sleep(1000); - } catch(InterruptedException e) { + Thread.sleep(DISK_HEALTH_CHECK_INTERVAL); + } catch (InterruptedException e) { LOG.error( "Interrupted while waiting for NodeManager's disk health check."); } - time = dirsHandler.getLastDisksCheckTime(); + long lastDiskCheckTime = dirsHandler.getLastDisksCheckTime(); + if (lastDiskCheckTime > lastDisksCheckTimeBeforeWait) { + checkCounter++; + lastDisksCheckTimeBeforeWait = lastDiskCheckTime; + if (checkCounter == requiredCheckNumber) { + break; + } + } + if (i == maxTimeoutIteration) { + Assert.fail( + "Disk check was not initiated. Number of perceived disk checks:" + + checkCounter); + } } } @@ -229,7 +242,7 @@ private void waitForDiskHealthCheck() { private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs, boolean isHealthy) { // Wait for the NodeManager to identify disk failures. - waitForDiskHealthCheck(); + waitForTwoDiskHealthCheck(); List list = localORLogDirs ? dirsHandler.getLocalDirs() : dirsHandler.getLogDirs(); @@ -263,17 +276,27 @@ private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs, } /** - * Prepare directory for a failure: Replace the given directory on the - * local FileSystem with a regular file with the same name. - * This would cause failure of creation of directory in DiskChecker.checkDir() - * with the same name. 
+ * Prepare directory for a failure: replace it on the local FileSystem with a + * same-named regular file so DiskChecker.checkDir() cannot create the dir. * @param dir the directory to be failed - * @throws IOException + * @param tryCounter 1-based number of this attempt; retried until attempt 3 + * @throws IOException if the final attempt fails as well */ - private void prepareDirToFail(String dir) throws IOException { + private void prepareDirToFail(String dir, int tryCounter) throws IOException { + int maxTryNumber = 3; File file = new File(dir); - FileUtil.fullyDelete(file); - file.createNewFile(); - LOG.info("Prepared " + dir + " to fail."); + IOException cause = null; + try { + if (FileUtil.fullyDelete(file) && file.createNewFile()) { + LOG.info("Prepared " + dir + " to fail."); + return; + } + } catch (IOException e) { + cause = e; + } + if (tryCounter >= maxTryNumber) { + throw new IOException("Failed to prepare " + dir + " to fail.", cause); + } + prepareDirToFail(dir, tryCounter + 1); } }