diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java index e9de20a94bb..112d5ab5f62 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java @@ -18,6 +18,12 @@ package org.apache.hadoop.yarn.server; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileUtil; @@ -27,22 +33,13 @@ import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.conf.YarnConfiguration; -import org.apache.hadoop.yarn.server.MiniYARNCluster; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; - import org.junit.AfterClass; +import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; - -import org.junit.Assert; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -203,19 +200,36 @@ private void testDirsFailures(boolean localORLogDirs) throws IOException { } /** - * Wait for the NodeManger to go for the disk-health-check at least once. + * Wait for the NodeManager to perform the disk-health-check at least twice.
+ * A single check may not be enough if it coincides with the file operations + * performed by the test. Fails the test if 10 polls do not see two checks. */ - private void waitForDiskHealthCheck() { - long lastDisksCheckTime = dirsHandler.getLastDisksCheckTime(); - long time = lastDisksCheckTime; - for (int i = 0; i < 10 && (time <= lastDisksCheckTime); i++) { + private void waitForTwoDiskHealthCheck() { + long lastDisksCheckTimeBeforeWait = dirsHandler.getLastDisksCheckTime(); + int checkCounter = 0; + int requiredCheckNumber = 2; + int maxTimeoutIteration = 10; + int iterationCounter = 0; + for (; iterationCounter < maxTimeoutIteration; iterationCounter++) { try { - Thread.sleep(1000); - } catch(InterruptedException e) { + Thread.sleep(DISK_HEALTH_CHECK_INTERVAL); + } catch (InterruptedException e) { LOG.error( "Interrupted while waiting for NodeManager's disk health check."); } - time = dirsHandler.getLastDisksCheckTime(); + long lastDiskCheckTime = dirsHandler.getLastDisksCheckTime(); + if (lastDiskCheckTime > lastDisksCheckTimeBeforeWait) { + checkCounter++; + lastDisksCheckTimeBeforeWait = lastDiskCheckTime; + if (checkCounter == requiredCheckNumber) { + break; + } + } + } + if (iterationCounter == maxTimeoutIteration) { + Assert.fail( + "Disk check was not initiated. Number of perceived disk checks:" + + checkCounter); } } @@ -229,7 +243,7 @@ private void waitForDiskHealthCheck() { private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs, boolean isHealthy) { // Wait for the NodeManager to identify disk failures. - waitForDiskHealthCheck(); + waitForTwoDiskHealthCheck(); List list = localORLogDirs ? dirsHandler.getLocalDirs() : dirsHandler.getLogDirs(); @@ -263,17 +277,35 @@ private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs, } /** - * Prepare directory for a failure: Replace the given directory on the - * local FileSystem with a regular file with the same name. 
- * This would cause failure of creation of directory in DiskChecker.checkDir() - * with the same name. + * Prepare directory for a failure: Replace the given directory on the local + * FileSystem with a regular file with the same name. This would cause failure + * of creation of directory in DiskChecker.checkDir() with the same name. + * Makes up to 3 attempts; if all fail, returns without signalling an error. + * * @param dir the directory to be failed - * @throws IOException + * @throws IOException not thrown in practice; failures are caught internally */ private void prepareDirToFail(String dir) throws IOException { + prepareDirToFail(dir, 1, 3); + } + + private void prepareDirToFail(String dir, int tryCounter, int maxTryNumber) { + boolean deleteSucceeded = tryToPrepareDirToFail(dir); + if (!deleteSucceeded && maxTryNumber > tryCounter) { + prepareDirToFail(dir, ++tryCounter, maxTryNumber); + } + } + + private boolean tryToPrepareDirToFail(String dir) { + boolean deleteSucceeded; File file = new File(dir); - FileUtil.fullyDelete(file); - file.createNewFile(); - LOG.info("Prepared " + dir + " to fail."); + try { + deleteSucceeded = FileUtil.fullyDelete(file); + deleteSucceeded = deleteSucceeded && file.createNewFile(); + LOG.info("Prepared " + dir + " to fail."); + } catch (IOException e) { + deleteSucceeded = false; + } + return deleteSucceeded; } }