diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index fce78c93556aba75c68f8487651306503d419e58..36c0436c933c044919a1ed1f8b3924ab72784ba3 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -101,7 +101,23 @@ private static void addDeprecatedKeys() {
YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled";
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = false;
-
+
+ /**
+ * Command to run when a container is killed for exceeding its virtual or
+ * physical memory limit.
+ * This can be used to collect and save debug information to find the root
+ * cause of the issue. {{PID}} and {{LOG_DIR}} in the command are replaced
+ * with the container's process id and log directory before it is run.
+ * Example: jmap -heap {{PID}} 2>{{LOG_DIR}}/heap.err 1>{{LOG_DIR}}/heap.out
+ */
+ public static final String NM_SAVE_DEBUG_INFO_COMMAND =
+ YarnConfiguration.NM_PREFIX + "save-debug-info.command";
+
+ /** Empty by default, which disables debug information collection. */
+ public static final String DEFAULT_NM_SAVE_DEBUG_INFO_COMMAND =
+ "";
+
+ /**
+ * Timeout in seconds for the save debug information command. The command
+ * is killed if it runs longer than this.
+ */
+ public static final String NM_SAVE_DEBUG_INFO_TIMEOUT_SEC =
+ YarnConfiguration.NM_PREFIX + "save-debug-info.timeout-sec";
+
+ /** Primitive default, so Configuration.getInt needs no unboxing. */
+ public static final int DEFAULT_NM_SAVE_DEBUG_INFO_TIMEOUT_SEC = 60;
+
////////////////////////////////
// IPC Configs
////////////////////////////////
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 47d12d16265710ac9b144e171d90c60b676aa3a5..9a86eed7439a16fb13ee94f70618a0a3d1865d3e 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -1027,6 +1027,36 @@
+
+ This property specifies a command to run when a container is killed for
+ exceeding its virtual or physical memory limit.
+ This can be used to collect and save debug information to find the root
+ cause of the issue. If the Linux container executor is enabled, the
+ script has to run as root or as the user who runs the container. In these
+ cases it needs the SUID bit set so that the YARN user can call it.
+ See yarn.nodemanager.linux-container-executor.nonsecure-mode.local-user
+ for details.
+ Examples:
+ jmap -heap {{PID}} 2>{{LOG_DIR}}/heap.err 1>{{LOG_DIR}}/heap.out
+ gcore -o {{LOG_DIR}}/heap.core {{PID}} 2>{{LOG_DIR}}/heap.err
+ 1>{{LOG_DIR}}/heap.out
+ run_gcore_with_suid.sh {{LOG_DIR}} {{PID}}
+
+ yarn.nodemanager.save-debug-info.command
+
+
+
+
+
+ The NodeManager kills the save debug information command if it runs
+ longer than this timeout. The value is in seconds.
+ The command is specified by yarn.nodemanager.save-debug-info.command.
+
+ yarn.nodemanager.save-debug-info.timeout-sec
+ 60
+
+
+
Keytab for NM.
yarn.nodemanager.keytab
/etc/krb5.keytab
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
index 8004f3393aab23953abf303c38216201a8fb83bd..4c1ca99768bc83bdd1d4dcbdb48c0e0d495da410 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
@@ -91,5 +91,7 @@
void sendLaunchEvent();
+ /**
+ * Request cleanup of this container. The ContainerImpl implementation
+ * dispatches a CLEANUP_CONTAINER event to the containers launcher.
+ */
+ void sendCleanupEvent();
+
void sendKillEvent(int exitStatus, String description);
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index 4a6be32976ff243056da2fae682c44f46b451567..931b8d452695c9d8502193ec42a5d1e3434c836c 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -721,6 +721,14 @@ public void sendLaunchEvent() {
}
@SuppressWarnings("unchecked") // dispatcher not typed
+ @Override
+ public void sendCleanupEvent() {
+ // Hand a CLEANUP_CONTAINER launcher event for this container to the dispatcher
+ ContainersLauncherEvent cleanupEvent = new ContainersLauncherEvent(
+ this, ContainersLauncherEventType.CLEANUP_CONTAINER);
+ dispatcher.getEventHandler().handle(cleanupEvent);
+ }
+
+ @SuppressWarnings("unchecked") // dispatcher not typed
private void sendScheduleEvent() {
dispatcher.getEventHandler().handle(
new ContainerSchedulerEvent(this,
@@ -1461,12 +1469,22 @@ public void transition(ContainerImpl container, ContainerEvent event) {
@SuppressWarnings("unchecked")
@Override
public void transition(ContainerImpl container, ContainerEvent event) {
+ ContainerKillEvent killEvent = (ContainerKillEvent)event;
+ // Debug information is only collected when the container is killed for
+ // exceeding its virtual or physical memory limit.
+ boolean isDebugInfoCollectionSupported =
+ killEvent.getContainerExitStatus() == ContainerExitStatus
+ .KILLED_EXCEEDED_VMEM ||
+ killEvent.getContainerExitStatus() == ContainerExitStatus
+ .KILLED_EXCEEDED_PMEM;
+ // DEBUG_AND_CLEANUP_CONTAINER makes the launcher save debug information
+ // first and then send the regular cleanup event itself.
+ ContainersLauncherEventType nextEvent =
+ isDebugInfoCollectionSupported ?
+ ContainersLauncherEventType.DEBUG_AND_CLEANUP_CONTAINER :
+ ContainersLauncherEventType.CLEANUP_CONTAINER;
+
// Kill the process/process-grp
container.setIsReInitializing(false);
container.dispatcher.getEventHandler().handle(
new ContainersLauncherEvent(container,
- ContainersLauncherEventType.CLEANUP_CONTAINER));
- ContainerKillEvent killEvent = (ContainerKillEvent) event;
+ nextEvent));
container.addDiagnostics(killEvent.getDiagnostic(), "\n");
container.exitCode = killEvent.getContainerExitStatus();
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
index 823457ffc59b714202700cf3f0ec6cc8ea549450..77db99433dde8f2183fccb765c6a7885abfa5e6a 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java
@@ -34,6 +34,7 @@
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log;
@@ -651,6 +652,76 @@ public void cleanupContainer() throws IOException {
}
/**
+ * Save debug information for a container that is about to be cleaned up.
+ * Runs the configured command (for example a heap dump) against the
+ * container process. The caller triggers the actual cleanup afterwards.
+ */
+ void cleanupSavingDebugInfo() {
+ // get process id from pid file or shell
+ String processId = null;
+ if (pidFilePath != null) {
+ try {
+ processId = getContainerPid(pidFilePath);
+ } catch (Exception e) {
+ LOG.warn("Could not collect PID information for debugging ", e);
+ }
+ }
+
+ // Check if debug info collection is enabled
+ String command = null;
+ Integer timeoutSec = null;
+ if (conf != null) {
+ command = conf.get(YarnConfiguration.NM_SAVE_DEBUG_INFO_COMMAND,
+ YarnConfiguration.DEFAULT_NM_SAVE_DEBUG_INFO_COMMAND);
+ timeoutSec =
+ conf.getInt(YarnConfiguration.NM_SAVE_DEBUG_INFO_TIMEOUT_SEC,
+ YarnConfiguration.DEFAULT_NM_SAVE_DEBUG_INFO_TIMEOUT_SEC);
+ }
+
+ if (command == null || command.isEmpty() ||
+ processId == null || processId.isEmpty()) {
+ // No command configured or no known container process: nothing to do
+ return;
+ }
+
+ // Replace current PID and log directory in the command
+ String specializedCommand = command
+ .replace("{{PID}}", processId)
+ .replace("{{LOG_DIR}}", container.getLogDir());
+
+ // Build process as a shell script
+ String[] args = new String[3];
+ args[0] = Shell.WINDOWS ? "cmd" : "sh";
+ args[1] = Shell.WINDOWS ? "/c" : "-c";
+ args[2] = specializedCommand;
+
+ // Run collection command
+ ProcessBuilder builder = new ProcessBuilder();
+ builder.command(args);
+ builder.directory(new File(container.getWorkDir()));
+ Process process;
+ try {
+ process = builder.start();
+ } catch (IOException e) {
+ LOG.info("Could not start collecting debug information ", e);
+ // Nothing was started, so there is no process to wait for or destroy.
+ // Returning here avoids a NullPointerException in the wait below.
+ return;
+ }
+
+ // Wait for collection command
+ try {
+ if (!process.waitFor(timeoutSec, TimeUnit.SECONDS)) {
+ // Timeout expired
+ process.destroy();
+ LOG.info("Collection of debug information did not finish in "
+ + timeoutSec.toString() + " seconds. Collection killed.");
+ }
+ } catch (InterruptedException e) {
+ LOG.info("Collection of debug information interrupted.");
+ process.destroy();
+ }
+ }
+
+ /**
* Send a signal to the container.
*
*
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
index d4a7bfdacf03d9dfbef87c0ee7c88e8f36d56bc3..cf589afd533492a033295842b10f0eeb1361fca1 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncher.java
@@ -107,69 +107,78 @@ public void handle(ContainersLauncherEvent event) {
Container container = event.getContainer();
ContainerId containerId = container.getContainerId();
switch (event.getType()) {
- case LAUNCH_CONTAINER:
- Application app =
+ case LAUNCH_CONTAINER:
+ Application app =
context.getApplications().get(
+ containerId.getApplicationAttemptId().getApplicationId());
+
+ ContainerLaunch launch =
+ new ContainerLaunch(context, getConfig(), dispatcher, exec, app,
+ event.getContainer(), dirsHandler, containerManager);
+ containerLauncher.submit(launch);
+ running.put(containerId, launch);
+ break;
+ case RELAUNCH_CONTAINER:
+ app = context.getApplications().get(
containerId.getApplicationAttemptId().getApplicationId());
- ContainerLaunch launch =
- new ContainerLaunch(context, getConfig(), dispatcher, exec, app,
+ ContainerRelaunch relaunch =
+ new ContainerRelaunch(context, getConfig(), dispatcher, exec, app,
event.getContainer(), dirsHandler, containerManager);
- containerLauncher.submit(launch);
- running.put(containerId, launch);
- break;
- case RELAUNCH_CONTAINER:
- app = context.getApplications().get(
- containerId.getApplicationAttemptId().getApplicationId());
-
- ContainerRelaunch relaunch =
- new ContainerRelaunch(context, getConfig(), dispatcher, exec, app,
- event.getContainer(), dirsHandler, containerManager);
- containerLauncher.submit(relaunch);
- running.put(containerId, relaunch);
- break;
- case RECOVER_CONTAINER:
- app = context.getApplications().get(
- containerId.getApplicationAttemptId().getApplicationId());
- launch = new RecoveredContainerLaunch(context, getConfig(), dispatcher,
- exec, app, event.getContainer(), dirsHandler, containerManager);
- containerLauncher.submit(launch);
- running.put(containerId, launch);
- break;
- case CLEANUP_CONTAINER:
- case CLEANUP_CONTAINER_FOR_REINIT:
- ContainerLaunch launcher = running.remove(containerId);
- if (launcher == null) {
- // Container not launched. So nothing needs to be done.
- return;
- }
-
- // Cleanup a container whether it is running/killed/completed, so that
- // no sub-processes are alive.
- try {
- launcher.cleanupContainer();
- } catch (IOException e) {
- LOG.warn("Got exception while cleaning container " + containerId
- + ". Ignoring.");
- }
- break;
- case SIGNAL_CONTAINER:
- SignalContainersLauncherEvent signalEvent =
- (SignalContainersLauncherEvent) event;
- ContainerLaunch runningContainer = running.get(containerId);
- if (runningContainer == null) {
- // Container not launched. So nothing needs to be done.
- LOG.info("Container " + containerId + " not running, nothing to signal.");
- return;
- }
-
- try {
- runningContainer.signalContainer(signalEvent.getCommand());
- } catch (IOException e) {
- LOG.warn("Got exception while signaling container " + containerId
- + " with command " + signalEvent.getCommand());
- }
- break;
+ containerLauncher.submit(relaunch);
+ running.put(containerId, relaunch);
+ break;
+ case RECOVER_CONTAINER:
+ app = context.getApplications().get(
+ containerId.getApplicationAttemptId().getApplicationId());
+ launch = new RecoveredContainerLaunch(context, getConfig(), dispatcher,
+ exec, app, event.getContainer(), dirsHandler, containerManager);
+ containerLauncher.submit(launch);
+ running.put(containerId, launch);
+ break;
+ case DEBUG_AND_CLEANUP_CONTAINER:
+ ContainerLaunch launcherToDebug = running.get(containerId);
+ if (launcherToDebug != null) {
+ // Save debug information like heap dump
+ launcherToDebug.cleanupSavingDebugInfo();
+ }
+ // Clean up the container as the next step
+ container.sendCleanupEvent();
+ break;
+ case CLEANUP_CONTAINER:
+ case CLEANUP_CONTAINER_FOR_REINIT:
+ ContainerLaunch launcher = running.remove(containerId);
+ if (launcher == null) {
+ // Container not launched. So nothing needs to be done.
+ return;
+ }
+
+ // Cleanup a container whether it is running/killed/completed, so that
+ // no sub-processes are alive.
+ try {
+ launcher.cleanupContainer();
+ } catch (IOException e) {
+ LOG.warn("Got exception while cleaning container " + containerId
+ + ". Ignoring.");
+ }
+ break;
+ case SIGNAL_CONTAINER:
+ SignalContainersLauncherEvent signalEvent =
+ (SignalContainersLauncherEvent) event;
+ ContainerLaunch runningContainer = running.get(containerId);
+ if (runningContainer == null) {
+ // Container not launched. So nothing needs to be done.
+ LOG.info("Container " + containerId + " not running, nothing to signal.");
+ return;
+ }
+
+ try {
+ runningContainer.signalContainer(signalEvent.getCommand());
+ } catch (IOException e) {
+ LOG.warn("Got exception while signaling container " + containerId
+ + " with command " + signalEvent.getCommand());
+ }
+ break;
}
}
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncherEventType.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncherEventType.java
index 380a032ca78f0d5c6e9d2a3c34a484348e932a87..4444edbf80c3022c6d0206d1fbcee94dad29432b 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncherEventType.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainersLauncherEventType.java
@@ -23,6 +23,7 @@
RELAUNCH_CONTAINER,
RECOVER_CONTAINER,
CLEANUP_CONTAINER, // The process(grp) itself.
+ DEBUG_AND_CLEANUP_CONTAINER, // Save debug info, then clean up.
CLEANUP_CONTAINER_FOR_REINIT, // The process(grp) itself.
SIGNAL_CONTAINER,
}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
index be6eadba0a34e51b4d3265ffef23292c7cd9f75f..b54d8f94b960a00a0ea0bb0c66ad1c3dcce72098 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
@@ -21,6 +21,7 @@
import static org.apache.hadoop.test.PlatformAssumptions.assumeWindows;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@@ -44,6 +45,7 @@
import java.util.jar.Manifest;
import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
@@ -910,9 +912,16 @@ public void testAuxiliaryServiceHelper() throws Exception {
AuxiliaryServiceHelper.getServiceDataFromEnv(serviceName, env));
}
- private void internalKillTest(boolean delayed) throws Exception {
+ /**
+ * Run a container kill test.
+ * @param delayed Delayed stop
+ * @param limitViolation Simulate memory limit violation
+ * @throws Exception An exception has occurred
+ */
+ private void internalKillTest(boolean delayed, boolean limitViolation)
+ throws Exception {
conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS,
- delayed ? 1000 : 0);
+ delayed ? 1000 : 0);
containerManager.start();
// ////// Construct the Container-id
@@ -988,17 +997,23 @@ private void internalKillTest(boolean delayed) throws Exception {
Assert.assertTrue("ProcessStartFile doesn't exist!",
processStartFile.exists());
+ Container container = containerManager.getContext().getContainers()
+ .get(cId);
NMContainerStatus nmContainerStatus =
- containerManager.getContext().getContainers().get(cId)
- .getNMContainerStatus();
+ container.getNMContainerStatus();
Assert.assertEquals(priority, nmContainerStatus.getPriority());
// Now test the stop functionality.
List containerIds = new ArrayList();
containerIds.add(cId);
- StopContainersRequest stopRequest =
- StopContainersRequest.newInstance(containerIds);
- containerManager.stopContainers(stopRequest);
+
+ if (limitViolation) {
+ container.sendKillEvent(ContainerExitStatus.KILLED_EXCEEDED_VMEM, "");
+ } else {
+ StopContainersRequest stopRequest =
+ StopContainersRequest.newInstance(containerIds);
+ containerManager.stopContainers(stopRequest);
+ }
BaseContainerManagerTest.waitForContainerState(containerManager, cId,
ContainerState.COMPLETE);
@@ -1011,7 +1026,10 @@ private void internalKillTest(boolean delayed) throws Exception {
ContainerStatus containerStatus =
containerManager.getContainerStatuses(gcsRequest)
.getContainerStatuses().get(0);
- Assert.assertEquals(ContainerExitStatus.KILLED_BY_APPMASTER,
+ Assert.assertEquals(
+ limitViolation
+ ? ContainerExitStatus.KILLED_EXCEEDED_VMEM
+ : ContainerExitStatus.KILLED_BY_APPMASTER,
containerStatus.getExitStatus());
// Now verify the contents of the file. Script generates a message when it
@@ -1044,12 +1062,96 @@ private void internalKillTest(boolean delayed) throws Exception {
@Test (timeout = 30000)
public void testDelayedKill() throws Exception {
- internalKillTest(true);
+ internalKillTest(true, false);
}
@Test (timeout = 30000)
public void testImmediateKill() throws Exception {
- internalKillTest(false);
+ internalKillTest(false, false);
+ }
+
+ /**
+ * Setup collection of debug information on container kill.
+ * @return the path to a canary file showing that the script ran.
+ */
+ private File setupCollectDebugInformation(String command) {
+ File canary = new File("canary.txt");
+ assertTrue("Canary should be deleted",
+ !canary.exists() || canary.delete());
+ // Callers pass a command ending in ">", so append the redirect target.
+ // replace("", path) would insert the path between every character.
+ conf.set(YarnConfiguration.NM_SAVE_DEBUG_INFO_COMMAND,
+ command + canary.getAbsolutePath());
+ return canary;
+ }
+
+ /**
+ * Cleanup collection of debug information on container kill.
+ */
+ private void cleanupCollectDebugInformation(File canary) {
+ // Short-circuit: delete() is attempted only when the file is present
+ boolean removed = !canary.exists() || canary.delete();
+ assertTrue("Canary should be deleted", removed);
+ }
+
+
+ /**
+ * Test whether the debug hook on container kill receives the PID.
+ * @throws Exception An error occurred
+ */
+ @Test (timeout = 30000)
+ public void testImmediateKillWithDebugInfoCollectionPid() throws Exception {
+ File canary = setupCollectDebugInformation(
+ "echo {{PID}} >");
+ try {
+ internalKillTest(false, true);
+
+ Assert.assertTrue("The script should have run", canary.exists());
+ String pidString = FileUtils.readFileToString(canary);
+ int pid = Integer.parseInt(pidString.trim());
+ Assert.assertNotEquals(0, pid);
+ } finally {
+ // Remove the canary even when an assertion above fails
+ cleanupCollectDebugInformation(canary);
+ }
+ }
+
+ /**
+ * Test whether the debug hook on container kill receives the log dir.
+ * @throws Exception An error occurred
+ */
+ @Test (timeout = 30000)
+ public void testImmediateKillWithDebugInfoCollectionLogDir()
+ throws Exception {
+ conf.set(YarnConfiguration.NM_LOG_DIRS,
+ new File(".").getAbsolutePath());
+ File canary = setupCollectDebugInformation(
+ "echo {{LOG_DIR}} >");
+ try {
+ internalKillTest(false, true);
+
+ Assert.assertTrue("The script should have run", canary.exists());
+ String dirString = FileUtils.readFileToString(canary);
+ File logDir = new File(dirString.trim());
+ Assert.assertTrue(
+ "We should get the expected log dir",
+ logDir.getAbsolutePath().startsWith(localLogDir.getAbsolutePath()));
+ } finally {
+ // Remove the canary even when an assertion above fails
+ cleanupCollectDebugInformation(canary);
+ }
+ }
+
+ /**
+ * Test whether the debug hook on container kill is stopped after its timeout.
+ * @throws Exception An error occurred
+ */
+ @Test (timeout = 30000)
+ public void testImmediateKillWithDebugInfoCollectionTimeout()
+ throws Exception {
+ File canary = setupCollectDebugInformation(
+ "sleep 100 && echo {{PID}} {{LOG_DIR}}>");
+ conf.setInt(YarnConfiguration.NM_SAVE_DEBUG_INFO_TIMEOUT_SEC, 1);
+ try {
+ internalKillTest(false, true);
+ Assert.assertFalse("The script should have been interrupted",
+ canary.exists());
+ } finally {
+ // Remove the canary even when the assertion above fails
+ cleanupCollectDebugInformation(canary);
+ }
+ }
@SuppressWarnings("rawtypes")
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
index 686a0d9fdd5c2ca51ac81d4d3415ccf68149d5e6..92f48863275adc827cda2e3d6ab8be01b487c0cb 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
@@ -227,6 +227,11 @@ public void sendLaunchEvent() {
}
@Override
+ public void sendCleanupEvent() {
+ // Intentionally a no-op in this mock.
+ }
+
+ @Override
public void sendKillEvent(int exitStatus, String description) {
}