diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c index 264d63769b9..839c6c6e436 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c @@ -1454,7 +1454,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, if (exit_code != 0) { fprintf(ERRORFILE, "Could not create script path\n"); fflush(ERRORFILE); - goto cleanup; + goto pre_launch_cleanup; } fprintf(LOGFILE, "Creating local dirs...\n"); @@ -1465,7 +1465,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, if (exit_code != 0) { fprintf(ERRORFILE, "Could not create local files and directories %d %d\n", container_file_source, cred_file_source); fflush(ERRORFILE); - goto cleanup; + goto pre_launch_cleanup; } docker_command = construct_docker_command(command_file); @@ -1477,14 +1477,14 @@ int launch_docker_container_as_user(const char * user, const char *app_id, exit_code = OUT_OF_MEMORY; fprintf(ERRORFILE, "Container out of memory"); fflush(ERRORFILE); - goto cleanup; + goto pre_launch_cleanup; } fprintf(LOGFILE, "Changing effective user to root...\n"); if (change_effective_user(0, user_gid) != 0) { fprintf(ERRORFILE, "Could not change to effective users %d, %d\n", 0, user_gid); fflush(ERRORFILE); - goto cleanup; + goto pre_launch_cleanup; } snprintf(docker_command_with_binary, command_size, "%s %s", docker_binary, docker_command); @@ -1497,7 +1497,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, "Could not invoke docker %s.\n", docker_command_with_binary); fflush(ERRORFILE); exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT; - goto cleanup; + goto post_launch_cleanup; } snprintf(docker_inspect_command, command_size, @@ -1514,7 +1514,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, "Could not inspect docker to get pid %s.\n", docker_inspect_command); fflush(ERRORFILE); exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT; - goto cleanup; + goto post_launch_cleanup; } if (pid != 0) { @@ -1529,7 +1529,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, if (strcmp(*cgroup_ptr, "none") != 0 && write_pid_to_cgroup_as_root(*cgroup_ptr, pid) != 0) { exit_code = WRITE_CGROUP_FAILED; - goto cleanup; + goto post_launch_cleanup; } } } @@ -1542,7 +1542,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id, exit_code = WRITE_PIDFILE_FAILED; fprintf(ERRORFILE, "Could not write pid to %s", pid_file); fflush(ERRORFILE); - goto cleanup; + goto post_launch_cleanup; } snprintf(docker_wait_command, command_size, @@ -1588,20 +1588,40 @@ int launch_docker_container_as_user(const char * user, const char *app_id, } } +post_launch_cleanup: + fprintf(LOGFILE, "Removing docker container post-exit...\n"); snprintf(docker_rm_command, command_size, "%s rm %s", docker_binary, container_id); - FILE* rm_docker = popen(docker_rm_command, "w"); - if (pclose (rm_docker) != 0) - { + int i, sleep_time = 1, max_iterations = 5; + for(i = 0; i < max_iterations; i++) { + if(i > 0) { + sleep(sleep_time); + sleep_time *= 2; + } + FILE* rm_docker = popen(docker_rm_command, "w"); + if (rm_docker == 0) { + fprintf (ERRORFILE, + "popen() failed: %s\n", strerror(errno)); + fflush(ERRORFILE); + } + if (pclose (rm_docker) == 0) { + break; + } fprintf (ERRORFILE, - "Could not remove container %s.\n", docker_rm_command); + "pclose() failed: %s\n", strerror(errno)); + fflush(ERRORFILE); + } + + if(i == max_iterations) { + // Tried 5 times and failed. + fprintf (ERRORFILE, + "Could not remove container after 5 tries: %s\n", docker_rm_command); fflush(ERRORFILE); exit_code = UNABLE_TO_EXECUTE_CONTAINER_SCRIPT; - goto cleanup; } -cleanup: +pre_launch_cleanup: if (exit_code_file != NULL && write_exit_code_file_as_nm(exit_code_file, exit_code) < 0) { fprintf (ERRORFILE,