+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.yarn.server.nodemanager.util; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.ContainerInfo; +import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.ContainersInfo; +import org.apache.hadoop.yarn.util.StringHelper; +import org.apache.hadoop.yarn.webapp.util.WebAppUtils; +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.BasicResponseHandler; +import org.apache.http.impl.client.DefaultHttpClient; + +import javax.xml.bind.JAXBException; +import java.io.IOException; + +/** + * Called by container executor, such as winutils or container-executor, to + * check whether the current container is unmanaged by YARN. + * + * Container can become (partially) unmanaged due to: + * 1. For container resource managed by YARN, such as container job object + * and disk data: + * a. NM service is disabled or removed on the node. + * b. NM is unable to start up again on the node, such as when dependent + * configuration or resources are not ready. + * c. NM local leveldb store is corrupted, such as by bad disk sectors. + * d. NM has bugs, such as wrongly marking a live container as complete. + * 2. For container resource unmanaged by YARN: + * a. User breaks away processes from the container job object. + * b. User creates VMs from container job object. + * c. 
User acquires other resources on the machine which are unmanaged by
 *       YARN, such as producing data outside the container folder.
 *
 * The current container will be considered as unmanaged when:
 *   1. NM is dead:
 *      Failed to check whether container is managed by NM within timeout.
 *   2. NM is alive but container is
 *      {@link org.apache.hadoop.yarn.api.records.ContainerState#COMPLETE}
 *      or not found:
 *      The container is
 *      {@link org.apache.hadoop.yarn.api.records.ContainerState#COMPLETE} or
 *      not found in the NM container list.
 */
public class UnmanagedContainerChecker {

  /** Severity used by {@link #log} to choose the console stream and logger call. */
  private enum LogLevel {INFO, WARN, ERROR}

  private static final Log LOG =
      LogFactory.getLog(UnmanagedContainerChecker.class);

  /** Exit code used when monitoring itself threw an error. */
  public static final int FAILED_EXITCODE = 175;

  /** Exit code used when monitoring returned, i.e. the container is unmanaged. */
  public static final int UNMANAGED_EXITCODE = 176;

  /**
   * Entry point. Usage: containerId
   *
   * Process exits if and only if the container of the corresponding
   * containerId is unmanaged by local NM (or monitoring failed).
   *
   * @param argv command line arguments; {@code argv[0]} is the container id
   */
  public static void main(String[] argv) {
    int exitCode;
    try {
      // Fail with an explicit usage error instead of a bare
      // ArrayIndexOutOfBoundsException when the container id is missing.
      if (argv == null || argv.length < 1) {
        throw new IllegalArgumentException(
            "Usage: UnmanagedContainerChecker <containerId>");
      }
      monitorContainer(argv[0]);
      exitCode = UNMANAGED_EXITCODE;
    } catch (Throwable t) {
      log(LogLevel.ERROR, "Failed to monitor container. "
          + "Consider it is unmanaged.", t);
      exitCode = FAILED_EXITCODE;
    }

    log(LogLevel.ERROR, "Exiting with exitcode: " + exitCode);
    System.exit(exitCode);
  }

  /**
   * Polls the NM containers web service until the container is found to be
   * unmanaged, or until consecutive check failures have lasted for at least
   * the configured check timeout.
   *
   * Only throws unrecoverable, non-transient error, so caller is not expected
   * to retry it.
   *
   * @param containerId id of the container to monitor
   */
  private static void monitorContainer(String containerId) {
    Thread.setDefaultUncaughtExceptionHandler(
        new YarnUncaughtExceptionHandler());

    YarnConfiguration conf = new YarnConfiguration();
    long checkIntervalMs = conf.getLong(
        YarnConfiguration.NM_WINDOWS_CONTAINER_UNMANAGED_CONTAINER_CLEANUP_CHECK_INTERVAL_MS,
        YarnConfiguration.DEFAULT_NM_WINDOWS_CONTAINER_UNMANAGED_CONTAINER_CLEANUP_CHECK_INTERVAL_MS);
    long checkTimeoutMs = conf.getLong(
        YarnConfiguration.NM_WINDOWS_CONTAINER_UNMANAGED_CONTAINER_CLEANUP_CHECK_TIMEOUT_MS,
        YarnConfiguration.DEFAULT_NM_WINDOWS_CONTAINER_UNMANAGED_CONTAINER_CLEANUP_CHECK_TIMEOUT_MS);

    String nmContainersUri = StringHelper.pjoin(
        WebAppUtils.getNMWebAppURLWithScheme(conf),
        "ws/v1/node", "containers");

    // -1 means "the last check succeeded"; otherwise the wall-clock time of
    // the first failure in the current run of consecutive failures.
    long failedToCheckStartTimeMs = -1;
    while (true) {
      try {
        boolean isManaged =
            isContainerManagedInNM(nmContainersUri, containerId);

        if (failedToCheckStartTimeMs != -1) {
          log(LogLevel.INFO,
              "Succeeded to check whether container is managed by NM. "
                  + "Check timeout canceled.");
          failedToCheckStartTimeMs = -1;
        }

        if (!isManaged) {
          log(LogLevel.ERROR, "Container is not managed in NM.");
          return;
        }
      } catch (Throwable t) {
        long failedToCheckCurrentTimeMs = System.currentTimeMillis();

        // Record the start of the failure window before logging, so the
        // message never reports the -1 sentinel as a timestamp.
        boolean firstFailure = failedToCheckStartTimeMs == -1;
        if (firstFailure) {
          failedToCheckStartTimeMs = failedToCheckCurrentTimeMs;
        }

        log(LogLevel.WARN,
            "Failed to check whether container is managed by NM "
                + "from time " + failedToCheckStartTimeMs + "ms "
                + "to time " + failedToCheckCurrentTimeMs + "ms. "
                + "Check timeout is " + checkTimeoutMs + "ms.", t);

        // The timeout is only evaluated from the second consecutive failure
        // onwards; the first failure just opens the window.
        if (!firstFailure
            && failedToCheckCurrentTimeMs - failedToCheckStartTimeMs
                >= checkTimeoutMs) {
          log(LogLevel.ERROR,
              "Failed to check whether container is managed by NM "
                  + "within timeout. Consider it is unmanaged.");
          return;
        }
      }

      try {
        Thread.sleep(checkIntervalMs);
      } catch (Throwable t) {
        // Deliberately best-effort: a failed sleep must not stop monitoring.
        log(LogLevel.WARN, "Failed to sleep, ignore it.", t);
      }
    }
  }

  /**
   * Logs a message without an associated throwable.
   *
   * @param level severity of the message
   * @param message message to log
   */
  private static void log(
      LogLevel level, Object message) {
    log(level, message, null);
  }

  /**
   * Writes the message both to the console (prefixed with the current time in
   * milliseconds) and to the class logger.
   *
   * @param level severity; INFO goes to stdout, WARN and ERROR go to stderr
   * @param message message to log
   * @param t throwable passed to the logger, may be null
   */
  private static void log(
      LogLevel level, Object message, Throwable t) {
    String currentTimeMs = "[" + System.currentTimeMillis() + "ms]: ";
    String printMessage = currentTimeMs + message;

    switch (level) {
      case INFO:
        // Logs into console for diagnostics
        System.out.println(printMessage);
        // Logs into file for reliable debugging
        LOG.info(message, t);
        break;
      case WARN:
        System.err.println(printMessage);
        LOG.warn(message, t);
        break;
      case ERROR:
        System.err.println(printMessage);
        LOG.error(message, t);
        break;
    }
  }

  /**
   * Looks the container up in the list returned by the NM and decides whether
   * NM still manages it.
   *
   * @param nmContainersUri URI of the NM containers web service
   * @param containerId id of the container to look for
   * @return false if the container is not in the list or its state is
   *         considered unmanaged; true otherwise
   * @throws IOException if the request to NM fails
   * @throws JAXBException declared by the underlying response conversion
   */
  private static boolean isContainerManagedInNM(
      String nmContainersUri, String containerId)
      throws IOException, JAXBException {
    ContainersInfo containersInfo = getContainersInfoFromNM(nmContainersUri);
    for (ContainerInfo containerInfo : containersInfo.getContainers()) {
      if (containerInfo.getId().equals(containerId)) {
        return isContainerStateManagedInNM(containerInfo.getState());
      }
    }
    return false;
  }

  /**
   * Decides from the NM container state name whether the container is still
   * managed.
   *
   * @param containerStateStr name of the NM ContainerState
   * @return false if the name cannot be parsed or maps to the API COMPLETE
   *         state; true otherwise
   */
  private static boolean isContainerStateManagedInNM(String containerStateStr) {
    ContainerState containerState;
    try {
      containerState = ContainerState.valueOf(containerStateStr);
    } catch (Exception e) {
      log(LogLevel.WARN, "Failed to parse ContainerState from string to enum: "
          + containerStateStr, e);
      return false;
    }

    // Only considers API Container COMPLETE state as unmanaged, because:
    // 1. Other NM Container States may allow running container process to
    //    exist, such as NEW, LOCALIZING, LOCALIZED.
    // 2. The COMPLETE state is from API which means it is to be reported to the
    //    application, so we can safely consider the container is unmanaged.
    return ContainerState.toApiContainerState(containerState)
        != org.apache.hadoop.yarn.api.records.ContainerState.COMPLETE;
  }

  /**
   * Fetches the container list from the NM web service as JSON and converts
   * it to a {@link ContainersInfo}.
   *
   * Since NM starts its WebServer only if all containers managed by it are
   * recovered, so the returned ContainersInfo is ensured to be complete.
   *
   * @param nmContainersUri URI of the NM containers web service
   * @return the containers reported by NM
   * @throws IOException if the request fails or returns a non-2xx status
   * @throws JAXBException declared by the underlying response conversion
   */
  private static ContainersInfo getContainersInfoFromNM(
      String nmContainersUri) throws IOException, JAXBException {
    // NOTE(review): no connect/socket timeout is configured here, so a hung
    // NM could presumably block this call indefinitely - confirm and consider
    // setting timeouts.
    DefaultHttpClient client = new DefaultHttpClient();
    try {
      HttpGet httpGet = new HttpGet(nmContainersUri);
      httpGet.setHeader("Accept", "application/json");

      HttpResponse response = client.execute(httpGet);

      int statusCode = response.getStatusLine().getStatusCode();
      if (statusCode < 200 || statusCode > 299) {
        throw new IOException(
            "Http request failed with status code: " + statusCode);
      }

      String jsonContent = new BasicResponseHandler().handleResponse(response);
      return WebAppUtils.jsonToObject(jsonContent, ContainersInfo.class);
    } finally {
      // A new client is created for every poll, so it must be released on
      // every path; otherwise connections accumulate while the loop runs.
      client.getConnectionManager().shutdown();
    }
  }
}