diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java index e0ada45..e7c7d52 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java @@ -64,13 +64,13 @@ import org.slf4j.LoggerFactory; public class LlapStatusServiceDriver { + private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class); + private static final Logger CONSOLE_LOGGER = LoggerFactory.getLogger("LlapStatusServiceDriverConsole"); private static final EnumSet NO_YARN_SERVICE_INFO_STATES = EnumSet.of( State.APP_NOT_FOUND, State.COMPLETE, State.LAUNCHING); private static final EnumSet LAUNCHING_STATES = EnumSet.of( State.LAUNCHING, State.RUNNING_PARTIAL, State.RUNNING_ALL); - private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class); - private static final Logger CONSOLE_LOGGER = LoggerFactory.getLogger("LlapStatusServiceDriverConsole"); // Defining a bunch of configs here instead of in HiveConf. These are experimental, and mainly // for use when retry handling is fixed in Yarn/Hadoop @@ -80,18 +80,15 @@ // The following two keys should ideally be used to control RM connect timeouts. However, // they don't seem to work. The IPC timeout needs to be set instead. @InterfaceAudience.Private - private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS = - CONF_PREFIX + "yarn.rm.connect.max-wait-ms"; + private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS = CONF_PREFIX + "yarn.rm.connect.max-wait-ms"; private static final long CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT = 10000l; @InterfaceAudience.Private - private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS = - CONF_PREFIX + "yarn.rm.connect.retry-interval.ms"; + private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS = CONF_PREFIX + "yarn.rm.connect.retry-interval.ms"; private static final long CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT = 5000l; // As of Hadoop 2.7 - this is what controls the RM timeout. @InterfaceAudience.Private - private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES = - CONF_PREFIX + "ipc.client.max-retries"; + private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES = CONF_PREFIX + "ipc.client.max-retries"; private static final int CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT = 2; @InterfaceAudience.Private private static final String CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS = @@ -107,16 +104,15 @@ private static final String CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT = "2000, 1"; - private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS = - CONF_PREFIX + "zk-registry.timeout-ms"; + private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS = CONF_PREFIX + "zk-registry.timeout-ms"; private static final long CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT = 20000l; private static final long LOG_SUMMARY_INTERVAL = 15000L; // Log summary every ~15 seconds. - private static final String LLAP_KEY = "llap"; + private final Configuration conf; - private final Clock clock = new SystemClock(); private String appName = null; + private String applicationId = null; private ServiceClient serviceClient = null; private Configuration llapRegistryConf = null; private LlapRegistryService llapRegistry = null; @@ -145,21 +141,17 @@ private void setupConf() { CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT)); conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, - conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS, - CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT)); + conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS, CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT)); conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, conf.getLong(CONFIG_YARN_RM_RETRY_INTERVAL_MS, CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT)); conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, - conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES, - CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT)); + conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES, CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT)); conf.setLong(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_RETRY_INTERVAL_KEY, - conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS, - CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT)); + conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS, CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT)); - HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, (conf - .getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS, CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT) + - "ms")); + HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, ( + conf.getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS, CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT) + "ms")); llapRegistryConf = new Configuration(conf); } @@ -167,7 +159,6 @@ private void setupConf() { /** * Parse command line options. * - * @param args * @return command line options. */ public LlapStatusOptions parseOptions(String[] args) throws @@ -211,9 +202,7 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { LOG.info(message); return ExitCode.INCORRECT_USAGE.getInt(); } - if (LOG.isDebugEnabled()) { - LOG.debug("Using appName: {}", appName); - } + LOG.debug("Using appName: {}", appName); llapRegistryConf.set(HiveConf.ConfVars.LLAP_DAEMON_SERVICE_HOSTS.varname, "@" + appName); } @@ -224,8 +213,7 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { } } catch (Exception e) { LlapStatusCliException le = new LlapStatusCliException( - LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED, - "Failed to create service client", e); + LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED, "Failed to create service client", e); logError(le); return le.getExitCode().getInt(); } @@ -233,8 +221,7 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { // Get the App report from YARN ApplicationReport appReport; try { - appReport = LlapSliderUtils.getAppReport(appName, serviceClient, - options.getFindAppTimeoutMs()); + appReport = LlapSliderUtils.getAppReport(appName, serviceClient, options.getFindAppTimeoutMs()); } catch (LlapStatusCliException e) { logError(e); return e.getExitCode().getInt(); @@ -256,11 +243,9 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { } else { // Get information from YARN Service try { - ret = populateAppStatusFromServiceStatus(appName, serviceClient, - appStatusBuilder); + ret = populateAppStatusFromServiceStatus(appName, serviceClient, appStatusBuilder); } catch (LlapStatusCliException e) { - // In case of failure, send back whatever is constructed so far - - // which would be from the AppReport + // In case of failure, send back whatever is constructed so far - which would be from the AppReport logError(e); return e.getExitCode().getInt(); } @@ -279,9 +264,7 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { return ret.getInt(); } finally { - if (LOG.isDebugEnabled()) { - LOG.debug("Final AppState: " + appStatusBuilder.toString()); - } + LOG.debug("Final AppState: " + appStatusBuilder.toString()); } } @@ -295,28 +278,26 @@ public void outputJson(PrintWriter writer) throws writer.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(appStatusBuilder)); } catch (IOException e) { LOG.warn("Failed to create JSON", e); - throw new LlapStatusCliException(ExitCode.LLAP_JSON_GENERATION_ERROR, "Failed to create JSON", - e); + throw new LlapStatusCliException(ExitCode.LLAP_JSON_GENERATION_ERROR, "Failed to create JSON", e); } } /** * Populates parts of the AppStatus * - * @param appReport - * @param appStatusBuilder * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible * @throws LlapStatusCliException */ - private ExitCode processAppReport(ApplicationReport appReport, - AppStatusBuilder appStatusBuilder) throws - LlapStatusCliException { + private ExitCode processAppReport(ApplicationReport appReport, AppStatusBuilder appStatusBuilder) + throws LlapStatusCliException { if (appReport == null) { appStatusBuilder.setState(State.APP_NOT_FOUND); LOG.info("No Application Found"); return ExitCode.SUCCESS; } + applicationId = appReport.getApplicationId().toString(); + // TODO Maybe add the YARN URL for the app. appStatusBuilder.setAmInfo( new LlapStatusHelpers.AmInfo().setAppName(appReport.getName()).setAppType(appReport.getApplicationType())); @@ -328,17 +309,17 @@ private ExitCode processAppReport(ApplicationReport appReport, appStatusBuilder.setState(State.LAUNCHING); return ExitCode.SUCCESS; case ACCEPTED: - appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString()); + appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId); appStatusBuilder.setState(State.LAUNCHING); return ExitCode.SUCCESS; case RUNNING: - appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString()); + appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId); // If the app state is running, get additional information from YARN Service return ExitCode.SUCCESS; case FINISHED: case FAILED: case KILLED: - appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString()); + appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId); appStatusBuilder.setAppFinishTime(appReport.getFinishTime()); appStatusBuilder.setState(State.COMPLETE); // add log links and other diagnostics from YARN Service @@ -352,33 +333,23 @@ private ExitCode processAppReport(ApplicationReport appReport, /** * Populates information from YARN Service Status. * - * @param appName - * @param serviceClient - * @param appStatusBuilder - * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future - * progress not possible + * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible * @throws LlapStatusCliException */ - private ExitCode populateAppStatusFromServiceStatus(String appName, - ServiceClient serviceClient, AppStatusBuilder appStatusBuilder) - throws LlapStatusCliException { + private ExitCode populateAppStatusFromServiceStatus(String appName, ServiceClient serviceClient, + AppStatusBuilder appStatusBuilder) throws LlapStatusCliException { ExitCode exitCode = ExitCode.YARN_ERROR; try { Service service = serviceClient.getStatus(appName); if (service != null) { // How to get config paths and AmInfo ServiceState state = service.getState(); - appStatusBuilder.setAppStartTime(service.getLaunchTime() == null ? 0 - : service.getLaunchTime().getTime()); - appStatusBuilder.setDesiredInstances( - service.getComponent(LLAP_KEY).getNumberOfContainers() == null ? 0 - : service.getComponent(LLAP_KEY).getNumberOfContainers() - .intValue()); - appStatusBuilder.setLiveInstances( - service.getComponent(LLAP_KEY).getContainers().size()); + appStatusBuilder.setAppStartTime(service.getLaunchTime() == null ? 0 : service.getLaunchTime().getTime()); + appStatusBuilder.setDesiredInstances(service.getComponent(LLAP_KEY).getNumberOfContainers() == null ? 0 + : service.getComponent(LLAP_KEY).getNumberOfContainers().intValue()); + appStatusBuilder.setLiveInstances(service.getComponent(LLAP_KEY).getContainers().size()); for (Container cont : service.getComponent(LLAP_KEY).getContainers()) { - LlapInstance llapInstance = new LlapInstance(cont.getHostname(), - cont.getId()); + LlapInstance llapInstance = new LlapInstance(cont.getHostname(), cont.getId()); appStatusBuilder.addNewRunningLlapInstance(llapInstance); } if (state == ServiceState.STABLE) { @@ -389,8 +360,7 @@ private ExitCode populateAppStatusFromServiceStatus(String appName, } } catch (IOException | YarnException e) { LlapStatusCliException le = new LlapStatusCliException( - LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_OTHER, - "Failed to get service status", e); + LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_OTHER, "Failed to get service status", e); logError(le); exitCode = le.getExitCode(); } @@ -400,13 +370,11 @@ private ExitCode populateAppStatusFromServiceStatus(String appName, /** * Populate additional information for containers from the LLAP registry. Must be invoked * after YARN Service status and diagnostics. - * @param appStatusBuilder * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible * @throws LlapStatusCliException */ - private ExitCode populateAppStatusFromLlapRegistry( - AppStatusBuilder appStatusBuilder, long watchTimeoutMs) throws - LlapStatusCliException { + private ExitCode populateAppStatusFromLlapRegistry(AppStatusBuilder appStatusBuilder, long watchTimeoutMs) + throws LlapStatusCliException { if (llapRegistry == null) { try { @@ -425,9 +393,7 @@ private ExitCode populateAppStatusFromLlapRegistry( } if (serviceInstances == null || serviceInstances.isEmpty()) { - if (LOG.isDebugEnabled()) { - LOG.debug("No information found in the LLAP registry"); - } + LOG.debug("No information found in the LLAP registry"); appStatusBuilder.setLiveInstances(0); appStatusBuilder.setState(State.LAUNCHING); appStatusBuilder.clearRunningLlapInstances(); @@ -461,8 +427,8 @@ private ExitCode populateAppStatusFromLlapRegistry( appStatusBuilder.setLiveInstances(validatedInstances.size()); appStatusBuilder.setLaunchingInstances(llapExtraInstances.size()); - if (appStatusBuilder.getDesiredInstances() != null && validatedInstances - .size() >= appStatusBuilder.getDesiredInstances()) { + if (appStatusBuilder.getDesiredInstances() != null && + validatedInstances.size() >= appStatusBuilder.getDesiredInstances()) { appStatusBuilder.setState(State.RUNNING_ALL); if (validatedInstances.size() > appStatusBuilder.getDesiredInstances()) { LOG.warn("Found more entries in LLAP registry, as compared to desired entries"); @@ -493,107 +459,13 @@ private ExitCode populateAppStatusFromLlapRegistry( return ExitCode.SUCCESS; } - private static String constructCompletedContainerDiagnostics(List completedInstances) { - StringBuilder sb = new StringBuilder(); - if (completedInstances == null || completedInstances.size() == 0) { - return ""; - } else { - // TODO HIVE-15865 Ideally sort these by completion time, once that is available. - boolean isFirst = true; - for (LlapInstance instance : completedInstances) { - if (!isFirst) { - sb.append("\n"); - } else { - isFirst = false; - } - - if (instance.getYarnContainerExitStatus() == - ContainerExitStatus.KILLED_EXCEEDED_PMEM || - instance.getYarnContainerExitStatus() == - ContainerExitStatus.KILLED_EXCEEDED_VMEM) { - sb.append("\tKILLED container (by YARN for exceeding memory limits): "); - } else { - // TODO HIVE-15865 Handle additional reasons like OS launch failed - sb.append("\tFAILED container: "); - } - sb.append(" ").append(instance.getContainerId()); - sb.append(", Logs at: ").append(instance.getLogUrl()); - } - } - return sb.toString(); - } - - /** - * Helper method to construct a diagnostic message from a complete - * AppStatusBuilder. - * - * @return - */ - private static String constructDiagnostics( - AppStatusBuilder appStatusBuilder) { - StringBuilder sb = new StringBuilder(); - - switch (appStatusBuilder.getState()) { - case APP_NOT_FOUND: - sb.append("LLAP status unknown. Awaiting app launch"); - break; - case LAUNCHING: - // This is a catch all state - when containers have not started yet, or LLAP has not started yet. - if (StringUtils.isNotBlank(appStatusBuilder.getAmInfo().getAppId())) { - sb.append("LLAP Starting up with AppId=") - .append(appStatusBuilder.getAmInfo().getAppId()).append("."); - if (appStatusBuilder.getDesiredInstances() != null) { - sb.append(" Started 0/").append(appStatusBuilder.getDesiredInstances()).append(" instances"); - } - - String containerDiagnostics = constructCompletedContainerDiagnostics( - appStatusBuilder.getCompletedInstances()); - if (StringUtils.isNotEmpty(containerDiagnostics)) { - sb.append("\n").append(containerDiagnostics); - } - } else { - sb.append("Awaiting LLAP startup"); - } - break; - case RUNNING_PARTIAL: - sb.append("LLAP Starting up with ApplicationId=") - .append(appStatusBuilder.getAmInfo().getAppId()); - sb.append(" Started").append(appStatusBuilder.getLiveInstances()) - .append("/").append(appStatusBuilder.getDesiredInstances()) - .append(" instances"); - String containerDiagnostics = constructCompletedContainerDiagnostics( - appStatusBuilder.getCompletedInstances()); - if (StringUtils.isNotEmpty(containerDiagnostics)) { - sb.append("\n").append(containerDiagnostics); - } - - // TODO HIVE-15865: Include information about pending requests, and last - // allocation time once YARN Service provides this information. - break; - case RUNNING_ALL: - sb.append("LLAP Application running with ApplicationId=") - .append(appStatusBuilder.getAmInfo().getAppId()); - break; - case COMPLETE: - - sb.append("LLAP Application already complete. ApplicationId=") - .append(appStatusBuilder.getAmInfo().getAppId()); - containerDiagnostics = constructCompletedContainerDiagnostics( - appStatusBuilder.getCompletedInstances()); - if (StringUtils.isNotEmpty(containerDiagnostics)) { - sb.append("\n").append(containerDiagnostics); - } - - break; - case UNKNOWN: - sb.append("LLAP status unknown"); - break; + private void close() { + if (serviceClient != null) { + serviceClient.stop(); } - if (StringUtils.isNotBlank(appStatusBuilder.getDiagnostics())) { - sb.append("\n").append(appStatusBuilder.getDiagnostics()); + if (llapRegistry != null) { + llapRegistry.stop(); } - - return sb.toString(); } public enum ExitCode { @@ -618,18 +490,16 @@ public int getInt() { } } - public static class LlapStatusCliException extends Exception { final LlapStatusServiceDriver.ExitCode exitCode; - public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message) { super(exitCode.getInt() +": " + message); this.exitCode = exitCode; } public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message, Throwable cause) { - super(message, cause); + super(exitCode.getInt() +": " + message, cause); this.exitCode = exitCode; } @@ -638,18 +508,10 @@ public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String } } - - private static void logError(Throwable t) { - LOG.error("FAILED: " + t.getMessage(), t); - System.err.println("FAILED: " + t.getMessage()); - } - - public static void main(String[] args) { LOG.info("LLAP status invoked with arguments = {}", Arrays.toString(args)); int ret = ExitCode.SUCCESS.getInt(); Clock clock = SystemClock.getInstance(); - long startTime = clock.getTime(); long lastSummaryLogTime = -1; LlapStatusServiceDriver statusServiceDriver = null; @@ -661,8 +523,7 @@ public static void main(String[] args) { statusServiceDriver.close(); logError(t); if (t instanceof LlapStatusCliException) { - LlapStatusCliException - ce = (LlapStatusCliException) t; + LlapStatusCliException ce = (LlapStatusCliException) t; ret = ce.getExitCode().getInt(); } else { ret = ExitCode.INTERNAL_ERROR.getInt(); @@ -712,9 +573,8 @@ public static void main(String[] args) { ret = statusServiceDriver.run(options, watchMode ? watchTimeout : 0); currentState = statusServiceDriver.appStatusBuilder.getState(); try { - lastSummaryLogTime = LlapStatusServiceDriver - .maybeLogSummary(clock, lastSummaryLogTime, statusServiceDriver, - watchMode, watchTimeout, launchingState); + lastSummaryLogTime = LlapStatusServiceDriver.maybeLogSummary(clock, lastSummaryLogTime, + statusServiceDriver, watchMode, watchTimeout, launchingState); } catch (Exception e) { LOG.warn("Failed to log summary", e); } @@ -731,8 +591,7 @@ public static void main(String[] args) { if (currentState.equals(State.COMPLETE)) { if (launchingState != null || options.isLaunched()) { LOG.warn("COMPLETE state reached while waiting for RUNNING state. Failing."); - System.err.println("Final diagnostics: " + - statusServiceDriver.appStatusBuilder.getDiagnostics()); + System.err.println("Final diagnostics: " + statusServiceDriver.appStatusBuilder.getDiagnostics()); break; } else { LOG.info("Found a stopped application; assuming it was a previous attempt " @@ -740,19 +599,15 @@ public static void main(String[] args) { } } - if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals( - State.RUNNING_ALL))) { - if (LOG.isDebugEnabled()) { - LOG.debug( - "Current state: {}. Desired state: {}. {}/{} instances.", - currentState, - runningNodesThreshold == 1.0f ? - State.RUNNING_ALL : - State.RUNNING_PARTIAL, - statusServiceDriver.appStatusBuilder.getLiveInstances(), - statusServiceDriver.appStatusBuilder - .getDesiredInstances()); - } + if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals(State.RUNNING_ALL))) { + LOG.debug( + "Current state: {}. Desired state: {}. {}/{} instances.", + currentState, + runningNodesThreshold == 1.0f ? + State.RUNNING_ALL : + State.RUNNING_PARTIAL, + statusServiceDriver.appStatusBuilder.getLiveInstances(), + statusServiceDriver.appStatusBuilder.getDesiredInstances()); numAttempts--; continue; } @@ -763,17 +618,13 @@ public static void main(String[] args) { if (desiredInstances > 0) { final float ratio = (float) liveInstances / (float) desiredInstances; if (ratio < runningNodesThreshold) { - if (LOG.isDebugEnabled()) { - LOG.debug( - "Waiting until running nodes threshold is reached. Current: {} Desired: {}." + - " {}/{} instances.", - new DecimalFormat("#.###").format(ratio), - new DecimalFormat("#.###") - .format(runningNodesThreshold), - statusServiceDriver.appStatusBuilder.getLiveInstances(), - statusServiceDriver.appStatusBuilder - .getDesiredInstances()); - } + LOG.debug( + "Waiting until running nodes threshold is reached. Current: {} Desired: {}." + + " {}/{} instances.", + new DecimalFormat("#.###").format(ratio), + new DecimalFormat("#.###").format(runningNodesThreshold), + statusServiceDriver.appStatusBuilder.getLiveInstances(), + statusServiceDriver.appStatusBuilder.getDesiredInstances()); numAttempts--; continue; } else { @@ -805,9 +656,7 @@ public static void main(String[] args) { break; } // Log final state to CONSOLE_LOGGER - LlapStatusServiceDriver - .maybeLogSummary(clock, 0L, statusServiceDriver, - watchMode, watchTimeout, launchingState); + LlapStatusServiceDriver.maybeLogSummary(clock, 0L, statusServiceDriver, watchMode, watchTimeout, launchingState); CONSOLE_LOGGER.info("\n\n\n"); // print current state before exiting statusServiceDriver.outputJson(pw); @@ -820,30 +669,30 @@ public static void main(String[] args) { } catch (Throwable t) { logError(t); if (t instanceof LlapStatusCliException) { - LlapStatusCliException - ce = (LlapStatusCliException) t; + LlapStatusCliException ce = (LlapStatusCliException) t; ret = ce.getExitCode().getInt(); } else { ret = ExitCode.INTERNAL_ERROR.getInt(); } } finally { LOG.info("LLAP status finished"); + if (ret != ExitCode.SUCCESS.exitCode) { + LOG.error("LLAP did not start. Check the application log for more info:\n" + + "\tyarn logs --applicationId {} -out ", statusServiceDriver.applicationId); + } statusServiceDriver.close(); } - if (LOG.isDebugEnabled()) { - LOG.debug("Completed processing - exiting with " + ret); - } + LOG.debug("Completed processing - exiting with " + ret); + System.exit(ret); } - private static long maybeLogSummary(Clock clock, long lastSummaryLogTime, - LlapStatusServiceDriver statusServiceDriver, + private static long maybeLogSummary(Clock clock, long lastSummaryLogTime, LlapStatusServiceDriver statusServiceDriver, boolean watchMode, long watchTimeout, LlapStatusHelpers.State launchingState) { long currentTime = clock.getTime(); if (lastSummaryLogTime < currentTime - LOG_SUMMARY_INTERVAL) { String diagString = null; - if (launchingState == null && statusServiceDriver.appStatusBuilder.getState() == - State.COMPLETE && watchMode) { + if (launchingState == null && statusServiceDriver.appStatusBuilder.getState() == State.COMPLETE && watchMode) { // First known state was COMPLETED. Wait for the app launch to start. diagString = "Awaiting LLAP launch"; // Clear completed instances in this case. Don't want to provide information from the previous run. @@ -859,23 +708,106 @@ private static long maybeLogSummary(Clock clock, long lastSummaryLogTime, } else { CONSOLE_LOGGER.info("\nLLAPSTATUS"); } - CONSOLE_LOGGER.info( - "--------------------------------------------------------------------------------"); + CONSOLE_LOGGER.info("--------------------------------------------------------------------------------"); } CONSOLE_LOGGER.info(diagString); - CONSOLE_LOGGER.info( - "--------------------------------------------------------------------------------"); + CONSOLE_LOGGER.info("--------------------------------------------------------------------------------"); lastSummaryLogTime = currentTime; } return lastSummaryLogTime; } - private void close() { - if (serviceClient != null) { - serviceClient.stop(); + /** + * Helper method to construct a diagnostic message from a complete AppStatusBuilder. + */ + private static String constructDiagnostics(AppStatusBuilder appStatusBuilder) { + StringBuilder sb = new StringBuilder(); + + switch (appStatusBuilder.getState()) { + case APP_NOT_FOUND: + sb.append("LLAP status unknown. Awaiting app launch"); + break; + case LAUNCHING: + // This is a catch all state - when containers have not started yet, or LLAP has not started yet. + if (StringUtils.isNotBlank(appStatusBuilder.getAmInfo().getAppId())) { + sb.append("LLAP Starting up with AppId=").append(appStatusBuilder.getAmInfo().getAppId()).append("."); + if (appStatusBuilder.getDesiredInstances() != null) { + sb.append(" Started 0/").append(appStatusBuilder.getDesiredInstances()).append(" instances"); + } + + String containerDiagnostics = constructCompletedContainerDiagnostics( + appStatusBuilder.getCompletedInstances()); + if (StringUtils.isNotEmpty(containerDiagnostics)) { + sb.append("\n").append(containerDiagnostics); + } + } else { + sb.append("Awaiting LLAP startup"); + } + break; + case RUNNING_PARTIAL: + sb.append("LLAP Starting up with ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId()); + sb.append(" Started").append(appStatusBuilder.getLiveInstances()).append("/") + .append(appStatusBuilder.getDesiredInstances()).append(" instances"); + String containerDiagnostics = constructCompletedContainerDiagnostics(appStatusBuilder.getCompletedInstances()); + if (StringUtils.isNotEmpty(containerDiagnostics)) { + sb.append("\n").append(containerDiagnostics); + } + + // TODO HIVE-15865: Include information about pending requests, and last + // allocation time once YARN Service provides this information. + break; + case RUNNING_ALL: + sb.append("LLAP Application running with ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId()); + break; + case COMPLETE: + sb.append("LLAP Application already complete. ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId()); + containerDiagnostics = constructCompletedContainerDiagnostics(appStatusBuilder.getCompletedInstances()); + if (StringUtils.isNotEmpty(containerDiagnostics)) { + sb.append("\n").append(containerDiagnostics); + } + + break; + case UNKNOWN: + sb.append("LLAP status unknown"); + break; } - if (llapRegistry != null) { - llapRegistry.stop(); + if (StringUtils.isNotBlank(appStatusBuilder.getDiagnostics())) { + sb.append("\n").append(appStatusBuilder.getDiagnostics()); + } + + return sb.toString(); + } + + private static String constructCompletedContainerDiagnostics(List completedInstances) { + StringBuilder sb = new StringBuilder(); + if (completedInstances == null || completedInstances.size() == 0) { + return ""; + } else { + // TODO HIVE-15865 Ideally sort these by completion time, once that is available. + boolean isFirst = true; + for (LlapInstance instance : completedInstances) { + if (!isFirst) { + sb.append("\n"); + } else { + isFirst = false; + } + + if (instance.getYarnContainerExitStatus() == ContainerExitStatus.KILLED_EXCEEDED_PMEM || + instance.getYarnContainerExitStatus() == ContainerExitStatus.KILLED_EXCEEDED_VMEM) { + sb.append("\tKILLED container (by YARN for exceeding memory limits): "); + } else { + // TODO HIVE-15865 Handle additional reasons like OS launch failed + sb.append("\tFAILED container: "); + } + sb.append(" ").append(instance.getContainerId()); + sb.append(", Logs at: ").append(instance.getLogUrl()); + } } + return sb.toString(); + } + + private static void logError(Throwable t) { + LOG.error("FAILED: " + t.getMessage(), t); + System.err.println("FAILED: " + t.getMessage()); } }