diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java index e88c819..dca0c7b 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.llap.cli; -import java.util.Arrays; import java.util.Properties; import java.util.concurrent.TimeUnit; @@ -29,23 +28,29 @@ import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; +import com.google.common.annotations.VisibleForTesting; + public class LlapStatusOptionsProcessor { private static final String LLAPSTATUS_CONSTANT = "llapstatus"; - private static final long FIND_YARN_APP_TIMEOUT_MS = 20 * 1000l; // 20seconds to wait for app to be visible - - private static final long DEFAULT_STATUS_REFRESH_INTERVAL_MS = 1 * 1000l; // 1 seconds wait until subsequent status - private static final long DEFAULT_WATCH_MODE_TIMEOUT_MS = 5 * 60 * 1000l; // 5 minutes timeout for watch mode - private static final float DEFAULT_RUNNING_NODES_THRESHOLD = 1.0f; + @VisibleForTesting + public static final long FIND_YARN_APP_TIMEOUT_MS = 20 * 1000l; // 20seconds to wait for app to be visible + @VisibleForTesting + public static final long DEFAULT_STATUS_REFRESH_INTERVAL_MS = 1 * 1000l; // 1 seconds wait until subsequent status + @VisibleForTesting + public static final long DEFAULT_WATCH_MODE_TIMEOUT_MS = 5 * 60 * 1000l; // 5 minutes timeout for watch mode + @VisibleForTesting + public static final float DEFAULT_RUNNING_NODES_THRESHOLD = 1.0f; // TODO: why doesn't this use one of the existing options implementations?! enum OptionConstants { NAME("name", 'n', "LLAP cluster name", true), FIND_APP_TIMEOUT("findAppTimeout", 'f', - "Amount of time(s) that the tool will sleep to wait for the YARN application to start. negative values=wait forever, 0=Do not wait. default=" + - TimeUnit.SECONDS.convert(FIND_YARN_APP_TIMEOUT_MS, TimeUnit.MILLISECONDS) + "s", true), + "Amount of time(s) that the tool will sleep to wait for the YARN application to start. negative values=wait " + + "forever, 0=Do not wait. default=" + TimeUnit.SECONDS.convert(FIND_YARN_APP_TIMEOUT_MS, TimeUnit.MILLISECONDS) + + "s", true), OUTPUT_FILE("outputFile", 'o', "File to which output should be written (Default stdout)", true), WATCH_MODE("watch", 'w', "Watch mode waits until all LLAP daemons are running or subset of the nodes are " + "running (threshold can be specified via -r option) (Default wait until all nodes are running)", false), @@ -59,7 +64,8 @@ " in watch mode. Valid only for watch mode. (Default " + TimeUnit.SECONDS.convert(DEFAULT_STATUS_REFRESH_INTERVAL_MS, TimeUnit.MILLISECONDS) + "s)", true), WATCH_MODE_TIMEOUT("watchTimeout", 't', "Exit watch mode if the desired state is not attained until the specified" + - " timeout. (Default " + TimeUnit.SECONDS.convert(DEFAULT_WATCH_MODE_TIMEOUT_MS, TimeUnit.MILLISECONDS) +"s)", true), + " timeout. (Default " + TimeUnit.SECONDS.convert(DEFAULT_WATCH_MODE_TIMEOUT_MS, TimeUnit.MILLISECONDS) +"s)", + true), HIVECONF("hiveconf", null, "Use value for given property. Overridden by explicit parameters", "property=value", 2), HELP("help", 'H', "Print help information", false); @@ -238,7 +244,7 @@ public LlapStatusOptions processOptions(String[] args) throws ParseException { } watchTimeoutMs = TimeUnit.MILLISECONDS.convert(watchTimeoutSec, TimeUnit.SECONDS); } - + boolean isLaunched = !commandLine.hasOption(OptionConstants.NOT_LAUNCHED.getLongOpt()); float runningNodesThreshold = DEFAULT_RUNNING_NODES_THRESHOLD; diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java index e0ada45..a521799 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java @@ -64,13 +64,13 @@ import org.slf4j.LoggerFactory; public class LlapStatusServiceDriver { + private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class); + private static final Logger CONSOLE_LOGGER = LoggerFactory.getLogger("LlapStatusServiceDriverConsole"); private static final EnumSet NO_YARN_SERVICE_INFO_STATES = EnumSet.of( State.APP_NOT_FOUND, State.COMPLETE, State.LAUNCHING); private static final EnumSet LAUNCHING_STATES = EnumSet.of( State.LAUNCHING, State.RUNNING_PARTIAL, State.RUNNING_ALL); - private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class); - private static final Logger CONSOLE_LOGGER = LoggerFactory.getLogger("LlapStatusServiceDriverConsole"); // Defining a bunch of configs here instead of in HiveConf. These are experimental, and mainly // for use when retry handling is fixed in Yarn/Hadoop @@ -80,18 +80,15 @@ // The following two keys should ideally be used to control RM connect timeouts. However, // they don't seem to work. The IPC timeout needs to be set instead. @InterfaceAudience.Private - private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS = - CONF_PREFIX + "yarn.rm.connect.max-wait-ms"; + private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS = CONF_PREFIX + "yarn.rm.connect.max-wait-ms"; private static final long CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT = 10000l; @InterfaceAudience.Private - private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS = - CONF_PREFIX + "yarn.rm.connect.retry-interval.ms"; + private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS = CONF_PREFIX + "yarn.rm.connect.retry-interval.ms"; private static final long CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT = 5000l; // As of Hadoop 2.7 - this is what controls the RM timeout. @InterfaceAudience.Private - private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES = - CONF_PREFIX + "ipc.client.max-retries"; + private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES = CONF_PREFIX + "ipc.client.max-retries"; private static final int CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT = 2; @InterfaceAudience.Private private static final String CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS = @@ -107,22 +104,20 @@ private static final String CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT = "2000, 1"; - private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS = - CONF_PREFIX + "zk-registry.timeout-ms"; + private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS = CONF_PREFIX + "zk-registry.timeout-ms"; private static final long CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT = 20000l; private static final long LOG_SUMMARY_INTERVAL = 15000L; // Log summary every ~15 seconds. - private static final String LLAP_KEY = "llap"; + private final Configuration conf; - private final Clock clock = new SystemClock(); private String appName = null; + private String applicationId = null; private ServiceClient serviceClient = null; private Configuration llapRegistryConf = null; private LlapRegistryService llapRegistry = null; - @VisibleForTesting - AppStatusBuilder appStatusBuilder; + private AppStatusBuilder appStatusBuilder; public LlapStatusServiceDriver() { SessionState ss = SessionState.get(); @@ -145,21 +140,17 @@ private void setupConf() { CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT)); conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, - conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS, - CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT)); + conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS, CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT)); conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, - conf.getLong(CONFIG_YARN_RM_RETRY_INTERVAL_MS, CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT)); + conf.getLong(CONFIG_YARN_RM_RETRY_INTERVAL_MS, CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT)); conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, - conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES, - CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT)); + conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES, CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT)); conf.setLong(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_RETRY_INTERVAL_KEY, - conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS, - CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT)); + conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS, CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT)); - HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, (conf - .getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS, CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT) + - "ms")); + HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, ( + conf.getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS, CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT) + "ms")); llapRegistryConf = new Configuration(conf); } @@ -167,11 +158,10 @@ private void setupConf() { /** * Parse command line options. * - * @param args * @return command line options. */ - public LlapStatusOptions parseOptions(String[] args) throws - LlapStatusCliException { + @VisibleForTesting + public LlapStatusOptions parseOptions(String[] args) throws LlapStatusCliException { LlapStatusOptionsProcessor optionsProcessor = new LlapStatusOptionsProcessor(); LlapStatusOptions options; @@ -211,9 +201,7 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { LOG.info(message); return ExitCode.INCORRECT_USAGE.getInt(); } - if (LOG.isDebugEnabled()) { - LOG.debug("Using appName: {}", appName); - } + LOG.debug("Using appName: {}", appName); llapRegistryConf.set(HiveConf.ConfVars.LLAP_DAEMON_SERVICE_HOSTS.varname, "@" + appName); } @@ -224,8 +212,7 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { } } catch (Exception e) { LlapStatusCliException le = new LlapStatusCliException( - LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED, - "Failed to create service client", e); + LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED, "Failed to create service client", e); logError(le); return le.getExitCode().getInt(); } @@ -233,8 +220,7 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { // Get the App report from YARN ApplicationReport appReport; try { - appReport = LlapSliderUtils.getAppReport(appName, serviceClient, - options.getFindAppTimeoutMs()); + appReport = LlapSliderUtils.getAppReport(appName, serviceClient, options.getFindAppTimeoutMs()); } catch (LlapStatusCliException e) { logError(e); return e.getExitCode().getInt(); @@ -256,11 +242,9 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { } else { // Get information from YARN Service try { - ret = populateAppStatusFromServiceStatus(appName, serviceClient, - appStatusBuilder); + ret = populateAppStatusFromServiceStatus(appName, serviceClient, appStatusBuilder); } catch (LlapStatusCliException e) { - // In case of failure, send back whatever is constructed so far - - // which would be from the AppReport + // In case of failure, send back whatever is constructed so far - which would be from the AppReport logError(e); return e.getExitCode().getInt(); } @@ -279,14 +263,11 @@ public int run(LlapStatusOptions options, long watchTimeoutMs) { return ret.getInt(); } finally { - if (LOG.isDebugEnabled()) { - LOG.debug("Final AppState: " + appStatusBuilder.toString()); - } + LOG.debug("Final AppState: " + appStatusBuilder.toString()); } } - public void outputJson(PrintWriter writer) throws - LlapStatusCliException { + public void outputJson(PrintWriter writer) throws LlapStatusCliException { ObjectMapper mapper = new ObjectMapper(); mapper.configure(SerializationConfig.Feature.FAIL_ON_EMPTY_BEANS, false); mapper.setSerializationInclusion(JsonSerialize.Inclusion.NON_NULL); @@ -295,28 +276,26 @@ public void outputJson(PrintWriter writer) throws writer.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(appStatusBuilder)); } catch (IOException e) { LOG.warn("Failed to create JSON", e); - throw new LlapStatusCliException(ExitCode.LLAP_JSON_GENERATION_ERROR, "Failed to create JSON", - e); + throw new LlapStatusCliException(ExitCode.LLAP_JSON_GENERATION_ERROR, "Failed to create JSON", e); } } /** * Populates parts of the AppStatus * - * @param appReport - * @param appStatusBuilder * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible * @throws LlapStatusCliException */ - private ExitCode processAppReport(ApplicationReport appReport, - AppStatusBuilder appStatusBuilder) throws - LlapStatusCliException { + private ExitCode processAppReport(ApplicationReport appReport, AppStatusBuilder appStatusBuilder) + throws LlapStatusCliException { if (appReport == null) { appStatusBuilder.setState(State.APP_NOT_FOUND); LOG.info("No Application Found"); return ExitCode.SUCCESS; } + applicationId = appReport.getApplicationId().toString(); + // TODO Maybe add the YARN URL for the app. appStatusBuilder.setAmInfo( new LlapStatusHelpers.AmInfo().setAppName(appReport.getName()).setAppType(appReport.getApplicationType())); @@ -328,17 +307,17 @@ private ExitCode processAppReport(ApplicationReport appReport, appStatusBuilder.setState(State.LAUNCHING); return ExitCode.SUCCESS; case ACCEPTED: - appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString()); + appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId); appStatusBuilder.setState(State.LAUNCHING); return ExitCode.SUCCESS; case RUNNING: - appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString()); + appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId); // If the app state is running, get additional information from YARN Service return ExitCode.SUCCESS; case FINISHED: case FAILED: case KILLED: - appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString()); + appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId); appStatusBuilder.setAppFinishTime(appReport.getFinishTime()); appStatusBuilder.setState(State.COMPLETE); // add log links and other diagnostics from YARN Service @@ -352,33 +331,23 @@ private ExitCode processAppReport(ApplicationReport appReport, /** * Populates information from YARN Service Status. * - * @param appName - * @param serviceClient - * @param appStatusBuilder - * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future - * progress not possible + * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible * @throws LlapStatusCliException */ - private ExitCode populateAppStatusFromServiceStatus(String appName, - ServiceClient serviceClient, AppStatusBuilder appStatusBuilder) - throws LlapStatusCliException { + private ExitCode populateAppStatusFromServiceStatus(String appName, ServiceClient serviceClient, + AppStatusBuilder appStatusBuilder) throws LlapStatusCliException { ExitCode exitCode = ExitCode.YARN_ERROR; try { Service service = serviceClient.getStatus(appName); if (service != null) { // How to get config paths and AmInfo ServiceState state = service.getState(); - appStatusBuilder.setAppStartTime(service.getLaunchTime() == null ? 0 - : service.getLaunchTime().getTime()); - appStatusBuilder.setDesiredInstances( - service.getComponent(LLAP_KEY).getNumberOfContainers() == null ? 0 - : service.getComponent(LLAP_KEY).getNumberOfContainers() - .intValue()); - appStatusBuilder.setLiveInstances( - service.getComponent(LLAP_KEY).getContainers().size()); + appStatusBuilder.setAppStartTime(service.getLaunchTime() == null ? 0 : service.getLaunchTime().getTime()); + appStatusBuilder.setDesiredInstances(service.getComponent(LLAP_KEY).getNumberOfContainers() == null ? 0 + : service.getComponent(LLAP_KEY).getNumberOfContainers().intValue()); + appStatusBuilder.setLiveInstances(service.getComponent(LLAP_KEY).getContainers().size()); for (Container cont : service.getComponent(LLAP_KEY).getContainers()) { - LlapInstance llapInstance = new LlapInstance(cont.getHostname(), - cont.getId()); + LlapInstance llapInstance = new LlapInstance(cont.getHostname(), cont.getId()); appStatusBuilder.addNewRunningLlapInstance(llapInstance); } if (state == ServiceState.STABLE) { @@ -389,8 +358,7 @@ private ExitCode populateAppStatusFromServiceStatus(String appName, } } catch (IOException | YarnException e) { LlapStatusCliException le = new LlapStatusCliException( - LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_OTHER, - "Failed to get service status", e); + LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_OTHER, "Failed to get service status", e); logError(le); exitCode = le.getExitCode(); } @@ -400,13 +368,11 @@ private ExitCode populateAppStatusFromServiceStatus(String appName, /** * Populate additional information for containers from the LLAP registry. Must be invoked * after YARN Service status and diagnostics. - * @param appStatusBuilder * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible * @throws LlapStatusCliException */ - private ExitCode populateAppStatusFromLlapRegistry( - AppStatusBuilder appStatusBuilder, long watchTimeoutMs) throws - LlapStatusCliException { + private ExitCode populateAppStatusFromLlapRegistry(AppStatusBuilder appStatusBuilder, long watchTimeoutMs) + throws LlapStatusCliException { if (llapRegistry == null) { try { @@ -425,9 +391,7 @@ private ExitCode populateAppStatusFromLlapRegistry( } if (serviceInstances == null || serviceInstances.isEmpty()) { - if (LOG.isDebugEnabled()) { - LOG.debug("No information found in the LLAP registry"); - } + LOG.debug("No information found in the LLAP registry"); appStatusBuilder.setLiveInstances(0); appStatusBuilder.setState(State.LAUNCHING); appStatusBuilder.clearRunningLlapInstances(); @@ -461,8 +425,8 @@ private ExitCode populateAppStatusFromLlapRegistry( appStatusBuilder.setLiveInstances(validatedInstances.size()); appStatusBuilder.setLaunchingInstances(llapExtraInstances.size()); - if (appStatusBuilder.getDesiredInstances() != null && validatedInstances - .size() >= appStatusBuilder.getDesiredInstances()) { + if (appStatusBuilder.getDesiredInstances() != null && + validatedInstances.size() >= appStatusBuilder.getDesiredInstances()) { appStatusBuilder.setState(State.RUNNING_ALL); if (validatedInstances.size() > appStatusBuilder.getDesiredInstances()) { LOG.warn("Found more entries in LLAP registry, as compared to desired entries"); @@ -493,107 +457,13 @@ private ExitCode populateAppStatusFromLlapRegistry( return ExitCode.SUCCESS; } - private static String constructCompletedContainerDiagnostics(List completedInstances) { - StringBuilder sb = new StringBuilder(); - if (completedInstances == null || completedInstances.size() == 0) { - return ""; - } else { - // TODO HIVE-15865 Ideally sort these by completion time, once that is available. - boolean isFirst = true; - for (LlapInstance instance : completedInstances) { - if (!isFirst) { - sb.append("\n"); - } else { - isFirst = false; - } - - if (instance.getYarnContainerExitStatus() == - ContainerExitStatus.KILLED_EXCEEDED_PMEM || - instance.getYarnContainerExitStatus() == - ContainerExitStatus.KILLED_EXCEEDED_VMEM) { - sb.append("\tKILLED container (by YARN for exceeding memory limits): "); - } else { - // TODO HIVE-15865 Handle additional reasons like OS launch failed - sb.append("\tFAILED container: "); - } - sb.append(" ").append(instance.getContainerId()); - sb.append(", Logs at: ").append(instance.getLogUrl()); - } - } - return sb.toString(); - } - - /** - * Helper method to construct a diagnostic message from a complete - * AppStatusBuilder. - * - * @return - */ - private static String constructDiagnostics( - AppStatusBuilder appStatusBuilder) { - StringBuilder sb = new StringBuilder(); - - switch (appStatusBuilder.getState()) { - case APP_NOT_FOUND: - sb.append("LLAP status unknown. Awaiting app launch"); - break; - case LAUNCHING: - // This is a catch all state - when containers have not started yet, or LLAP has not started yet. - if (StringUtils.isNotBlank(appStatusBuilder.getAmInfo().getAppId())) { - sb.append("LLAP Starting up with AppId=") - .append(appStatusBuilder.getAmInfo().getAppId()).append("."); - if (appStatusBuilder.getDesiredInstances() != null) { - sb.append(" Started 0/").append(appStatusBuilder.getDesiredInstances()).append(" instances"); - } - - String containerDiagnostics = constructCompletedContainerDiagnostics( - appStatusBuilder.getCompletedInstances()); - if (StringUtils.isNotEmpty(containerDiagnostics)) { - sb.append("\n").append(containerDiagnostics); - } - } else { - sb.append("Awaiting LLAP startup"); - } - break; - case RUNNING_PARTIAL: - sb.append("LLAP Starting up with ApplicationId=") - .append(appStatusBuilder.getAmInfo().getAppId()); - sb.append(" Started").append(appStatusBuilder.getLiveInstances()) - .append("/").append(appStatusBuilder.getDesiredInstances()) - .append(" instances"); - String containerDiagnostics = constructCompletedContainerDiagnostics( - appStatusBuilder.getCompletedInstances()); - if (StringUtils.isNotEmpty(containerDiagnostics)) { - sb.append("\n").append(containerDiagnostics); - } - - // TODO HIVE-15865: Include information about pending requests, and last - // allocation time once YARN Service provides this information. - break; - case RUNNING_ALL: - sb.append("LLAP Application running with ApplicationId=") - .append(appStatusBuilder.getAmInfo().getAppId()); - break; - case COMPLETE: - - sb.append("LLAP Application already complete. ApplicationId=") - .append(appStatusBuilder.getAmInfo().getAppId()); - containerDiagnostics = constructCompletedContainerDiagnostics( - appStatusBuilder.getCompletedInstances()); - if (StringUtils.isNotEmpty(containerDiagnostics)) { - sb.append("\n").append(containerDiagnostics); - } - - break; - case UNKNOWN: - sb.append("LLAP status unknown"); - break; + private void close() { + if (serviceClient != null) { + serviceClient.stop(); } - if (StringUtils.isNotBlank(appStatusBuilder.getDiagnostics())) { - sb.append("\n").append(appStatusBuilder.getDiagnostics()); + if (llapRegistry != null) { + llapRegistry.stop(); } - - return sb.toString(); } public enum ExitCode { @@ -618,18 +488,16 @@ public int getInt() { } } - public static class LlapStatusCliException extends Exception { final LlapStatusServiceDriver.ExitCode exitCode; - public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message) { super(exitCode.getInt() +": " + message); this.exitCode = exitCode; } public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message, Throwable cause) { - super(message, cause); + super(exitCode.getInt() +": " + message, cause); this.exitCode = exitCode; } @@ -638,18 +506,10 @@ public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String } } - - private static void logError(Throwable t) { - LOG.error("FAILED: " + t.getMessage(), t); - System.err.println("FAILED: " + t.getMessage()); - } - - public static void main(String[] args) { LOG.info("LLAP status invoked with arguments = {}", Arrays.toString(args)); int ret = ExitCode.SUCCESS.getInt(); Clock clock = SystemClock.getInstance(); - long startTime = clock.getTime(); long lastSummaryLogTime = -1; LlapStatusServiceDriver statusServiceDriver = null; @@ -661,8 +521,7 @@ public static void main(String[] args) { statusServiceDriver.close(); logError(t); if (t instanceof LlapStatusCliException) { - LlapStatusCliException - ce = (LlapStatusCliException) t; + LlapStatusCliException ce = (LlapStatusCliException) t; ret = ce.getExitCode().getInt(); } else { ret = ExitCode.INTERNAL_ERROR.getInt(); @@ -691,9 +550,9 @@ public static void main(String[] args) { LOG.info("Configured refresh interval: {}s. Watch timeout: {}s. Attempts remaining: {}." + " Watch mode: {}. Running nodes threshold: {}.", - TimeUnit.SECONDS.convert(refreshInterval, TimeUnit.MILLISECONDS), - TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS), - numAttempts, watchMode, new DecimalFormat("#.###").format(runningNodesThreshold)); + TimeUnit.SECONDS.convert(refreshInterval, TimeUnit.MILLISECONDS), + TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS), + numAttempts, watchMode, new DecimalFormat("#.###").format(runningNodesThreshold)); while (numAttempts > 0) { if (!firstAttempt) { if (watchMode) { @@ -712,9 +571,8 @@ public static void main(String[] args) { ret = statusServiceDriver.run(options, watchMode ? watchTimeout : 0); currentState = statusServiceDriver.appStatusBuilder.getState(); try { - lastSummaryLogTime = LlapStatusServiceDriver - .maybeLogSummary(clock, lastSummaryLogTime, statusServiceDriver, - watchMode, watchTimeout, launchingState); + lastSummaryLogTime = LlapStatusServiceDriver.maybeLogSummary(clock, lastSummaryLogTime, + statusServiceDriver, watchMode, watchTimeout, launchingState); } catch (Exception e) { LOG.warn("Failed to log summary", e); } @@ -731,8 +589,7 @@ public static void main(String[] args) { if (currentState.equals(State.COMPLETE)) { if (launchingState != null || options.isLaunched()) { LOG.warn("COMPLETE state reached while waiting for RUNNING state. Failing."); - System.err.println("Final diagnostics: " + - statusServiceDriver.appStatusBuilder.getDiagnostics()); + System.err.println("Final diagnostics: " + statusServiceDriver.appStatusBuilder.getDiagnostics()); break; } else { LOG.info("Found a stopped application; assuming it was a previous attempt " @@ -740,19 +597,15 @@ public static void main(String[] args) { } } - if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals( - State.RUNNING_ALL))) { - if (LOG.isDebugEnabled()) { - LOG.debug( - "Current state: {}. Desired state: {}. {}/{} instances.", - currentState, - runningNodesThreshold == 1.0f ? - State.RUNNING_ALL : - State.RUNNING_PARTIAL, - statusServiceDriver.appStatusBuilder.getLiveInstances(), - statusServiceDriver.appStatusBuilder - .getDesiredInstances()); - } + if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals(State.RUNNING_ALL))) { + LOG.debug( + "Current state: {}. Desired state: {}. {}/{} instances.", + currentState, + runningNodesThreshold == 1.0f ? + State.RUNNING_ALL : + State.RUNNING_PARTIAL, + statusServiceDriver.appStatusBuilder.getLiveInstances(), + statusServiceDriver.appStatusBuilder.getDesiredInstances()); numAttempts--; continue; } @@ -763,17 +616,13 @@ public static void main(String[] args) { if (desiredInstances > 0) { final float ratio = (float) liveInstances / (float) desiredInstances; if (ratio < runningNodesThreshold) { - if (LOG.isDebugEnabled()) { - LOG.debug( - "Waiting until running nodes threshold is reached. Current: {} Desired: {}." + - " {}/{} instances.", - new DecimalFormat("#.###").format(ratio), - new DecimalFormat("#.###") - .format(runningNodesThreshold), - statusServiceDriver.appStatusBuilder.getLiveInstances(), - statusServiceDriver.appStatusBuilder - .getDesiredInstances()); - } + LOG.debug( + "Waiting until running nodes threshold is reached. Current: {} Desired: {}." + + " {}/{} instances.", + new DecimalFormat("#.###").format(ratio), + new DecimalFormat("#.###").format(runningNodesThreshold), + statusServiceDriver.appStatusBuilder.getLiveInstances(), + statusServiceDriver.appStatusBuilder.getDesiredInstances()); numAttempts--; continue; } else { @@ -805,9 +654,7 @@ public static void main(String[] args) { break; } // Log final state to CONSOLE_LOGGER - LlapStatusServiceDriver - .maybeLogSummary(clock, 0L, statusServiceDriver, - watchMode, watchTimeout, launchingState); + LlapStatusServiceDriver.maybeLogSummary(clock, 0L, statusServiceDriver, watchMode, watchTimeout, launchingState); CONSOLE_LOGGER.info("\n\n\n"); // print current state before exiting statusServiceDriver.outputJson(pw); @@ -815,35 +662,35 @@ public static void main(String[] args) { pw.flush(); if (numAttempts == 0 && watchMode && !desiredStateAttained) { LOG.warn("Watch timeout {}s exhausted before desired state RUNNING is attained.", - TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS)); + TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS)); } } catch (Throwable t) { logError(t); if (t instanceof LlapStatusCliException) { - LlapStatusCliException - ce = (LlapStatusCliException) t; + LlapStatusCliException ce = (LlapStatusCliException) t; ret = ce.getExitCode().getInt(); } else { ret = ExitCode.INTERNAL_ERROR.getInt(); } } finally { LOG.info("LLAP status finished"); + if (ret != ExitCode.SUCCESS.exitCode) { + LOG.error("LLAP did not start. Check the application log for more info:\n" + + "\tyarn logs --applicationId {} -out ", statusServiceDriver.applicationId); + } statusServiceDriver.close(); } - if (LOG.isDebugEnabled()) { - LOG.debug("Completed processing - exiting with " + ret); - } + LOG.debug("Completed processing - exiting with " + ret); + System.exit(ret); } - private static long maybeLogSummary(Clock clock, long lastSummaryLogTime, - LlapStatusServiceDriver statusServiceDriver, + private static long maybeLogSummary(Clock clock, long lastSummaryLogTime, LlapStatusServiceDriver statusServiceDriver, boolean watchMode, long watchTimeout, LlapStatusHelpers.State launchingState) { long currentTime = clock.getTime(); if (lastSummaryLogTime < currentTime - LOG_SUMMARY_INTERVAL) { String diagString = null; - if (launchingState == null && statusServiceDriver.appStatusBuilder.getState() == - State.COMPLETE && watchMode) { + if (launchingState == null && statusServiceDriver.appStatusBuilder.getState() == State.COMPLETE && watchMode) { // First known state was COMPLETED. Wait for the app launch to start. diagString = "Awaiting LLAP launch"; // Clear completed instances in this case. Don't want to provide information from the previous run. @@ -859,23 +706,106 @@ private static long maybeLogSummary(Clock clock, long lastSummaryLogTime, } else { CONSOLE_LOGGER.info("\nLLAPSTATUS"); } - CONSOLE_LOGGER.info( - "--------------------------------------------------------------------------------"); + CONSOLE_LOGGER.info("--------------------------------------------------------------------------------"); } CONSOLE_LOGGER.info(diagString); - CONSOLE_LOGGER.info( - "--------------------------------------------------------------------------------"); + CONSOLE_LOGGER.info("--------------------------------------------------------------------------------"); lastSummaryLogTime = currentTime; } return lastSummaryLogTime; } - private void close() { - if (serviceClient != null) { - serviceClient.stop(); + /** + * Helper method to construct a diagnostic message from a complete AppStatusBuilder. + */ + private static String constructDiagnostics(AppStatusBuilder appStatusBuilder) { + StringBuilder sb = new StringBuilder(); + + switch (appStatusBuilder.getState()) { + case APP_NOT_FOUND: + sb.append("LLAP status unknown. Awaiting app launch"); + break; + case LAUNCHING: + // This is a catch all state - when containers have not started yet, or LLAP has not started yet. + if (StringUtils.isNotBlank(appStatusBuilder.getAmInfo().getAppId())) { + sb.append("LLAP Starting up with AppId=").append(appStatusBuilder.getAmInfo().getAppId()).append("."); + if (appStatusBuilder.getDesiredInstances() != null) { + sb.append(" Started 0/").append(appStatusBuilder.getDesiredInstances()).append(" instances"); + } + + String containerDiagnostics = constructCompletedContainerDiagnostics( + appStatusBuilder.getCompletedInstances()); + if (StringUtils.isNotEmpty(containerDiagnostics)) { + sb.append("\n").append(containerDiagnostics); + } + } else { + sb.append("Awaiting LLAP startup"); + } + break; + case RUNNING_PARTIAL: + sb.append("LLAP Starting up with ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId()); + sb.append(" Started").append(appStatusBuilder.getLiveInstances()).append("/") + .append(appStatusBuilder.getDesiredInstances()).append(" instances"); + String containerDiagnostics = constructCompletedContainerDiagnostics(appStatusBuilder.getCompletedInstances()); + if (StringUtils.isNotEmpty(containerDiagnostics)) { + sb.append("\n").append(containerDiagnostics); + } + + // TODO HIVE-15865: Include information about pending requests, and last + // allocation time once YARN Service provides this information. + break; + case RUNNING_ALL: + sb.append("LLAP Application running with ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId()); + break; + case COMPLETE: + sb.append("LLAP Application already complete. ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId()); + containerDiagnostics = constructCompletedContainerDiagnostics(appStatusBuilder.getCompletedInstances()); + if (StringUtils.isNotEmpty(containerDiagnostics)) { + sb.append("\n").append(containerDiagnostics); + } + + break; + case UNKNOWN: + sb.append("LLAP status unknown"); + break; } - if (llapRegistry != null) { - llapRegistry.stop(); + if (StringUtils.isNotBlank(appStatusBuilder.getDiagnostics())) { + sb.append("\n").append(appStatusBuilder.getDiagnostics()); } + + return sb.toString(); + } + + private static String constructCompletedContainerDiagnostics(List completedInstances) { + StringBuilder sb = new StringBuilder(); + if (completedInstances == null || completedInstances.size() == 0) { + return ""; + } else { + // TODO HIVE-15865 Ideally sort these by completion time, once that is available. + boolean isFirst = true; + for (LlapInstance instance : completedInstances) { + if (!isFirst) { + sb.append("\n"); + } else { + isFirst = false; + } + + if (instance.getYarnContainerExitStatus() == ContainerExitStatus.KILLED_EXCEEDED_PMEM || + instance.getYarnContainerExitStatus() == ContainerExitStatus.KILLED_EXCEEDED_VMEM) { + sb.append("\tKILLED container (by YARN for exceeding memory limits): "); + } else { + // TODO HIVE-15865 Handle additional reasons like OS launch failed + sb.append("\tFAILED container: "); + } + sb.append(" ").append(instance.getContainerId()); + sb.append(", Logs at: ").append(instance.getLogUrl()); + } + } + return sb.toString(); + } + + private static void logError(Throwable t) { + LOG.error("FAILED: " + t.getMessage(), t); + System.err.println("FAILED: " + t.getMessage()); } } diff --git a/llap-server/src/test/org/apache/hadoop/hive/llap/cli/TestLlapStatusServiceDriver.java b/llap-server/src/test/org/apache/hadoop/hive/llap/cli/TestLlapStatusServiceDriver.java new file mode 100644 index 0000000..54166d5 --- /dev/null +++ b/llap-server/src/test/org/apache/hadoop/hive/llap/cli/TestLlapStatusServiceDriver.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hadoop.hive.llap.cli; + +import org.apache.hadoop.hive.llap.cli.LlapStatusOptionsProcessor.LlapStatusOptions; +import org.apache.hadoop.hive.llap.cli.LlapStatusServiceDriver.ExitCode; +import org.apache.hadoop.hive.llap.cli.LlapStatusServiceDriver.LlapStatusCliException; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import static junit.framework.TestCase.assertEquals; + +import java.util.Properties; + +// TODO: write unit tests for the main logic of this class - needs refactoring first, current design isn't testable. +public class TestLlapStatusServiceDriver { + @Rule + public ExpectedException thrown = ExpectedException.none(); + + @Test + public void testArgumentParsingDefault() throws LlapStatusCliException { + LlapStatusServiceDriver driver = new LlapStatusServiceDriver(); + LlapStatusOptions parseOptions = driver.parseOptions(new String[] {}); + + assertEquals("findAppTimeout should be the default value if not specified otherwise", + parseOptions.getFindAppTimeoutMs(), LlapStatusOptionsProcessor.FIND_YARN_APP_TIMEOUT_MS); + + assertEquals("refreshInterval should be the default value if not specified otherwise", + parseOptions.getRefreshIntervalMs(), LlapStatusOptionsProcessor.DEFAULT_STATUS_REFRESH_INTERVAL_MS); + + assertEquals("watchTimeout should be the default value if not specified otherwise", + parseOptions.getWatchTimeoutMs(), LlapStatusOptionsProcessor.DEFAULT_WATCH_MODE_TIMEOUT_MS); + + assertEquals("runningNodesThreshold should be the default value if not specified otherwise", + parseOptions.getRunningNodesThreshold(), LlapStatusOptionsProcessor.DEFAULT_RUNNING_NODES_THRESHOLD); + + assertEquals("hiveConf should be empty properties if not specified otherwise", parseOptions.getConf(), + new Properties()); + + assertEquals("isLaunched should be the true if not specified otherwise", parseOptions.isLaunched(), true); + + assertEquals("watchMode should be the false if not specified otherwise", parseOptions.isWatchMode(), false); + } + + @Test + public void testNegativeRefreshInterval() throws LlapStatusCliException { + thrown.expect(LlapStatusCliException.class); + thrown.expectMessage(ExitCode.INCORRECT_USAGE.getInt() + ": Incorrect usage"); + + LlapStatusServiceDriver driver = new LlapStatusServiceDriver(); + driver.parseOptions(new String[] {"--refreshInterval -1"}); + } + + @Test + public void testNegativeWatchTimeout() throws LlapStatusCliException { + thrown.expect(LlapStatusCliException.class); + thrown.expectMessage(ExitCode.INCORRECT_USAGE.getInt() + ": Incorrect usage"); + + LlapStatusServiceDriver driver = new LlapStatusServiceDriver(); + driver.parseOptions(new String[] {"--watchTimeout -1"}); + } + + @Test + public void testNegativeRunningNodesThreshold() throws LlapStatusCliException { + thrown.expect(LlapStatusCliException.class); + thrown.expectMessage(ExitCode.INCORRECT_USAGE.getInt() + ": Incorrect usage"); + + LlapStatusServiceDriver driver = new LlapStatusServiceDriver(); + driver.parseOptions(new String[] {"--runningNodesThreshold -1"}); + } + + @Test + public void testRunningNodesThresholdOverOne() throws LlapStatusCliException { + thrown.expect(LlapStatusCliException.class); + thrown.expectMessage(ExitCode.INCORRECT_USAGE.getInt() + ": Incorrect usage"); + + LlapStatusServiceDriver driver = new LlapStatusServiceDriver(); + driver.parseOptions(new String[] {"--runningNodesThreshold 1.1"}); + } +}