# Patch summary (free text ahead of the first diff header; not part of any hunk).
#
# File touched: llap-server/.../llap/cli/LlapStatusServiceDriver.java
#
# populateAppStatusFromLlapRegistry:
#   * The catch around llapRegistry.getInstances(watchTimeoutMs).getAll() is
#     widened from IOException to Exception; the failure is still rethrown as
#     LlapStatusCliException with ExitCode.LLAP_REGISTRY_ERROR and the cause.
#   * In the else branch shown in the second hunk, the state becomes
#     RUNNING_PARTIAL only when validatedInstances is non-empty, otherwise
#     LAUNCHING (previously always RUNNING_PARTIAL).
#
# Status builder:
#   * Adds a boolean field runningThresholdAchieved (initially false), a
#     builder-style setter that returns AppStatusBuilder, and a getter.
#
# main:
#   * The running-nodes ratio check is now entered whenever
#     desiredInstances > 0; liveInstances > 0 is no longer required.
#   * When the ratio is not below runningNodesThreshold, the new flag is set
#     to true next to desiredStateAttained.
#   * When desiredInstances is not > 0, numAttempts is decremented and the
#     loop continues.
#   * With watchMode on, a ret equal to YARN_ERROR,
#     SLIDER_CLIENT_ERROR_CREATE_FAILED, SLIDER_CLIENT_ERROR_OTHER or
#     LLAP_REGISTRY_ERROR now logs a warning, decrements numAttempts and
#     continues instead of falling through to break.
#
# NOTE(review): this patch was stored with its line breaks collapsed. The line
# breaks below are consistent with the @@ hunk line counts, but indentation
# and the wrap point of long statements are reconstructed -- confirm against
# the target file before applying.
# NOTE(review): the raw Collection / List / Map declarations may have lost
# their type arguments during extraction -- confirm against the target file.
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
index b30f837..1b9eba6 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
@@ -497,7 +497,7 @@ private ExitCode populateAppStatusFromLlapRegistry(
     Collection serviceInstances;
     try {
       serviceInstances = llapRegistry.getInstances(watchTimeoutMs).getAll();
-    } catch (IOException e) {
+    } catch (Exception e) {
       throw new LlapStatusCliException(ExitCode.LLAP_REGISTRY_ERROR,
           "Failed to get instances from llap registry", e);
     }
@@ -541,7 +541,11 @@ private ExitCode populateAppStatusFromLlapRegistry(
         LOG.warn("Found more entries in LLAP registry, as compared to desired entries");
       }
     } else {
-      appStatusBuilder.setState(State.RUNNING_PARTIAL);
+      if (validatedInstances.size() > 0) {
+        appStatusBuilder.setState(State.RUNNING_PARTIAL);
+      } else {
+        appStatusBuilder.setState(State.LAUNCHING);
+      }
     }
 
     // At this point, everything that can be consumed from AppStatusBuilder has been consumed.
@@ -575,6 +579,8 @@ private ExitCode populateAppStatusFromLlapRegistry(
     private Long appStartTime;
     private Long appFinishTime;
 
+    private boolean runningThresholdAchieved = false;
+
     private final List llapInstances = new LinkedList<>();
 
     private transient Map containerToInstanceMap = new HashMap<>();
@@ -625,6 +631,11 @@ public AppStatusBuilder addNewLlapInstance(LlapInstance llapInstance) {
       return this;
     }
 
+    public AppStatusBuilder setRunningThresholdAchieved(boolean thresholdAchieved) {
+      this.runningThresholdAchieved = thresholdAchieved;
+      return this;
+    }
+
     public LlapInstance removeAndgetLlapInstanceForContainer(String containerIdString) {
       return containerToInstanceMap.remove(containerIdString);
     }
@@ -683,6 +694,10 @@ public Long getAppFinishTime() {
       return llapInstances;
     }
 
+    public boolean isRunningThresholdAchieved() {
+      return runningThresholdAchieved;
+    }
+
     @JsonIgnore
     public AmInfo maybeCreateAndGetAmInfo() {
       if (amInfo == null) {
@@ -994,7 +1009,7 @@ public static void main(String[] args) {
             // we have reached RUNNING state, now check if running nodes threshold is met
             final int liveInstances = statusServiceDriver.appStatusBuilder.getLiveInstances();
             final int desiredInstances = statusServiceDriver.appStatusBuilder.getDesiredInstances();
-            if (liveInstances > 0 && desiredInstances > 0) {
+            if (desiredInstances > 0) {
               final float ratio = (float) liveInstances / (float) desiredInstances;
               if (ratio < runningNodesThreshold) {
                 LOG.warn("Waiting until running nodes threshold is reached. Current: {} Desired: {}." +
@@ -1006,9 +1021,29 @@ public static void main(String[] args) {
                 continue;
               } else {
                 desiredStateAttained = true;
+                statusServiceDriver.appStatusBuilder.setRunningThresholdAchieved(true);
               }
+            } else {
+              numAttempts--;
+              continue;
             }
           }
+        } else if (ret == ExitCode.YARN_ERROR.getInt() && watchMode) {
+          LOG.warn("Watch mode enabled and got YARN error. Retrying..");
+          numAttempts--;
+          continue;
+        } else if (ret == ExitCode.SLIDER_CLIENT_ERROR_CREATE_FAILED.getInt() && watchMode) {
+          LOG.warn("Watch mode enabled and slider client creation failed. Retrying..");
+          numAttempts--;
+          continue;
+        } else if (ret == ExitCode.SLIDER_CLIENT_ERROR_OTHER.getInt() && watchMode) {
+          LOG.warn("Watch mode enabled and got slider client error. Retrying..");
+          numAttempts--;
+          continue;
+        } else if (ret == ExitCode.LLAP_REGISTRY_ERROR.getInt() && watchMode) {
+          LOG.warn("Watch mode enabled and got LLAP registry error. Retrying..");
+          numAttempts--;
+          continue;
         }
         break;
       } finally {