diff --git llap-client/src/java/org/apache/hadoop/hive/llap/configuration/LlapConfiguration.java llap-client/src/java/org/apache/hadoop/hive/llap/configuration/LlapConfiguration.java index 359b17b..381c16b 100644 --- llap-client/src/java/org/apache/hadoop/hive/llap/configuration/LlapConfiguration.java +++ llap-client/src/java/org/apache/hadoop/hive/llap/configuration/LlapConfiguration.java @@ -45,8 +45,9 @@ public LlapConfiguration() { public static final String LLAP_DAEMON_SHUFFLE_DIR_WATCHER_ENABLED = LLAP_DAEMON_PREFIX + "shuffle.dir-watcher.enabled"; public static final boolean LLAP_DAEMON_SHUFFLE_DIR_WATCHER_ENABLED_DEFAULT = false; + // This needs to be kept below the task timeout interval, but otherwise as high as possible to avoid unnecessary traffic. public static final String LLAP_DAEMON_LIVENESS_HEARTBEAT_INTERVAL_MS = LLAP_DAEMON_PREFIX + "liveness.heartbeat.interval-ms"; - public static final long LLAP_DAEMON_LIVENESS_HEARTBEAT_INTERVAL_MS_DEFAULT = 5000l; + public static final long LLAP_DAEMON_LIVENESS_HEARTBEAT_INTERVAL_MS_DEFAULT = 10000l; // Section for configs used in AM and executors diff --git llap-server/src/java/org/apache/hadoop/hive/llap/daemon/impl/AMReporter.java llap-server/src/java/org/apache/hadoop/hive/llap/daemon/impl/AMReporter.java index c9baba1..3dc8fb8 100644 --- llap-server/src/java/org/apache/hadoop/hive/llap/daemon/impl/AMReporter.java +++ llap-server/src/java/org/apache/hadoop/hive/llap/daemon/impl/AMReporter.java @@ -111,6 +111,7 @@ public void onSuccess(Void result) { @Override public void onFailure(Throwable t) { LOG.error("AMReporter QueueDrainer exited with error", t); + System.exit(-1); } }); LOG.info("Started service: " + getName()); @@ -180,7 +181,8 @@ protected Void callInternal() { amNodeInfo.stopUmbilical(); } else { // Add back to the queue for the next heartbeat, and schedule the actual heartbeat - amNodeInfo.setNextHeartbeatTime(System.currentTimeMillis() + heartbeatInterval); + long next = System.currentTimeMillis() + heartbeatInterval; + amNodeInfo.setNextHeartbeatTime(next); pendingHeartbeatQueeu.add(amNodeInfo); executor.submit(new AMHeartbeatCallable(amNodeInfo)); } @@ -298,7 +300,7 @@ synchronized void setNextHeartbeatTime(long nextTime) { @Override public long getDelay(TimeUnit unit) { - return 0; + return unit.convert(nextHeartbeatTime - System.currentTimeMillis(), TimeUnit.MILLISECONDS); } @Override diff --git llap-server/src/java/org/apache/hadoop/hive/llap/tezplugins/LlapTaskCommunicator.java llap-server/src/java/org/apache/hadoop/hive/llap/tezplugins/LlapTaskCommunicator.java index e1610fe..303bf6d 100644 --- llap-server/src/java/org/apache/hadoop/hive/llap/tezplugins/LlapTaskCommunicator.java +++ llap-server/src/java/org/apache/hadoop/hive/llap/tezplugins/LlapTaskCommunicator.java @@ -458,7 +458,7 @@ void nodePinged(String hostname, int port) { } } else { if (System.currentTimeMillis() > nodeNotFoundLogTime.get() + 5000l) { - LOG.warn("Recevied ping from unknown node: " + hostname + ":" + port + + LOG.warn("Received ping from unknown node: " + hostname + ":" + port + ". Could be caused by pre-emption by the AM," + " or a mismatched hostname. Enable debug logging for mismatched host names"); nodeNotFoundLogTime.set(System.currentTimeMillis());