Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java (working copy) @@ -858,9 +858,11 @@ // A failure scenario on bad input such as invalid shell script path // We know we cannot continue launching the container // so we should release it. - // TODO - numCompletedContainers.incrementAndGet(); - numFailedContainers.incrementAndGet(); + + // onStartContainerError will increment the num[Completed|Failed]Containers + // numCompletedContainers.incrementAndGet(); + // numFailedContainers.incrementAndGet(); + containerListener.onStartContainerError(container.getId(), e); return; } shellRsrc.setTimestamp(shellScriptPathTimestamp); Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/LinuxResourceCalculatorPlugin.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/LinuxResourceCalculatorPlugin.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/LinuxResourceCalculatorPlugin.java (working copy) @@ -391,7 +391,8 @@ // Sleep so we can compute the CPU usage Thread.sleep(500L); } catch (InterruptedException e) { - // do nothing + // Let the users know about it just in case... + System.out.println("[WARN] Received an interrupt event when calculating CPU usage. " + e); } System.out.println("CPU usage % : " + plugin.getCpuUsage()); } Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java (working copy) @@ -517,10 +517,11 @@ new NodeManagerEvent(NodeManagerEventType.SHUTDOWN)); throw new YarnRuntimeException(e); } catch (Throwable e) { - - // TODO Better error handling. Thread can die with the rest of the - // NM still running. + // Thread can die with the rest of the + // NM still running. At least propagate the exception instead of + // silently swallowing it. May be insufficient. LOG.error("Caught exception in status-updater", e); + throw new YarnRuntimeException(e); } finally { synchronized (heartbeatMonitor) { nextHeartBeatInterval = nextHeartBeatInterval <= 0 ? Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java (working copy) @@ -723,6 +723,7 @@ // ignore; shutting down } } catch (InterruptedException e) { + LOG.info("Interrupted: " + e); return; } } @@ -840,7 +841,9 @@ LOG.error("local path for PRIVATE localization could not be found." + "Disks might have failed.", e); } catch (URISyntaxException e) { - // TODO fail? Already translated several times... + // Fail. Already translated several times... + LOG.error("Failed to parse local path. ", e); + response.setLocalizerAction(LocalizerAction.DIE); } } else if (pending.isEmpty()) { // TODO: Synchronization @@ -863,7 +866,9 @@ try { req = new LocalResourceRequest(rsrc); } catch (URISyntaxException e) { - // TODO fail? Already translated several times... + // Fail. Already translated several times... + LOG.error("Failed to parse local path. ", e); + response.setLocalizerAction(LocalizerAction.DIE); } LocalizerResourceRequestEvent assoc = scheduled.get(req); if (assoc == null) { @@ -879,7 +884,9 @@ .handle( new ResourceLocalizedEvent(req, ConverterUtils .getPathFromYarnURL(stat.getLocalPath()), stat.getLocalSize())); - } catch (URISyntaxException e) { } + } catch (URISyntaxException e) { + LOG.error("Failed to parse local path. ", e); + } // unlocking the resource and removing it from scheduled resource // list @@ -903,7 +910,9 @@ LOG.error("local path for PRIVATE localization could not be " + "found. Disks might have failed.", e); } catch (URISyntaxException e) { - //TODO fail? Already translated several times... + // Fail. Already translated several times... + LOG.error("Failed to parse local path. ", e); + response.setLocalizerAction(LocalizerAction.DIE); } } break; Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java (working copy) @@ -189,7 +189,7 @@ try { this.monitoringThread.join(); } catch (InterruptedException e) { - ; + LOG.info("Interrupted during joining the monitoring thread in serviceStop: " + e); } } super.serviceStop(); @@ -548,7 +548,8 @@ } break; default: - // TODO: Wrong event. + // Wrong event. Should we throw a runtime exception? + LOG.error("Wrong monitoring event: " + monitoringEvent.getType()); } } } Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java (working copy) @@ -545,8 +545,9 @@ try { event = eventQueue.take(); } catch (InterruptedException e) { - LOG.error("Returning, interrupted : " + e); - return; // TODO: Kill RM. + LOG.error("Returning, interrupted. Existing, bbye" + e); + System.exit(-1); + return; // Killed RM. } try { Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java (working copy) @@ -632,8 +632,15 @@ /* keep the master in sync with the state machine */ this.stateMachine.doTransition(event.getType(), event); } catch (InvalidStateTransitonException e) { - LOG.error("Can't handle this event at current state", e); - /* TODO fail the application on the failed transition */ + LOG.error("Can't handle this event at current state for app" + appID + + "; Exception: ", e); + try { + /* fail the application on the failed transition */ + this.stateMachine.doTransition(RMAppEventType.KILL, new RMAppEvent(appID, RMAppEventType.KILL)); + } catch (InvalidStateTransitonException again) { + // Well, we tried... + LOG.error("Cannot kill the application " + appID + " either.", again); + } } if (oldState != getState()) { Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java (working copy) @@ -642,8 +642,16 @@ /* keep the master in sync with the state machine */ this.stateMachine.doTransition(event.getType(), event); } catch (InvalidStateTransitonException e) { - LOG.error("Can't handle this event at current state", e); - /* TODO fail the application on the failed transition */ + LOG.error("Can't handle this event at current state for app" + appAttemptID + + "; Exception: ", e); + try { + /* fail the application on the failed transition */ + this.stateMachine.doTransition(RMAppAttemptEventType.KILL, + new RMAppAttemptEvent(appAttemptID, RMAppAttemptEventType.KILL)); + } catch (InvalidStateTransitonException again) { + // Well, we tried... + LOG.error("Cannot kill the application " + appAttemptID + " either.", again); + } } if (oldState != getAppAttemptState()) { Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java (working copy) @@ -638,13 +638,19 @@ " request=" + request + " type=" + type); Resource capability = request.getCapability(); - int availableContainers = - node.getAvailableResource().getMemory() / capability.getMemory(); // TODO: A buggy - // application - // with this - // zero would - // crash the - // scheduler. + int availableContainers = 0; + try { + availableContainers = node.getAvailableResource().getMemory() / capability.getMemory(); + } catch (ArithmeticException e) { + // Preventing a buggy application + // with this + // zero would + // crash the + // scheduler. + LOG.error("The memory size of the request is 0, likely buggy application. "); + return 0; // assign 0 containers for this app + } + int assignedContainers = Math.min(assignableContainers, availableContainers); Index: hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/src/main/java/org/apache/hadoop/yarn/server/webproxy/WebAppProxy.java =================================================================== --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/src/main/java/org/apache/hadoop/yarn/server/webproxy/WebAppProxy.java (revision 1564845) +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/src/main/java/org/apache/hadoop/yarn/server/webproxy/WebAppProxy.java (working copy) @@ -128,6 +128,7 @@ try { proxyServer.join(); } catch (InterruptedException e) { + LOG.warn("Interrupted from join: " + e); } } }