diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java index 25aba77f539..25cc19da556 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java @@ -152,10 +152,14 @@ REINITIALIZED), START, new StartedAfterUpgradeTransition()) .addTransition(CANCEL_UPGRADING, EnumSet.of(CANCEL_UPGRADING, INIT), STOP, new StoppedAfterCancelUpgradeTransition()) + + // FROM REINITIALIZED .addTransition(REINITIALIZED, CANCEL_UPGRADING, CANCEL_UPGRADE, new CancelledAfterReinitTransition()) .addTransition(REINITIALIZED, READY, BECOME_READY, new ContainerBecomeReadyTransition(true)) + .addTransition(REINITIALIZED, REINITIALIZED, STOP, + new StoppedAfterUpgradeTransition()) .installTopology(); public ComponentInstance(Component component, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/instance/TestComponentInstance.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/instance/TestComponentInstance.java index 09652d7403b..f857353a629 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/instance/TestComponentInstance.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/instance/TestComponentInstance.java @@ -140,6 +140,42 @@ public void testContainerUpgradeFailed() throws Exception { .getId().toString()).getState()); } + @Test + public void testFailureAfterReinit() throws Exception { + ServiceContext context = TestComponent.createTestContext(rule, + "testContainerUpgradeFailed"); + Component component = context.scheduler.getAllComponents().entrySet() + .iterator().next().getValue(); + upgradeComponent(component); + + ComponentInstance instance = component.getAllComponentInstances().iterator() + .next(); + + ComponentInstanceEvent upgradeEvent = new ComponentInstanceEvent( + instance.getContainer().getId(), ComponentInstanceEventType.UPGRADE); + instance.handle(upgradeEvent); + + // NM finished updgrae + instance.handle(new ComponentInstanceEvent(instance.getContainer().getId(), + ComponentInstanceEventType.START)); + Assert.assertEquals("instance not running", + ContainerState.RUNNING_BUT_UNREADY, + component.getComponentSpec().getContainer(instance.getContainer() + .getId().toString()).getState()); + + ContainerStatus containerStatus = mock(ContainerStatus.class); + when(containerStatus.getExitStatus()).thenReturn( + ContainerExitStatus.ABORTED); + ComponentInstanceEvent stopEvent = new ComponentInstanceEvent( + instance.getContainer().getId(), ComponentInstanceEventType.STOP) + .setStatus(containerStatus); + // this is the call back from NM for the upgrade + instance.handle(stopEvent); + Assert.assertEquals("instance did not fail", ContainerState.FAILED_UPGRADE, + component.getComponentSpec().getContainer(instance.getContainer() + .getId().toString()).getState()); + } + @Test public void testCancelNothingToUpgrade() throws Exception { ServiceContext context = TestComponent.createTestContext(rule, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index 6716dbb02e9..a52b91172d3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -1729,7 +1729,10 @@ public void transition(ContainerImpl container, + "] for re-initialization !!"); container.wasLaunched = false; container.metrics.endRunningContainer(); - + // Remove the container from the resource-monitor. When container + // is launched again, it is added back to monitoring service. + container.dispatcher.getEventHandler().handle( + new ContainerStopMonitoringEvent(container.containerId, true)); container.launchContext = container.reInitContext.newLaunchContext; // Re configure the Retry Context @@ -1894,7 +1897,7 @@ public void transition(ContainerImpl container, ContainerEvent event) { if (container.containerMetrics != null) { container.containerMetrics .recordFinishTimeAndExitCode(clock.getTime(), container.exitCode); - container.containerMetrics.finished(); + container.containerMetrics.finished(false); } container.sendFinishedEvents(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java index 2a958494e8a..bca7c3fa1b3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java @@ -242,14 +242,18 @@ public synchronized void getMetrics(MetricsCollector collector, boolean all) { } } - public synchronized void finished() { + public synchronized void finished(boolean unregisterWithoutDelay) { if (!finished) { this.finished = true; if (timer != null) { timer.cancel(); timer = null; } - scheduleTimerTaskForUnregistration(); + if (!unregisterWithoutDelay) { + scheduleTimerTaskForUnregistration(); + } else { + ContainerMetrics.unregisterContainerMetrics(ContainerMetrics.this); + } this.pMemMBQuantiles.stop(); this.cpuCoreUsagePercentQuantiles.stop(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerStopMonitoringEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerStopMonitoringEvent.java index 240c5c067fc..810271e3bcb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerStopMonitoringEvent.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerStopMonitoringEvent.java @@ -22,8 +22,20 @@ public class ContainerStopMonitoringEvent extends ContainersMonitorEvent { + private final boolean forReInit; + public ContainerStopMonitoringEvent(ContainerId containerId) { super(containerId, ContainersMonitorEventType.STOP_MONITORING_CONTAINER); + forReInit = false; } + public ContainerStopMonitoringEvent(ContainerId containerId, + boolean forReInit) { + super(containerId, ContainersMonitorEventType.STOP_MONITORING_CONTAINER); + this.forReInit = forReInit; + } + + public boolean isForReInit() { + return forReInit; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java index b9e2c68abed..b7fca866791 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java @@ -533,7 +533,7 @@ public void run() { } catch (Exception e) { // Log the exception and proceed to the next container. LOG.warn("Uncaught exception in ContainersMonitorImpl " - + "while monitoring resource of " + containerId, e); + + "while monitoring resource of {}", containerId, e); } } if (LOG.isDebugEnabled()) { @@ -861,10 +861,12 @@ private void updateContainerMetrics(ContainersMonitorEvent monitoringEvent) { vmemLimitMBs, pmemLimitMBs, cpuVcores); break; case STOP_MONITORING_CONTAINER: + ContainerStopMonitoringEvent stopEvent = + (ContainerStopMonitoringEvent) monitoringEvent; usageMetrics = ContainerMetrics.getContainerMetrics( containerId); if (usageMetrics != null) { - usageMetrics.finished(); + usageMetrics.finished(stopEvent.isForReInit()); } break; case CHANGE_MONITORING_CONTAINER_RESOURCE: diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainerMetrics.java index 8b2bff1c28f..ce1baf90d86 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainerMetrics.java @@ -63,7 +63,7 @@ public void testContainerMetricsFlow() throws InterruptedException { assertEquals(ERR, 1, collector.getRecords().size()); collector.clear(); - metrics.finished(); + metrics.finished(false); metrics.getMetrics(collector, true); assertEquals(ERR, 1, collector.getRecords().size()); collector.clear(); @@ -137,8 +137,8 @@ public void testContainerMetricsFinished() throws InterruptedException { ContainerId containerId3 = ContainerId.newContainerId(appAttemptId, 3); ContainerMetrics metrics3 = ContainerMetrics.forContainer(system, containerId3, 1, 0); - metrics1.finished(); - metrics2.finished(); + metrics1.finished(false); + metrics2.finished(false); system.sampleMetrics(); system.sampleMetrics(); Thread.sleep(100);