diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/AMLauncher.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/AMLauncher.java index 7051f8c..d803ca7 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/AMLauncher.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/AMLauncher.java @@ -62,6 +62,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptLaunchFailedEvent; import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.util.timeline.TimelineUtils; @@ -308,8 +309,8 @@ public void run() { String message = "Error launching " + application.getAppAttemptId() + ". Got exception: " + StringUtils.stringifyException(ie); LOG.info(message); - handler.handle(new RMAppAttemptEvent(application - .getAppAttemptId(), RMAppAttemptEventType.LAUNCH_FAILED, message)); + handler.handle(new RMAppAttemptLaunchFailedEvent( + application.getAppAttemptId(), ie)); } break; case CLEANUP: diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index d66a97d..7b66530 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -21,6 +21,7 @@ import static org.apache.hadoop.yarn.util.StringHelper.pjoin; import java.io.IOException; +import java.net.SocketTimeoutException; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; @@ -89,6 +90,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptContainerFinishedEvent; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptLaunchFailedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptRegistrationEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptStatusupdateEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptUnregistrationEvent; @@ -1589,6 +1591,23 @@ private static boolean shouldCountTowardsNodeBlacklisting(int exitStatus) { } } + /** + * Check whether we should blacklist a node on which our AM launch failed + * for a given cause. + * @param launchFailureCause the cause of the launch failure + * @return true if the node should be blacklisted + */ + private static boolean shouldCountAMLaunchFailureTowardsNodeBlacklisting( + Throwable launchFailureCause) { + for(int depth = 0; depth < 3 && launchFailureCause != null; depth++) { + if(launchFailureCause instanceof SocketTimeoutException) { + return true; + } + launchFailureCause = launchFailureCause.getCause(); + } + return false; + } + private static final class UnmanagedAMAttemptSavedTransition extends AMLaunchedTransition { @Override @@ -1613,8 +1632,18 @@ public LaunchFailedTransition() { public void transition(RMAppAttemptImpl appAttempt, RMAppAttemptEvent event) { + final RMAppAttemptLaunchFailedEvent amLaunchFailedEvent = + ((RMAppAttemptLaunchFailedEvent) event); + // Use diagnostic from launcher - appAttempt.diagnostics.append(event.getDiagnosticMsg()); + appAttempt.diagnostics.append(amLaunchFailedEvent.getDiagnosticMsg()); + + // check if the node where we were trying to launch the AM container + // should be blacklisted. + if (shouldCountAMLaunchFailureTowardsNodeBlacklisting( + amLaunchFailedEvent.rootCause)) { + appAttempt.addAMNodeToBlackList(appAttempt.masterContainer.getNodeId()); + } // Tell the app, scheduler super.transition(appAttempt, event); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/event/RMAppAttemptLaunchFailedEvent.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/event/RMAppAttemptLaunchFailedEvent.java new file mode 100644 index 0000000..a67a288 --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/event/RMAppAttemptLaunchFailedEvent.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event; + +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType; + +/** + * An instance of this class represents a launch failure of a RMAppAttempt. + */ +public class RMAppAttemptLaunchFailedEvent extends RMAppAttemptEvent { + /** + * The exception that has caused the launch failure. + */ + public final Throwable rootCause; + + public RMAppAttemptLaunchFailedEvent(ApplicationAttemptId appAttemptId, + Throwable cause) { + super(appAttemptId, RMAppAttemptEventType.LAUNCH_FAILED, + "Error launching " + appAttemptId + ". Got exception: " + + StringUtils.stringifyException(cause)); + rootCause = cause; + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java index 2ff4fb2..4adc09d 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java @@ -87,6 +87,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptLaunchFailedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; @@ -937,14 +938,14 @@ public MockAM sendAMLaunched(ApplicationAttemptId appAttemptId) return am; } - public void sendAMLaunchFailed(ApplicationAttemptId appAttemptId) - throws Exception { + public MockAM sendAMLaunchFailed(ApplicationAttemptId appAttemptId, + Throwable cause) throws Exception { MockAM am = new MockAM(getRMContext(), masterService, appAttemptId); waitForState(am.getApplicationAttemptId(), RMAppAttemptState.ALLOCATED); - getRMContext().getDispatcher().getEventHandler() - .handle(new RMAppAttemptEvent(appAttemptId, - RMAppAttemptEventType.LAUNCH_FAILED, "Failed")); + getRMContext().getDispatcher().getEventHandler().handle( + new RMAppAttemptLaunchFailedEvent(appAttemptId, cause)); drainEventsImplicitly(); + return am; } @Override @@ -1150,6 +1151,18 @@ public static MockAM launchAM(RMApp app, MockRM rm, MockNM nm) return am; } + public static MockAM launchAndFailAM(RMApp app, MockRM rm, MockNM nm, + Throwable cause) throws Exception { + rm.drainEventsImplicitly(); + RMAppAttempt attempt = waitForAttemptScheduled(app, rm); + LOG.info("Launching AM " + attempt.getAppAttemptId()); + nm.nodeHeartbeat(true); + rm.drainEventsImplicitly(); + MockAM am = rm.sendAMLaunchFailed(attempt.getAppAttemptId(), cause); + rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.FAILED); + return am; + } + public static MockAM launchUAM(RMApp app, MockRM rm, MockNM nm) throws Exception { rm.drainEventsImplicitly(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestNodeBlacklistingOnAMFailures.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestNodeBlacklistingOnAMFailures.java index 75ef5c7..2f10793 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestNodeBlacklistingOnAMFailures.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestNodeBlacklistingOnAMFailures.java @@ -18,6 +18,8 @@ package org.apache.hadoop.yarn.server.resourcemanager; +import java.io.IOException; +import java.net.SocketTimeoutException; import java.util.ArrayList; import java.util.List; @@ -158,6 +160,84 @@ public void testNodeBlacklistingOnAMFailure() throws Exception { } @Test(timeout = 100000) + public void testNodeBlacklistingOnAMLaunchFailure() throws Exception { + YarnConfiguration conf = new YarnConfiguration(); + // identify nodes by their host name + port so that we can distinguish + // them on a mini YARN cluster where all NMs are on the same host + conf.setBoolean(YarnConfiguration.RM_SCHEDULER_INCLUDE_PORT_IN_NODE_NAME, + true); + conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, + ResourceScheduler.class); + conf.setBoolean(YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_ENABLED, + true); + // make sure node blacklisting is always enabled regardless of how many + // percentage of nodes are unusable + conf.setFloat(YarnConfiguration. + AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD, 1.0f); + + MockRM rm = startRM(conf); + + // Register 5 nodes, so that we can blacklist at least one if AM launch + // failed. As per calculation it will be like, 5nodes * 0.2 (default)=1. + // First register 2 nodes, and after AM launched register 3 more nodes. + MockNM nm1 = + new MockNM("127.0.0.1:1234", 80000, rm.getResourceTrackerService()); + nm1.registerNode(); + MockNM nm2 = + new MockNM("127.0.0.2:2345", 80000, rm.getResourceTrackerService()); + nm2.registerNode(); + + RMApp app = rm.submitApp(200); + + Throwable launchFailureCause = + new IOException(new IOException(new SocketTimeoutException())); + MockAM am1 = MockRM.launchAndFailAM(app, rm, nm1, launchFailureCause); + + NodeId nodeWhereAMRan = app.getRMAppAttempt(am1.getApplicationAttemptId()) + .getMasterContainer().getNodeId(); + MockNM currentNode, otherNode; + if (nodeWhereAMRan.equals(nm1.getNodeId())) { + currentNode = nm1; + otherNode = nm2; + } else { + currentNode = nm2; + otherNode = nm1; + } + System.out.println("Attempt " + am1.getApplicationAttemptId() + " was at " + nodeWhereAMRan.toString()); + + // restart the am + RMAppAttempt attempt = MockRM.waitForAttemptScheduled(app, rm); + // Try the current node + currentNode.nodeHeartbeat(true); + rm.drainEvents(); + // Now try the other node + otherNode.nodeHeartbeat(true); + rm.drainEvents(); + // Now the AM container should be allocated + MockRM.waitForState(attempt, RMAppAttemptState.ALLOCATED, 20000); + + MockAM am2 = rm.sendAMLaunched(attempt.getAppAttemptId()); + rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED); + nodeWhereAMRan = attempt.getMasterContainer().getNodeId(); + System.out.println("Attempt " + attempt.getAppAttemptId() + " was at " + nodeWhereAMRan.toString()); + + // The other node should now receive the assignment + Assert.assertEquals( + "After blacklisting, AM should have run on the other node", + otherNode.getNodeId(), nodeWhereAMRan); + + am2.registerAppAttempt(); + rm.waitForState(app.getApplicationId(), RMAppState.RUNNING); + + List allocatedContainers = + TestAMRestart.allocateContainers(currentNode, am2, 1); + Assert.assertEquals( + "Even though AM is blacklisted from the node, application can " + + "still allocate non-AM containers there", + currentNode.getNodeId(), allocatedContainers.get(0).getNodeId()); + } + + @Test(timeout = 100000) public void testNodeBlacklistingOnAMFailureStrictNodeLocality() throws Exception { YarnConfiguration conf = new YarnConfiguration();