diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java index e5bb6e5..98f7aa1 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java @@ -23,6 +23,7 @@ import java.net.InetSocketAddress; import java.util.Map; import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -101,7 +102,8 @@ private boolean autoFailoverEnabled; private EmbeddedElectorService embeddedElector; - + // Boolean flag to check refreshAll failed + private AtomicBoolean reinitActiveServices = new AtomicBoolean(false); private Server server; // Address to use for binding. May be a wildcard address. @@ -309,15 +311,25 @@ public synchronized void transitionToActive( UserGroupInformation user = checkAccess("transitionToActive"); checkHaStateChange(reqInfo); try { - rm.transitionToActive(); + rm.transitionToActive(reinitActiveServices.get()); + reinitActiveServices.set(false); // call all refresh*s for active RM to get the updated configurations. refreshAll(); - RMAuditLogger.logSuccess(user.getShortUserName(), - "transitionToActive", "RMHAProtocolService"); + RMAuditLogger.logSuccess(user.getShortUserName(), "transitionToActive", + "RMHAProtocolService"); } catch (Exception e) { RMAuditLogger.logFailure(user.getShortUserName(), "transitionToActive", - "", "RMHAProtocolService", - "Exception transitioning to active"); + "", "RMHAProtocolService", "Exception transitioning to active"); + if (rmContext.getHAServiceState() + == HAServiceProtocol.HAServiceState.ACTIVE) { + try { + rm.transitionToStandby(false); + rm.reinitialize(true); + reinitActiveServices.set(false); + } catch (Exception re) { + reinitActiveServices.set(true); + } + } throw new ServiceFailedException( "Error when transitioning to Active mode", e); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index 1b606b4..a56b661 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -990,12 +990,14 @@ protected boolean areActiveServicesRunning() { return activeServices != null && activeServices.isInState(STATE.STARTED); } - synchronized void transitionToActive() throws Exception { + synchronized void transitionToActive(boolean reinit) throws Exception { if (rmContext.getHAServiceState() == HAServiceProtocol.HAServiceState.ACTIVE) { LOG.info("Already in active state"); return; } - + if (reinit) { + reinitialize(true); + } LOG.info("Transitioning to active state"); this.rmLoginUGI.doAs(new PrivilegedExceptionAction() { @@ -1038,7 +1040,7 @@ protected void serviceStart() throws Exception { if (this.rmContext.isHAEnabled()) { transitionToStandby(true); } else { - transitionToActive(); + transitionToActive(false); } startWepApp(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java index 0200e85..cc2ee02 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java @@ -52,6 +52,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration; import org.codehaus.jettison.json.JSONException; import org.codehaus.jettison.json.JSONObject; import org.junit.Assert; @@ -515,6 +516,42 @@ public void run() { rm.stop(); } + @Test(timeout = 90000) + public void testTransitionedToActiveRefreshFail() throws Exception { + configuration.setBoolean(YarnConfiguration.AUTO_FAILOVER_ENABLED, false); + YarnConfiguration conf = new YarnConfiguration(configuration); + configuration = new CapacitySchedulerConfiguration(conf); + rm = new MockRM(configuration) { + @Override + protected AdminService createAdminService() { + return new AdminService(this, getRMContext()) { + @Override + protected void setConfig(Configuration conf) { + super.setConfig(configuration); + } + }; + } + }; + rm.init(configuration); + rm.start(); + final StateChangeRequestInfo requestInfo = + new StateChangeRequestInfo( + HAServiceProtocol.RequestSource.REQUEST_BY_USER); + // Transition to active call with invalid value + configuration.set("yarn.scheduler.capacity.root.default.capacity", "200"); + try { + rm.adminService.transitionToActive(requestInfo); + } catch (Exception e) { + assertTrue("Error when transitioning to Active mode".contains(e + .getMessage())); + } + assertEquals(HAServiceState.STANDBY, rm.getRMContext().getHAServiceState()); + // Making correct conf and check the state + configuration.set("yarn.scheduler.capacity.root.default.capacity", "100"); + rm.adminService.transitionToActive(requestInfo); + assertEquals(HAServiceState.ACTIVE, rm.getRMContext().getHAServiceState()); + } + @Test public void testFailoverClearsRMContext() throws Exception { configuration.setBoolean(YarnConfiguration.AUTO_FAILOVER_ENABLED, false);