diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index d2d706d..04d7cc7 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -801,6 +801,12 @@ public void handleTransitionToStandBy() { LOG.fatal("Failed to transition RM to Standby mode."); ExitUtil.terminate(1, e); } + } else { + LOG.fatal("RM was fenced in a non-HA configuration. Shutting down RM to " + + "prevent accidental corruption of the state store. Please check " + + "that no other RM using the configured state store is set to use " + + "the same cluster ID before attempting to restart the RM."); + ExitUtil.terminate(1); } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index de273c4..dbca983 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -37,7 +37,6 @@ import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; -import org.apache.hadoop.ipc.CallerContext; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.token.delegation.DelegationKey; import org.apache.hadoop.service.AbstractService; @@ -1092,26 +1091,31 @@ protected void handleStoreEvent(RMStateStoreEvent event) { * @param failureCause the exception due to which the operation failed */ protected void notifyStoreOperationFailed(Exception failureCause) { - if (isFencedState()) { - return; - } - if (notifyStoreOperationFailedInternal(failureCause)) { + if (!isFencedState() && + notifyStoreOperationFailedInternal(failureCause)) { updateFencedState(); } } + /** + * This method notifies the {@link ResourceManager} that it has been fenced. + */ + protected void fence() { + LOG.warn("State-store fenced ! Transitioning RM to standby"); + + Thread standByTransitionThread = new Thread(new StandByTransitionThread()); + standByTransitionThread.setName("StandByTransitionThread Handler"); + standByTransitionThread.start(); + } + @SuppressWarnings("unchecked") private boolean notifyStoreOperationFailedInternal( Exception failureCause) { boolean isFenced = false; LOG.error("State store operation failed ", failureCause); if (HAUtil.isHAEnabled(getConfig())) { - LOG.warn("State-store fenced ! Transitioning RM to standby"); isFenced = true; - Thread standByTransitionThread = - new Thread(new StandByTransitionThread()); - standByTransitionThread.setName("StandByTransitionThread Handler"); - standByTransitionThread.start(); + fence(); } else if (YarnConfiguration.shouldRMFailFast(getConfig())) { LOG.fatal("Fail RM now due to state-store error!"); rmDispatcher.getEventHandler().handle( diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/StoreFencedException.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/StoreFencedException.java index 1f8eb16..7797a3a 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/StoreFencedException.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/StoreFencedException.java @@ -18,11 +18,29 @@ package org.apache.hadoop.yarn.server.resourcemanager.recovery; import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; +/** + * This class is used by {@link RMStateStore} implementations when calling the + * {@link ResourceManager#noteFailure(java.lang.Exception)} method to signal to + * the {@link ResourceManager} that it has been fenced. + */ public class StoreFencedException extends YarnException { private static final long serialVersionUID = 1L; + /** + * Create a new {@code StoreFencedException}. + */ public StoreFencedException() { super("RMStateStore has been fenced"); } + + /** + * Create a new {@code StoreFencedException} with a given cause. + * + * @param cause the cause + */ + public StoreFencedException(Throwable cause) { + super("RMStateStore has been fenced", cause); + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java index 51bb74d..8fcbdfe 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java @@ -57,6 +57,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.impl.pb.ApplicationStateDataPBImpl; import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.impl.pb.EpochPBImpl; import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.KeeperException.NoAuthException; import org.apache.zookeeper.ZooDefs; import org.apache.zookeeper.data.ACL; import org.apache.zookeeper.data.Id; @@ -262,11 +263,10 @@ public synchronized void startInternal() throws Exception { create(zkRootNodePath); setRootNodeAcls(); delete(fencingNodePath); - if (HAUtil.isHAEnabled(getConfig()) && !HAUtil - .isAutomaticFailoverEnabled(getConfig())) { - verifyActiveStatusThread = new VerifyActiveStatusThread(); - verifyActiveStatusThread.start(); - } + + verifyActiveStatusThread = new VerifyActiveStatusThread(); + verifyActiveStatusThread.start(); + create(rmAppRoot); create(rmDTSecretManagerRoot); create(dtMasterKeysRootPath); @@ -308,7 +308,7 @@ private void setRootNodeAcls() throws Exception { } @Override - protected synchronized void closeInternal() throws Exception { + protected void closeInternal() throws Exception { if (verifyActiveStatusThread != null) { verifyActiveStatusThread.interrupt(); verifyActiveStatusThread.join(1000); @@ -1032,8 +1032,10 @@ public void run() { } catch (InterruptedException ie) { LOG.info(getName() + " thread interrupted! Exiting!"); interrupt(); + } catch (NoAuthException e) { + fence(); } catch (Exception e) { - notifyStoreOperationFailed(new StoreFencedException()); + notifyStoreOperationFailed(new StoreFencedException(e)); } } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStore.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStore.java index 19d3064..e7e5882 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStore.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStore.java @@ -53,7 +53,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptMetrics; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.security.ClientToAMTokenSecretManagerInRM; -import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.ZooDefs.Perms; import org.apache.zookeeper.data.ACL; @@ -69,8 +68,11 @@ import java.io.IOException; import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; import javax.crypto.SecretKey; +import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; +import static org.junit.Assert.assertFalse; public class TestZKRMStateStore extends RMStateStoreTestBase { @@ -515,4 +517,71 @@ public void testDuplicateRMAppDeletion() throws Exception { } store.close(); } + + /** + * Test that an RM in non-HA mode will still respond to being fenced. + * + * @throws Exception if there's an issue updating the ACLs + */ + @Test + public void testFencingWithoutHA() throws Exception { + Configuration conf = new Configuration(); + + conf.setBoolean(YarnConfiguration.RM_HA_ENABLED, false); + conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true); + conf.set(YarnConfiguration.RM_STORE, + ZKRMStateStore.class.getCanonicalName()); + conf.set(YarnConfiguration.RM_ZK_ADDRESS, + curatorTestingServer.getConnectString()); + conf.setInt(YarnConfiguration.RM_ZK_TIMEOUT_MS, 100); + + final AtomicBoolean active = new AtomicBoolean(true); + + // The handleTransitionToStandBy() method will exit the VM, which jUnit + // doesn't like. Instead, we override it and note that it has been called. + MockRM rm = new MockRM(conf) { + @Override + public void handleTransitionToStandBy() { + active.set(false); + } + }; + + rm.init(conf); + + try { + rm.start(); + + assertEquals("The RM did not come up in active state", + HAServiceState.ACTIVE, rm.getRMContext().getHAServiceState()); + + // Lock the ZK node so that the state store thinks it's been fenced + String workPath = + conf.get(YarnConfiguration.ZK_RM_STATE_STORE_PARENT_PATH, + YarnConfiguration.DEFAULT_ZK_RM_STATE_STORE_PARENT_PATH); + String rootPath = workPath + "/" + ZKRMStateStore.ROOT_ZNODE_NAME; + + // We borrow the ACLs from the root node because the working node doesn't + // have any + List acls = curatorFramework.getACL().forPath(workPath); + + for (ACL acl : acls) { + System.out.println(workPath + ": " + acl); + acl.setPerms(0); + } + + // We only need to lock out the path where the fencing node is created + curatorFramework.setACL().withACL(acls).forPath(rootPath); + + int count = 100; + + while (active.get() && (count > 0)) { + count -= 1; + Thread.sleep(50); + } + + assertFalse("The RM did not enter standby state within 5s", active.get()); + } finally { + rm.stop(); + } + } }