From f0ac4153638e8de2a02e0907ed20bb7e0b6aebfa Mon Sep 17 00:00:00 2001 From: Elliott Clark Date: Mon, 8 Jul 2013 18:45:40 -0700 Subject: [PATCH] Add Suspend and Resume to Chaos Monkey --- .../hadoop/hbase/DistributedHBaseCluster.java | 12 ++++ ...IntegrationTestDataIngestSlowDeterministic.java | 13 ++-- .../org/apache/hadoop/hbase/util/ChaosMonkey.java | 64 +++++++++++++++++--- .../java/org/apache/hadoop/hbase/HBaseCluster.java | 24 +++++++- .../org/apache/hadoop/hbase/MiniHBaseCluster.java | 26 ++++++++ 5 files changed, 125 insertions(+), 14 deletions(-) diff --git hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java index 02aa504..71f58eb 100644 --- hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java +++ hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java @@ -112,6 +112,18 @@ public class DistributedHBaseCluster extends HBaseCluster { } @Override + public void suspendRegionServer(ServerName serverName) throws IOException { + LOG.info("Suspending RS: " + serverName.getServerName()); + clusterManager.suspend(ServiceType.HBASE_REGIONSERVER, serverName.getHostname()); + } + + @Override + public void resumeRegionServer(ServerName serverName) throws IOException { + LOG.info("Resume RS: " + serverName.getServerName()); + clusterManager.resume(ServiceType.HBASE_REGIONSERVER, serverName.getHostname()); + } + + @Override public void stopRegionServer(ServerName serverName) throws IOException { LOG.info("Stopping RS: " + serverName.getServerName()); clusterManager.stop(ServiceType.HBASE_REGIONSERVER, serverName.getHostname()); diff --git hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestDataIngestSlowDeterministic.java hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestDataIngestSlowDeterministic.java index 9d91a58..52734d4 100644 --- hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestDataIngestSlowDeterministic.java +++ hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestDataIngestSlowDeterministic.java @@ -23,16 +23,18 @@ import org.apache.hadoop.hbase.util.ChaosMonkey.RestartActiveMaster; import org.apache.hadoop.hbase.util.ChaosMonkey.RestartRandomRs; import org.apache.hadoop.hbase.util.ChaosMonkey.RestartRsHoldingMeta; import org.apache.hadoop.hbase.util.ChaosMonkey.RollingBatchRestartRs; +import org.apache.hadoop.hbase.util.ChaosMonkey.SuspendRandomRs; import org.junit.After; import org.junit.Before; import org.junit.Test; import org.junit.experimental.categories.Category; /** - * A system test which does large data ingestion and verify using {@link LoadTestTool}. - * It performs a set of actions deterministically using ChaosMonkey, then starts killing - * things randomly. You can configure how long should the load test run by using - * "hbase.IntegrationTestDataIngestSlowDeterministic.runtime" configuration parameter. + * A system test which does large data ingestion and verify using + * {@link org.apache.hadoop.hbase.util.LoadTestTool}. It performs a set of actions deterministically + * using ChaosMonkey, then starts killing things randomly. You can configure how long should the + * load test run by using "hbase.IntegrationTestDataIngestSlowDeterministic.runtime" configuration + * parameter. */ @Category(IntegrationTests.class) public class IntegrationTestDataIngestSlowDeterministic extends IngestIntegrationTestBase { @@ -50,7 +52,8 @@ public class IntegrationTestDataIngestSlowDeterministic extends IngestIntegratio new BatchRestartRs(5000, 0.5f), new RestartActiveMaster(5000), new RollingBatchRestartRs(5000, 1.0f), - new RestartRsHoldingMeta(35000) + new RestartRsHoldingMeta(35000), + new SuspendRandomRs(5000l) }; monkey = new ChaosMonkey(util, new ChaosMonkey.CompositeSequentialPolicy( new ChaosMonkey.DoActionsOncePolicy(CHAOS_EVERY_MS, actions), diff --git hbase-it/src/test/java/org/apache/hadoop/hbase/util/ChaosMonkey.java hbase-it/src/test/java/org/apache/hadoop/hbase/util/ChaosMonkey.java index 48a291f..1345d12 100644 --- hbase-it/src/test/java/org/apache/hadoop/hbase/util/ChaosMonkey.java +++ hbase-it/src/test/java/org/apache/hadoop/hbase/util/ChaosMonkey.java @@ -78,7 +78,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { private static final long ONE_SEC = 1000; private static final long FIVE_SEC = 5 * ONE_SEC; private static final long ONE_MIN = 60 * ONE_SEC; - private static final long TIMEOUT = ONE_MIN; + private static final long TIMEOUT = 2 * ONE_MIN; final IntegrationTestingUtility util; @@ -161,6 +161,11 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { return regionServers.toArray(new ServerName[regionServers.size()]); } + void sleep(long sleepTime) { + LOG.info("Sleeping for:" + sleepTime); + Threads.sleep(sleepTime); + } + protected void killMaster(ServerName server) throws IOException { LOG.info("Killing master:" + server); cluster.killMaster(server); @@ -191,6 +196,27 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { + cluster.getClusterStatus().getServersSize()); } + protected void suspendRs(ServerName server) throws IOException { + LOG.info("Suspending region server " + server.getHostname()); + cluster.suspendRegionServer(server); + + + } + + protected void resumeRs(ServerName server) throws IOException { + LOG.info("Resuming region server "+ server.getHostname()); + cluster.resumeRegionServer(server); + + try { + cluster.waitForRegionServerToStart(server.getHostname(), ONE_MIN); + } catch (IOException e) { + LOG.info("Looks like the region server died after resume."); + cluster.killRegionServer(server); + cluster.startRegionServer(server.getHostname()); + cluster.waitForRegionServerToStart(server.getHostname(), TIMEOUT); + } + } + protected void unbalanceRegions(ClusterStatus clusterStatus, List fromServers, List toServers, double fractionOfRegions) throws Exception { @@ -233,10 +259,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { this.sleepTime = sleepTime; } - void sleep(long sleepTime) { - LOG.info("Sleeping for:" + sleepTime); - Threads.sleep(sleepTime); - } + void restartMaster(ServerName server, long sleepTime) throws IOException { killMaster(server); @@ -278,7 +301,33 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { } } - public static class RestartRsHoldingMeta extends RestartActionBase { + public static class SuspendActionBase extends Action { + protected final long sleepTime; + + public SuspendActionBase(long sleepTime) { + this.sleepTime = sleepTime; + } + + protected void suspendRs(ServerName server, long sleepTime) throws IOException { + suspendRs(server); + sleep(sleepTime); + resumeRs(server); + } + } + + public static class SuspendRandomRs extends SuspendActionBase { + public SuspendRandomRs(long sleepTime) { + super(sleepTime); + } + + public void perform() throws Exception { + LOG.info("Performing Action: Suspend region server"); + ServerName server = selectRandomItem(getCurrentServers()); + suspendRs(server, sleepTime); + } + } + + public static class RestartRsHoldingMeta extends RestartRandomRs { public RestartRsHoldingMeta(long sleepTime) { super(sleepTime); } @@ -709,7 +758,8 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { new Pair(new RestartRandomRs(ONE_MIN), 2), new Pair(new RestartRsHoldingMeta(FIVE_SEC), 1), new Pair(new BatchRestartRs(FIVE_SEC, 0.5f), 2), - new Pair(new RollingBatchRestartRs(FIVE_SEC, 1.0f), 2) + new Pair(new RollingBatchRestartRs(FIVE_SEC, 1.0f), 2), + new Pair(new SuspendRandomRs(ONE_MIN), 2) ); public static final String EVERY_MINUTE_RANDOM_ACTION_POLICY = "EVERY_MINUTE_RANDOM_ACTION_POLICY"; diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java index b5d53f6..bdf136d 100644 --- hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java +++ hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java @@ -142,6 +142,22 @@ public abstract class HBaseCluster implements Closeable, Configurable { public abstract void stopRegionServer(ServerName serverName) throws IOException; /** + * Suspends the region server. This is either done by suspending the RegionServer thread, in the + * case of minicluster, or by sending SIGSTOP, in the case of a distributed cluster. + * @param serverName Server name of the server to suspend + * @throws IOException + */ + public abstract void suspendRegionServer(ServerName serverName) throws IOException; + + /** + * Resume the region server. This will resume the region server and attempt to make sure that + * the region server is running properly. + * @param serverName ServerName of the server to resume. + * @throws IOException If resuming and starting both timeout. + */ + public abstract void resumeRegionServer(ServerName serverName) throws IOException; + + /** * Wait for the specified region server to join the cluster * @return whether the operation finished with success * @throws IOException if something goes wrong or timeout occurs @@ -150,9 +166,13 @@ public abstract class HBaseCluster implements Closeable, Configurable { throws IOException { long start = System.currentTimeMillis(); while ((System.currentTimeMillis() - start) < timeout) { - for (ServerName server : getClusterStatus().getServers()) { + ClusterStatus status = getClusterStatus(); + for (ServerName server : status.getServers()) { if (server.getHostname().equals(hostname)) { - return; + ServerLoad load = status.getLoad(server); + if (load.obtainServerLoadPB().getReportEndTime() >= start) { + return; + } } } Threads.sleep(100); diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 75cca70..06bc919 100644 --- hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -252,6 +252,16 @@ public class MiniHBaseCluster extends HBaseCluster { } @Override + public void suspendRegionServer(ServerName serverName) throws IOException { + suspendRegionServer(getRegionServerIndex(serverName)); + } + + @Override + public void resumeRegionServer(ServerName serverName) throws IOException { + resumeRegionServer(getRegionServerIndex(serverName)); + } + + @Override public void stopRegionServer(ServerName serverName) throws IOException { stopRegionServer(getRegionServerIndex(serverName)); } @@ -346,6 +356,22 @@ public class MiniHBaseCluster extends HBaseCluster { return server; } + @SuppressWarnings("deprecation") + private void suspendRegionServer(int regionServerIndex) { + JVMClusterUtil.RegionServerThread server = + hbaseCluster.getRegionServers().get(regionServerIndex); + LOG.info("Suspending " + server.toString()); + server.suspend(); + } + + @SuppressWarnings("deprecation") + private void resumeRegionServer(int regionServerIndex) { + JVMClusterUtil.RegionServerThread server = + hbaseCluster.getRegionServers().get(regionServerIndex); + LOG.info("Suspending " + server.toString()); + server.resume(); + } + /** * Wait for the specified region server to stop. Removes this thread from list * of running threads. -- 1.7.10.2 (Apple Git-33)