From bac5769caa92548ddbc5a9c7dae8e4e3d6667cb0 Mon Sep 17 00:00:00 2001 From: Elliott Clark Date: Fri, 5 Jul 2013 15:04:40 -0700 Subject: [PATCH] add test for mttr --- ...grationTestRebalanceAndKillServersTargeted.java | 2 +- .../hadoop/hbase/mttr/IntegrationTestMTTR.java | 458 ++++++++++++++++++++ .../org/apache/hadoop/hbase/util/ChaosMonkey.java | 82 +++- .../hadoop/hbase/util/AbstractHBaseTool.java | 3 +- 4 files changed, 531 insertions(+), 14 deletions(-) create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java diff --git hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestRebalanceAndKillServersTargeted.java hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestRebalanceAndKillServersTargeted.java index 1f4c279..e5fdd5b 100644 --- hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestRebalanceAndKillServersTargeted.java +++ hbase-it/src/test/java/org/apache/hadoop/hbase/IntegrationTestRebalanceAndKillServersTargeted.java @@ -74,7 +74,7 @@ public class IntegrationTestRebalanceAndKillServersTargeted extends IngestIntegr private static final long WAIT_AFTER_BALANCE_MS = 5 * 1000; @Override - protected void perform() throws Exception { + public void perform() throws Exception { ClusterStatus status = this.cluster.getClusterStatus(); List victimServers = new LinkedList(status.getServers()); int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size()); diff --git hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java new file mode 100644 index 0000000..eac8072 --- /dev/null +++ hbase-it/src/test/java/org/apache/hadoop/hbase/mttr/IntegrationTestMTTR.java @@ -0,0 +1,458 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.mttr; + +import com.google.common.base.Objects; +import org.apache.commons.lang.RandomStringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.ClusterStatus; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.IntegrationTestingUtility; +import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.filter.KeyOnlyFilter; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.ChaosMonkey; +import org.apache.hadoop.hbase.util.LoadTestTool; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import static junit.framework.Assert.assertEquals; + +/** + * Integration test that should benchmark how fast HBase can recover from failures. This test starts + * different threads: + *
    + *
  1. + * Load Test Tool.
    + * This runs so that all RegionServers will have some load and HLogs will be full. + *
  2. + *
  3. + * Scan thread.
    + * This thread runs a very short scan over and over again recording how log it takes to respond. + * The longest response is assumed to be the time it took to recover. + *
  4. + *
  5. + * Put thread.
    + * This thread just like the scan thread except it does a very small put. + *
  6. + *
  7. + * Admin thread.
    + * This thread will continually go to the master to try and get the cluster status. Just like the + * put and scan threads, the time to respond is recorded. + *
  8. + *
  9. + * Chaos Monkey thread.
    + * This thread runs a ChaosMonkey.Action. + *
  10. + *
+ *

+ * The ChaosMonkey actions currently run are: + *

    + *
  • Restart the RegionServer holding meta.
  • + *
  • Restart the RegionServer holding the table the scan and put threads are targeting.
  • + *
  • Move the Regions of the table used by the scan and put threads.
  • + *
  • Restart the master.
  • + *
+ *

+ * At the end of the test a log line is output on the INFO level containing the timing data that was + */ +public class IntegrationTestMTTR { + /** + * Constants. + */ + private static final byte[] FAMILY = Bytes.toBytes("d"); + private static final Log LOG = LogFactory.getLog(IntegrationTestMTTR.class); + private static final long SLEEP_TIME = 60 * 1000l; + + /** + * Configurable table names. + */ + private static String tableName; + private static byte[] tableNameBytes; + private static String loadTableName; + + /** + * Util to get at the cluster. + */ + private static IntegrationTestingUtility util; + + /** + * Executor for test threads. + */ + private static ExecutorService executorService; + + /** + * All of the chaos monkey actions used. + */ + private static ChaosMonkey.Action restartRSAction; + private static ChaosMonkey.Action restartMetaAction; + private static ChaosMonkey.Action moveRegionAction; + private static ChaosMonkey.Action restartMasterAction; + + /** + * The load test tool used to create load and make sure that HLogs aren't empty. + */ + private static LoadTestTool loadTool; + + @BeforeClass + public static void setUp() throws Exception { + // Set up the integration test util + if (util == null) { + util = new IntegrationTestingUtility(); + } + + // Make sure there are three servers. + util.initializeCluster(3); + + // Set up the load test tool. + loadTool = new LoadTestTool(); + loadTool.setConf(util.getConfiguration()); + + // Create executor with enough threads to restart rs's, + // run scans, puts, admin ops and load test tool. + executorService = Executors.newFixedThreadPool(8); + + // Set up the tables needed. + setupTables(); + + // Set up the actions. + setupActions(); + } + + private static void setupActions() throws IOException { + // Set up the action that will restart a region server holding a region from our table + // because this table should only have one region we should be good. + restartRSAction = new ChaosMonkey.RestartRsHoldingTable(SLEEP_TIME, tableName); + + // Set up the action that will kill the region holding meta. + restartMetaAction = new ChaosMonkey.RestartRsHoldingMeta(SLEEP_TIME); + + // Set up the action that will move the regions of our table. + moveRegionAction = new ChaosMonkey.MoveRegionsOfTable(SLEEP_TIME, tableName); + + // Kill the master (No sleep time because there is only one master running at this time.) + restartMasterAction = new ChaosMonkey.RestartActiveMaster(0l); + + // Give the action the access to the cluster. + ChaosMonkey.ActionContext actionContext = new ChaosMonkey.ActionContext(util); + restartRSAction.init(actionContext); + restartMetaAction.init(actionContext); + moveRegionAction.init(actionContext); + restartMasterAction.init(actionContext); + } + + private static void setupTables() throws IOException { + // Get the table name. + tableName = util.getConfiguration() + .get("hbase.IntegrationTestMTTR.tableName", "IntegrationTestMTTR"); + tableNameBytes = Bytes.toBytes(tableName); + + loadTableName = util.getConfiguration() + .get("hbase.IntegrationTestMTTR.loadTableName", "IntegrationTestMTTRLoadTestTool"); + + if (util.getHBaseAdmin().tableExists(tableNameBytes)) { + util.deleteTable(tableNameBytes); + } + + if (util.getHBaseAdmin().tableExists(loadTableName)) { + util.deleteTable(loadTableName); + } + + // Create the table. If this fails then fail everything. + HTableDescriptor tableDescriptor = new HTableDescriptor(tableNameBytes); + + // Make the max file size huge so that splits don't happen during the test. + tableDescriptor.setMaxFileSize(2 * 1024 * 1024 * 1024); + + HColumnDescriptor descriptor = new HColumnDescriptor(FAMILY); + descriptor.setMaxVersions(1); + tableDescriptor.addFamily(descriptor); + util.getHBaseAdmin().createTable(tableDescriptor); + + // Setup the table for LoadTestTool + int ret = loadTool.run(new String[]{"-tn", loadTableName, "-init_only"}); + assertEquals("Failed to initialize LoadTestTool", 0, ret); + } + + @AfterClass + public static void after() throws IOException { + // Clean everything up. + util.restoreCluster(); + util = null; + + // Stop the threads so that we know everything is complete. + executorService.shutdown(); + executorService = null; + + // Clean up the actions. + moveRegionAction = null; + restartMetaAction = null; + restartRSAction = null; + restartMasterAction = null; + + loadTool = null; + } + + @Test + public void testRestartRsHoldingTable() throws Exception { + run(new ActionCallable(restartRSAction), "RestartRsHoldingTable"); + } + + @Test + public void testKillRsHoldingMeta() throws Exception { + run(new ActionCallable(restartMetaAction), "KillRsHoldingMeta"); + } + + @Test + public void testMoveRegion() throws Exception { + run(new ActionCallable(moveRegionAction), "MoveRegion"); + } + + @Test + public void testRestartMaster() throws Exception { + run(new ActionCallable(restartMasterAction), "RestartMaster"); + } + + public void run(Callable monkeyCallable, String testName) throws Exception { + int maxIters = util.getHBaseClusterInterface().isDistributedCluster() ? 10 : 3; + + // Array to keep track of times. + ArrayList timesPutMs = new ArrayList(maxIters); + ArrayList timesScanMs = new ArrayList(maxIters); + ArrayList timesAdminMs = new ArrayList(maxIters); + long start = System.nanoTime(); + + // We're going to try this multiple times + for (int fullIterations = 0; fullIterations < maxIters; fullIterations++) { + // Create and start executing a callable that will kill the servers + Future monkeyFuture = executorService.submit(monkeyCallable); + + // Pass that future to the timing Callables. + Future putFuture = executorService.submit(new PutCallable(monkeyFuture)); + Future scanFuture = executorService.submit(new ScanCallable(monkeyFuture)); + Future adminFuture = executorService.submit(new AdminCallable(monkeyFuture)); + + Future loadFuture = executorService.submit(new LoadCallable(monkeyFuture)); + + monkeyFuture.get(); + loadFuture.get(); + + // Get the values from the futures. + long putTime = TimeUnit.MILLISECONDS.convert(putFuture.get(), TimeUnit.NANOSECONDS); + long scanTime = TimeUnit.MILLISECONDS.convert(scanFuture.get(), TimeUnit.NANOSECONDS); + long adminTime = TimeUnit.MILLISECONDS.convert(adminFuture.get(), TimeUnit.NANOSECONDS); + + // Store the times to display later. + timesPutMs.add(putTime); + timesScanMs.add(scanTime); + timesAdminMs.add(adminTime); + + // Wait some time for everything to settle down. + Thread.sleep(5000l); + } + + long runtimeMs = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); + + Objects.ToStringHelper helper = Objects.toStringHelper("MTTRResults") + .add("timesPutMs", timesPutMs) + .add("timesScanMs", timesScanMs) + .add("timesAdminMs", timesAdminMs) + .add("totalRuntimeMs", runtimeMs) + .add("name", testName); + + // Log the info + LOG.info(helper.toString()); + } + + /** + * Base class for actions that need to record the time needed to recover from a failure. + */ + public abstract class TimingCallable implements Callable { + protected final Future future; + + public TimingCallable(Future f) { + future = f; + } + + @Override + public Long call() throws Exception { + // TODO(eclark): Do more than just max. Full histogram support. + long maxTime = 0; + int numAfterDone = 0; + // Keep trying until the rs is back up and we've gotten a put through + while (numAfterDone < 10) { + long start = System.nanoTime(); + try { + boolean result = doAction(); + if (result && future.isDone()) { + numAfterDone ++; + } + } catch (Exception e) { + numAfterDone = 0; + } + maxTime = Math.max(maxTime, System.nanoTime() - start); + } + return maxTime; + } + + protected abstract boolean doAction() throws Exception; + } + + /** + * Callable that will keep putting small amounts of data into a table + * until the future supplied returns. It keeps track of the max time. + */ + public class PutCallable extends TimingCallable { + + private final HTable table; + + public PutCallable(Future f) throws IOException { + super(f); + this.table = new HTable(util.getConfiguration(), tableNameBytes); + } + + @Override + protected boolean doAction() throws Exception { + Put p = new Put(Bytes.toBytes(RandomStringUtils.randomAlphanumeric(5))); + p.add(FAMILY, Bytes.toBytes("\0"), Bytes.toBytes(RandomStringUtils.randomAscii(5))); + table.put(p); + table.flushCommits(); + return true; + } + } + + /** + * Callable that will keep scanning for small amounts of data until the + * supplied future returns. Returns the max time taken to scan. + */ + public class ScanCallable extends TimingCallable { + private final HTable table; + + public ScanCallable(Future f) throws IOException { + super(f); + this.table = new HTable(util.getConfiguration(), tableNameBytes); + } + + @Override + protected boolean doAction() throws Exception { + ResultScanner rs = null; + try { + Scan s = new Scan(); + s.setBatch(2); + s.addFamily(FAMILY); + s.setFilter(new KeyOnlyFilter()); + s.setMaxVersions(1); + + rs = table.getScanner(s); + Result result = rs.next(); + return rs != null && result != null && result.size() > 0; + } finally { + if (rs != null) { + rs.close(); + } + } + } + } + + /** + * Callable that will keep going to the master for cluster status. Returns the max time taken. + */ + public class AdminCallable extends TimingCallable { + + public AdminCallable(Future f) throws IOException { + super(f); + } + + @Override + protected boolean doAction() throws Exception { + HBaseAdmin admin = new HBaseAdmin(util.getConfiguration()); + ClusterStatus status = admin.getClusterStatus(); + return status != null; + } + } + + + public class ActionCallable implements Callable { + private final ChaosMonkey.Action action; + + public ActionCallable(ChaosMonkey.Action action) { + + this.action = action; + } + + @Override + public Boolean call() throws Exception { + this.action.perform(); + return true; + } + } + + /** + * Callable used to make sure the cluster has some load on it. + * This callable uses LoadTest tool to + */ + public class LoadCallable implements Callable { + + private final Future future; + + public LoadCallable(Future f) { + future = f; + } + + @Override + public Boolean call() throws Exception { + int colsPerKey = 10; + int recordSize = 500; + int numServers = util.getHBaseClusterInterface().getInitialClusterStatus().getServersSize(); + int numKeys = numServers * 5000; + int writeThreads = 10; + + + // Loop until the chaos monkey future is done. + // But always go in just in case some action completes quickly + do { + int ret = loadTool.run(new String[]{ + "-tn", loadTableName, + "-write", String.format("%d:%d:%d", colsPerKey, recordSize, writeThreads), + "-num_keys", String.valueOf(numKeys), + "-skip_init" + }); + assertEquals("Load failed", 0, ret); + } while (!future.isDone()); + + return true; + } + } +} \ No newline at end of file diff --git hbase-it/src/test/java/org/apache/hadoop/hbase/util/ChaosMonkey.java hbase-it/src/test/java/org/apache/hadoop/hbase/util/ChaosMonkey.java index aa252ee..48a291f 100644 --- hbase-it/src/test/java/org/apache/hadoop/hbase/util/ChaosMonkey.java +++ hbase-it/src/test/java/org/apache/hadoop/hbase/util/ChaosMonkey.java @@ -29,6 +29,7 @@ import java.util.Queue; import java.util.Random; import org.apache.commons.cli.CommandLine; +import org.apache.commons.lang.math.RandomUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -42,6 +43,7 @@ import org.apache.hadoop.hbase.ServerLoad; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.Stoppable; import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.ToolRunner; @@ -110,10 +112,10 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { /** * Context for Action's */ - private static class ActionContext { + public static class ActionContext { private IntegrationTestingUtility util; - ActionContext(IntegrationTestingUtility util) { + public ActionContext(IntegrationTestingUtility util) { this.util = util; } @@ -141,7 +143,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { protected ServerName[] initialServers; protected Random random = new Random(); - void init(ActionContext context) throws Exception { + public void init(ActionContext context) throws IOException { this.context = context; cluster = context.getHBaseCluster(); initialStatus = cluster.getInitialClusterStatus(); @@ -149,7 +151,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { initialServers = regionServers.toArray(new ServerName[regionServers.size()]); } - protected void perform() throws Exception { }; + public void perform() throws Exception { }; // TODO: perhaps these methods should be elsewhere? /** Returns current region servers */ @@ -254,7 +256,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { super(sleepTime); } @Override - protected void perform() throws Exception { + public void perform() throws Exception { LOG.info("Performing action: Restart active master"); ServerName master = cluster.getClusterStatus().getMaster(); @@ -268,7 +270,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { } @Override - protected void perform() throws Exception { + public void perform() throws Exception { LOG.info("Performing action: Restart random region server"); ServerName server = selectRandomItem(getCurrentServers()); @@ -276,12 +278,12 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { } } - public static class RestartRsHoldingMeta extends RestartRandomRs { + public static class RestartRsHoldingMeta extends RestartActionBase { public RestartRsHoldingMeta(long sleepTime) { super(sleepTime); } @Override - protected void perform() throws Exception { + public void perform() throws Exception { LOG.info("Performing action: Restart region server holding META"); ServerName server = cluster.getServerHoldingMeta(); if (server == null) { @@ -292,6 +294,62 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { } } + public static class RestartRsHoldingTable extends RestartActionBase { + + private final String tableName; + + public RestartRsHoldingTable(long sleepTime, String tableName) { + super(sleepTime); + this.tableName = tableName; + } + + @Override + public void perform() throws Exception { + HTable table = null; + try { + Configuration conf = context.getHaseIntegrationTestingUtility().getConfiguration(); + table = new HTable(conf, tableName); + } catch (IOException e) { + LOG.debug("Error creating HTable used to get list of region locations.", e); + return; + } + + Collection serverNames = table.getRegionLocations().values(); + ServerName[] nameArray = serverNames.toArray(new ServerName[serverNames.size()]); + + restartRs(nameArray[random.nextInt(nameArray.length)], sleepTime); + } + } + + public static class MoveRegionsOfTable extends Action { + private final long sleepTime; + private final byte[] tableNameBytes; + + public MoveRegionsOfTable(long sleepTime, String tableName) { + this.sleepTime = sleepTime; + this.tableNameBytes = Bytes.toBytes(tableName); + } + + @Override + public void perform() throws Exception { + HBaseAdmin admin = this.context.getHaseIntegrationTestingUtility().getHBaseAdmin(); + + List regions = admin.getTableRegions(tableNameBytes); + Collection serversList = admin.getClusterStatus().getServers(); + ServerName[] servers = serversList.toArray(new ServerName[serversList.size()]); + + for (HRegionInfo regionInfo:regions) { + try { + byte[] destServerName = + Bytes.toBytes(servers[RandomUtils.nextInt(servers.length)].getServerName()); + admin.move(regionInfo.getRegionName(), destServerName); + } catch (Exception e) { + LOG.debug("Error moving region", e); + } + } + Thread.sleep(sleepTime); + } + } /** * Restarts a ratio of the running regionservers at the same time */ @@ -304,7 +362,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { } @Override - protected void perform() throws Exception { + public void perform() throws Exception { LOG.info(String.format("Performing action: Batch restarting %d%% of region servers", (int)(ratio * 100))); List selectedServers = selectRandomItems(getCurrentServers(), ratio); @@ -346,7 +404,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { } @Override - protected void perform() throws Exception { + public void perform() throws Exception { LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers", (int)(ratio * 100))); List selectedServers = selectRandomItems(getCurrentServers(), ratio); @@ -394,7 +452,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { } @Override - protected void perform() throws Exception { + public void perform() throws Exception { LOG.info("Unbalancing regions"); ClusterStatus status = this.cluster.getClusterStatus(); List victimServers = new LinkedList(status.getServers()); @@ -410,7 +468,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable { public static class ForceBalancerAction extends Action { @Override - protected void perform() throws Exception { + public void perform() throws Exception { LOG.info("Balancing regions"); forceBalancer(); } diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/util/AbstractHBaseTool.java hbase-server/src/main/java/org/apache/hadoop/hbase/util/AbstractHBaseTool.java index e0c170d..57d2fb5 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/util/AbstractHBaseTool.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/util/AbstractHBaseTool.java @@ -16,6 +16,7 @@ */ package org.apache.hadoop.hbase.util; +import java.io.IOException; import java.util.Set; import java.util.TreeSet; @@ -79,7 +80,7 @@ public abstract class AbstractHBaseTool implements Tool { } @Override - public final int run(String[] args) throws Exception { + public final int run(String[] args) throws IOException { if (conf == null) { LOG.error("Tool configuration is not initialized"); throw new NullPointerException("conf"); -- 1.7.10.2 (Apple Git-33)