diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
index 4b6ccfb..d10a8cf 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
@@ -18,11 +18,15 @@
 package org.apache.hadoop.hbase.chaos.actions;
 
+import java.io.IOException;
+import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Queue;
 
 import org.apache.commons.lang.math.RandomUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
 
@@ -31,6 +35,8 @@ import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
  * server, or starts one, sleeping randomly (0-sleepTime) in between steps.
  */
 public class RollingBatchRestartRsAction extends BatchRestartRsAction {
+  private static Log LOG = LogFactory.getLog(RollingBatchRestartRsAction.class);
+
   public RollingBatchRestartRsAction(long sleepTime, float ratio) {
     super(sleepTime, ratio);
   }
@@ -57,14 +63,64 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
 
       if (action) {
         ServerName server = serversToBeKilled.remove();
-        killRs(server);
+        try {
+          killRs(server);
+        } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
+          // We've seen this in test runs where we timeout but the kill went through. HBASE-9743
+          // So, add to deadServers even if exception so the start gets called.
+          LOG.info("Problem killing but presume successful; code=" + e.getExitCode(), e);
+        }
         deadServers.add(server);
       } else {
-        ServerName server = deadServers.remove();
-        startRs(server);
+        try {
+          ServerName server = deadServers.remove();
+          startRs(server);
+        } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
+          // The start may fail but better to just keep going though we may lose server.
+          //
+          LOG.info("Problem starting, will retry; code=" + e.getExitCode(), e);
+        }
       }
 
       sleep(RandomUtils.nextInt((int)sleepTime));
     }
   }
-}
+
+  /**
+   * Small test to ensure the class basically works.
+   * @param args
+   * @throws Exception
+   */
+  public static void main(final String[] args) throws Exception {
+    RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
+      private int invocations = 0;
+      @Override
+      protected ServerName[] getCurrentServers() throws IOException {
+        final int count = 4;
+        List<ServerName> serverNames = new ArrayList<ServerName>(count);
+        for (int i = 0; i < count; i++) {
+          serverNames.add(new ServerName(i + ".example.org", i, i));
+        }
+        return serverNames.toArray(new ServerName[] {});
+      }
+
+      @Override
+      protected void killRs(ServerName server) throws IOException {
+        LOG.info("Killed " + server);
+        if (this.invocations++ % 3 == 0) {
+          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
+        }
+      }
+
+      @Override
+      protected void startRs(ServerName server) throws IOException {
+        LOG.info("Started " + server);
+        if (this.invocations++ % 3 == 0) {
+          throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
+        }
+      }
+    };
+
+    action.perform();
+  }
+}
\ No newline at end of file