diff --git hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
index 1e24b8c..535f407 100644
--- hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
+++ hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java
@@ -745,7 +745,19 @@ public final class HConstants {
     Arrays.asList(new String[] { HREGION_LOGDIR_NAME, HREGION_OLDLOGDIR_NAME, CORRUPT_DIR_NAME,
       toString(META_TABLE_NAME), toString(ROOT_TABLE_NAME), SPLIT_LOGDIR_NAME,
       HBCK_SIDELINEDIR_NAME, HFILE_ARCHIVE_DIRECTORY }));
-
+  /** Health script related settings. */
+  public static final String RS_HEALTH_SCRIPT_LOC = "hbase.regionserver.health.script.location";
+  public static final String RS_HEALTH_SCRIPT_TIMEOUT = "hbase.regionserver.health.script.timeout";
+  public static final String RS_HEALTH_CHORE_WAKE_FREQ =
+      "hbase.regionserver.health.script.frequency";
+  public static final long DEFAULT_RS_HEALTH_SCRIPT_TIMEOUT = 60000;
+  /**
+   * The maximum number of health check failures a server can encounter consecutively.
+   */
+  public static final String RS_HEALTH_FAILURE_THRESHOLD =
+      "hbase.regionserver.health.failure.threshold";
+  public static final int DEFAULT_RS_HEALTH_FAILURE_THRESHOLD = 3;
+
   private HConstants() {
     // Can't be instantiated with this ctor.
   }
diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 4f000f6..b82212d 100644
--- hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -395,6 +395,9 @@ public class HRegionServer implements ClientProtocol,
   // reference to the Thrift Server.
   volatile private HRegionThriftServer thriftServer;
 
+  /** The health check chore. */
+  private RegionServerHealthCheckChore healthCheckChore;
+
   /**
    * The server name the Master sees us as. Its made from the hostname the
    * master passes us, port, and server startcode. Gets set after registration
@@ -817,6 +820,10 @@ public class HRegionServer implements ClientProtocol,
       ".multiplier", 1000);
     this.compactionChecker = new CompactionChecker(this,
       this.threadWakeFrequency * multiplier, this);
+    // Health checker thread.
+    int sleepTime = this.conf.getInt(HConstants.RS_HEALTH_CHORE_WAKE_FREQ,
+      HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
+    healthCheckChore = new RegionServerHealthCheckChore(sleepTime, this, getConfiguration());
 
     this.leases = new Leases(this.threadWakeFrequency);
 
@@ -932,6 +939,9 @@ public class HRegionServer implements ClientProtocol,
 
     if (this.hlogRoller != null) this.hlogRoller.interruptIfNecessary();
     if (this.compactionChecker != null) this.compactionChecker.interrupt();
+    if (isHealthCheckerConfigured()) {
+      this.healthCheckChore.interrupt();
+    }
 
     if (this.killed) {
       // Just skip out w/o closing regions. Used when testing.
@@ -1487,6 +1497,10 @@ public class HRegionServer implements ClientProtocol,
       handler);
     Threads.setDaemonThreadRunning(this.compactionChecker.getThread(), n +
       ".compactionChecker", handler);
+    if (isHealthCheckerConfigured()) {
+      Threads
+          .setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker", handler);
+    }
 
     // Leases is not a Thread. Internally it runs a daemon thread. If it gets
     // an unhandled exception, it will just exit.
@@ -1711,6 +1725,9 @@ public class HRegionServer implements ClientProtocol,
   protected void join() {
     Threads.shutdown(this.compactionChecker.getThread());
     Threads.shutdown(this.cacheFlusher.getThread());
+    if (isHealthCheckerConfigured()) {
+      Threads.shutdown(this.healthCheckChore.getThread());
+    }
     if (this.hlogRoller != null) {
       Threads.shutdown(this.hlogRoller.getThread());
     }
@@ -3991,4 +4008,9 @@ public class HRegionServer implements ClientProtocol,
       this.s = s;
     }
   }
+
+  private boolean isHealthCheckerConfigured() {
+    String healthScriptLocation = this.conf.get(HConstants.RS_HEALTH_SCRIPT_LOC);
+    return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
+  }
 }
diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HealthChecker.java hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HealthChecker.java
new file mode 100644
index 0000000..a6513e6
--- /dev/null
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HealthChecker.java
@@ -0,0 +1,130 @@
+/**
+ * Copyright 2011 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.Shell.ExitCodeException;
+import org.apache.hadoop.util.Shell.ShellCommandExecutor;
+
+/**
+ * Checks the health of the region server by running the configured
+ * health check script and interpreting its output.
+ */
+class HealthChecker {
+
+  private static Log LOG = LogFactory.getLog(HealthChecker.class);
+  private ShellCommandExecutor shexec = null;
+  private String exceptionStackTrace;
+
+  /** Pattern used for searching in the output of the health script */
+  private static final String ERROR_PATTERN = "ERROR";
+
+  private String healthCheckScript;
+  private long scriptTimeout;
+
+  enum HealthCheckerExitStatus {
+    SUCCESS,
+    TIMED_OUT,
+    FAILED_WITH_EXIT_CODE,
+    FAILED_WITH_EXCEPTION,
+    FAILED
+  }
+
+  /**
+   * Initialize the health checker.
+   *
+   * @param location the absolute path of the health check script
+   * @param timeout the timeout, in milliseconds, after which the script is killed
+   */
+  public void init(String location, long timeout) {
+    this.healthCheckScript = location;
+    this.scriptTimeout = timeout;
+    ArrayList<String> execScript = new ArrayList<String>();
+    execScript.add(healthCheckScript);
+    shexec = new ShellCommandExecutor(execScript.toArray(new String[execScript.size()]), null,
+        null, scriptTimeout);
+    LOG.info("RegionServerHealthChecker initialized.");
+  }
+
+  public HealthReport checkHealth() {
+    HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS;
+    try {
+      shexec.execute();
+    } catch (ExitCodeException e) {
+      // The health check script exited with a non-zero exit code.
+      LOG.warn("Caught exception : " + e);
+      status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE;
+    } catch (IOException e) {
+      LOG.warn("Caught exception : " + e);
+      if (!shexec.isTimedOut()) {
+        status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION;
+        exceptionStackTrace = org.apache.hadoop.util.StringUtils.stringifyException(e);
+      } else {
+        status = HealthCheckerExitStatus.TIMED_OUT;
+      }
+    } finally {
+      if (status == HealthCheckerExitStatus.SUCCESS) {
+        if (hasErrors(shexec.getOutput())) {
+          status = HealthCheckerExitStatus.FAILED;
+        }
+      }
+    }
+    return new HealthReport(status, getHealthReport(status));
+  }
+
+  private boolean hasErrors(String output) {
+    String[] splits = output.split("\n");
+    for (String split : splits) {
+      if (split.startsWith(ERROR_PATTERN)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  private String getHealthReport(HealthCheckerExitStatus status) {
+    String healthReport = null;
+    switch (status) {
+    case SUCCESS:
+      healthReport = "Server is healthy.";
+      break;
+    case TIMED_OUT:
+      healthReport = "Health script timed out";
+      break;
+    case FAILED_WITH_EXCEPTION:
+      healthReport = exceptionStackTrace;
+      break;
+    case FAILED_WITH_EXIT_CODE:
+      healthReport = "Health script failed with non-zero exit code.";
+      break;
+    case FAILED:
+      healthReport = shexec.getOutput();
+      break;
+    }
+    return healthReport;
+  }
+
+}
diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HealthReport.java hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HealthReport.java
new file mode 100644
index 0000000..1a2b0aa
--- /dev/null
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HealthReport.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import org.apache.hadoop.hbase.regionserver.HealthChecker.HealthCheckerExitStatus;
+
+/**
+ * A HealthReport carries the result of a single region server health check:
+ * the exit status of the health script and the corresponding report message.
+ */
+class HealthReport {
+
+  private HealthCheckerExitStatus status;
+  private String healthReport;
+
+  HealthReport(HealthCheckerExitStatus status, String healthReport) {
+    this.status = status;
+    this.healthReport = healthReport;
+  }
+
+  /**
+   * Gets the exit status of the health check.
+   *
+   * @return the {@link HealthCheckerExitStatus}
+   */
+  HealthCheckerExitStatus getStatus() {
+    return status;
+  }
+
+  /**
+   * Gets the report of the health check.
+   *
+   * @return the health report string
+   */
+  String getHealthReport() {
+    return healthReport;
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((healthReport == null) ? 0 : healthReport.hashCode());
+    result = prime * result + ((status == null) ? 0 : status.hashCode());
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null) {
+      return false;
+    }
+    if (!(obj instanceof HealthReport)) {
+      return false;
+    }
+    HealthReport other = (HealthReport) obj;
+    if (healthReport == null) {
+      if (other.healthReport != null) {
+        return false;
+      }
+    } else if (!healthReport.equals(other.healthReport)) {
+      return false;
+    }
+    if (status != other.status) {
+      return false;
+    }
+    return true;
+  }
+
+}
diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerHealthCheckChore.java hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerHealthCheckChore.java
new file mode 100644
index 0000000..02391d5
--- /dev/null
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerHealthCheckChore.java
@@ -0,0 +1,100 @@
+/**
+ * Copyright 2011 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.Chore;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.Stoppable;
+import org.apache.hadoop.hbase.regionserver.HealthChecker.HealthCheckerExitStatus;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * A chore that periodically runs the health check script and stops the
+ * region server once it has been unhealthy for a configured number of
+ * consecutive checks.
+ */
+class RegionServerHealthCheckChore extends Chore {
+  private static Log LOG = LogFactory.getLog(RegionServerHealthCheckChore.class);
+  private HealthChecker rsHealthChecker;
+  private Configuration config;
+  private int threshold;
+  private int numTimesUnhealthy = 0;
+  private long failureWindow;
+  private long startWindow;
+
+  RegionServerHealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) {
+    super("RegionHealthChecker", sleepTime, stopper);
+    LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime));
+    this.config = conf;
+    String healthCheckScript = this.config.get(HConstants.RS_HEALTH_SCRIPT_LOC);
+    long scriptTimeout = this.config.getLong(HConstants.RS_HEALTH_SCRIPT_TIMEOUT,
+      HConstants.DEFAULT_RS_HEALTH_SCRIPT_TIMEOUT);
+    rsHealthChecker = new HealthChecker();
+    rsHealthChecker.init(healthCheckScript, scriptTimeout);
+    this.threshold = config.getInt(HConstants.RS_HEALTH_FAILURE_THRESHOLD,
+      HConstants.DEFAULT_RS_HEALTH_FAILURE_THRESHOLD);
+    this.failureWindow = this.threshold * sleepTime;
+  }
+
+  @Override
+  protected void chore() {
+    HealthReport report = rsHealthChecker.checkHealth();
+    boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS);
+    if (!isHealthy) {
+      boolean needToStop = decideToStop();
+      if (needToStop) {
+        this.stopper.stop("The region server reported unhealthy status "
+            + threshold + " times consecutively.");
+      }
+      // Log the health report whenever the check fails.
+      LOG.info("Health status at " + StringUtils.formatTime(System.currentTimeMillis()) + " : "
+          + report.getHealthReport());
+    }
+  }
+
+  private boolean decideToStop() {
+    boolean stop = false;
+    if (numTimesUnhealthy == 0) {
+      // First time we are seeing a failure. No need to stop, just
+      // record the time.
+      numTimesUnhealthy++;
+      stop = false;
+      startWindow = System.currentTimeMillis();
+    } else {
+      if ((System.currentTimeMillis() - startWindow) < failureWindow) {
+        numTimesUnhealthy++;
+        if (numTimesUnhealthy == threshold) {
+          stop = true;
+        } else {
+          stop = false;
+        }
+      } else {
+        // Outside of failure window, so we reset to 1.
+        numTimesUnhealthy = 1;
+        startWindow = System.currentTimeMillis();
+        stop = false;
+      }
+    }
+    return stop;
+  }
+
+}
diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestServerHealthChecker.java hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestServerHealthChecker.java
new file mode 100644
index 0000000..9031973
--- /dev/null
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestServerHealthChecker.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.SmallTests;
+import org.apache.hadoop.hbase.Stoppable;
+import org.apache.hadoop.hbase.regionserver.HealthChecker.HealthCheckerExitStatus;
+import org.junit.After;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category(SmallTests.class)
+public class TestServerHealthChecker {
+
+  private static final Log LOG = LogFactory.getLog(TestServerHealthChecker.class);
+  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+  private File healthScriptFile;
+
+  @After
+  public void cleanUp() throws IOException {
+    UTIL.cleanupTestDir();
+  }
+
+  @Test
+  public void testHealthChecker() throws Exception {
+    Configuration config = getConfForNodeHealthScript();
+    config.addResource(healthScriptFile.getName());
+    String location = healthScriptFile.getAbsolutePath();
+    long timeout = config.getLong(HConstants.RS_HEALTH_SCRIPT_TIMEOUT, 100);
+
+    String normalScript = "echo \"I am all fine\"";
+    createScript(normalScript, true);
+    HealthChecker checker = new HealthChecker();
+    checker.init(location, timeout);
+    HealthReport report = checker.checkHealth();
+    assertTrue(report.getStatus() == HealthCheckerExitStatus.SUCCESS);
+    LOG.info("Health Status:" + report.getHealthReport());
+
+    String errorScript = "echo ERROR\n echo \"Server not healthy\"";
+    createScript(errorScript, true);
+    report = checker.checkHealth();
+    assertTrue(report.getStatus() == HealthCheckerExitStatus.FAILED);
+    LOG.info("Health Status:" + report.getHealthReport());
+
+    String timeOutScript = "sleep 4\n echo \"I am fine\"";
+    createScript(timeOutScript, true);
+    report = checker.checkHealth();
+    assertTrue(report.getStatus() == HealthCheckerExitStatus.TIMED_OUT);
+    LOG.info("Health Status:" + report.getHealthReport());
+
+    healthScriptFile.delete();
+  }
+
+  @Test
+  public void testRSHealthChore() throws Exception {
+    Stoppable stop = new StoppableImplementation();
+    Configuration conf = getConfForNodeHealthScript();
+    String errorScript = "echo ERROR\n echo \"Server not healthy\"";
+    createScript(errorScript, true);
+    RegionServerHealthCheckChore rsChore = new RegionServerHealthCheckChore(100, stop, conf);
+    // Default threshold is three.
+    rsChore.chore();
+    rsChore.chore();
+    assertFalse("Stoppable must not be stopped.", stop.isStopped());
+    rsChore.chore();
+    assertTrue("Stoppable must have been stopped.", stop.isStopped());
+  }
+
+  private void createScript(String scriptStr, boolean setExecutable)
+      throws Exception {
+    healthScriptFile.createNewFile();
+    PrintWriter pw = new PrintWriter(new FileOutputStream(healthScriptFile));
+    pw.println(scriptStr);
+    pw.flush();
+    pw.close();
+    healthScriptFile.setExecutable(setExecutable);
+  }
+
+  private Configuration getConfForNodeHealthScript() {
+    Configuration conf = UTIL.getConfiguration();
+    File tempDir = new File(UTIL.getDataTestDir().toString());
+    tempDir.mkdirs();
+    healthScriptFile = new File(tempDir.getAbsolutePath(), "HealthScript.sh");
+    conf.set(HConstants.RS_HEALTH_SCRIPT_LOC,
+      healthScriptFile.getAbsolutePath());
+    conf.setInt(HConstants.RS_HEALTH_FAILURE_THRESHOLD, 3);
+    conf.setLong(HConstants.RS_HEALTH_SCRIPT_TIMEOUT, 100);
+    return conf;
+  }
+
+  /**
+   * Simple helper class that just keeps track of whether or not it is stopped.
+   */
+  private static class StoppableImplementation implements Stoppable {
+    private volatile boolean stop = false;

+    @Override
+    public void stop(String why) {
+      this.stop = true;
+    }
+
+    @Override
+    public boolean isStopped() {
+      return this.stop;
+    }
+
+  }
+}
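
For reviewers trying the patch out, here is a minimal sketch (not part of the patch) of how the new configuration keys fit together and how a single check can be exercised through HealthChecker directly. The class name HealthCheckConfigExample and the script path /etc/hbase/healthcheck.sh are made up for illustration; the sketch assumes it lives in the org.apache.hadoop.hbase.regionserver package, since HealthChecker and HealthReport are package-private.

package org.apache.hadoop.hbase.regionserver;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;

public class HealthCheckConfigExample {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // Point the region server at a health script (placeholder path). When this
    // key is unset, HRegionServer#isHealthCheckerConfigured() returns false and
    // the chore is never started or joined.
    conf.set(HConstants.RS_HEALTH_SCRIPT_LOC, "/etc/hbase/healthcheck.sh");
    // Kill the script if it runs longer than 60 seconds (the default).
    conf.setLong(HConstants.RS_HEALTH_SCRIPT_TIMEOUT, 60000);
    // Run the check every 10 seconds and abort after 3 consecutive failures
    // inside the resulting 30 second failure window.
    conf.setInt(HConstants.RS_HEALTH_CHORE_WAKE_FREQ, 10000);
    conf.setInt(HConstants.RS_HEALTH_FAILURE_THRESHOLD, 3);

    // Run one check directly, the same way RegionServerHealthCheckChore does.
    HealthChecker checker = new HealthChecker();
    checker.init(conf.get(HConstants.RS_HEALTH_SCRIPT_LOC),
        conf.getLong(HConstants.RS_HEALTH_SCRIPT_TIMEOUT,
            HConstants.DEFAULT_RS_HEALTH_SCRIPT_TIMEOUT));
    HealthReport report = checker.checkHealth();
    System.out.println(report.getStatus() + ": " + report.getHealthReport());
  }
}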