diff --git src/main/java/org/apache/hadoop/hbase/HConstants.java src/main/java/org/apache/hadoop/hbase/HConstants.java index ba657e0..f0ea248 100644 --- src/main/java/org/apache/hadoop/hbase/HConstants.java +++ src/main/java/org/apache/hadoop/hbase/HConstants.java @@ -669,6 +669,19 @@ public final class HConstants { Bytes.toString(META_TABLE_NAME), Bytes.toString(ROOT_TABLE_NAME), SPLIT_LOGDIR_NAME, HBCK_SIDELINEDIR_NAME, HFILE_ARCHIVE_DIRECTORY })); + /** Health script related settings. */ + public static final String HEALTH_SCRIPT_LOC = "hbase.node.health.script.location"; + public static final String HEALTH_SCRIPT_TIMEOUT = "hbase.node.health.script.timeout"; + public static final String HEALTH_CHORE_WAKE_FREQ = + "hbase.node.health.script.frequency"; + public static final long DEFAULT_HEALTH_SCRIPT_TIMEOUT = 60000; + /** + * The maximum number of health check failures a server can encounter consecutively. + */ + public static final String HEALTH_FAILURE_THRESHOLD = + "hbase.node.health.failure.threshold"; + public static final int DEFAULT_HEALTH_FAILURE_THRESHOLD = 3; + private HConstants() { // Can't be instantiated with this ctor. } diff --git src/main/java/org/apache/hadoop/hbase/HealthCheckChore.java src/main/java/org/apache/hadoop/hbase/HealthCheckChore.java new file mode 100644 index 0000000..5e2f8f3 --- /dev/null +++ src/main/java/org/apache/hadoop/hbase/HealthCheckChore.java @@ -0,0 +1,96 @@ +/** + * Copyright 2011 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus; +import org.apache.hadoop.util.StringUtils; + +/** + * The Class HealthCheckChore for checking the health of node. + */ + public class HealthCheckChore extends Chore { + private static Log LOG = LogFactory.getLog(HealthCheckChore.class); + private HealthChecker healthChecker; + private Configuration config; + private int threshold; + private int numTimesUnhealthy = 0; + private long failureWindow; + private long startWindow; + + public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) { + super("HealthChecker", sleepTime, stopper); + LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime)); + this.config = conf; + String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC); + long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT, + HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT); + healthChecker = new HealthChecker(); + healthChecker.init(healthCheckScript, scriptTimeout); + this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD, + HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD); + this.failureWindow = this.threshold * sleepTime; + } + + @Override + protected void chore() { + HealthReport report = healthChecker.checkHealth(); + boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS); + if (!isHealthy) { + boolean needToStop = decideToStop(); + if (needToStop) { + this.stopper.stop("The region server reported unhealthy " + threshold + + " number of times in consecutively."); + } + // Always log health report. + LOG.info("Health status at " + StringUtils.formatTime(System.currentTimeMillis()) + " : " + + report.getHealthReport()); + } + } + + private boolean decideToStop() { + boolean stop = false; + if (numTimesUnhealthy == 0) { + // First time we are seeing a failure. No need to stop, just + // record the time. + numTimesUnhealthy++; + stop = false; + startWindow = System.currentTimeMillis(); + } else { + if ((System.currentTimeMillis() - startWindow) < failureWindow) { + numTimesUnhealthy++; + if (numTimesUnhealthy == threshold) { + stop = true; + } else { + stop = false; + } + } else { + // Outside of failure window, so we reset to 1. + numTimesUnhealthy = 1; + startWindow = System.currentTimeMillis(); + stop = false; + } + } + return stop; + } + +} diff --git src/main/java/org/apache/hadoop/hbase/HealthChecker.java src/main/java/org/apache/hadoop/hbase/HealthChecker.java new file mode 100644 index 0000000..f47e152 --- /dev/null +++ src/main/java/org/apache/hadoop/hbase/HealthChecker.java @@ -0,0 +1,130 @@ +/** + * Copyright 2011 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import java.io.IOException; +import java.util.ArrayList; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.util.Shell.ExitCodeException; +import org.apache.hadoop.util.Shell.ShellCommandExecutor; + + +/** + * The Class HealthChecker to check health of node. + */ +class HealthChecker { + + private static Log LOG = LogFactory.getLog(HealthChecker.class); + private ShellCommandExecutor shexec = null; + private String exceptionStackTrace; + + /** Pattern used for searching in the output of the node health script */ + static private final String ERROR_PATTERN = "ERROR"; + + private String healthCheckScript; + private long scriptTimeout; + + enum HealthCheckerExitStatus { + SUCCESS, + TIMED_OUT, + FAILED_WITH_EXIT_CODE, + FAILED_WITH_EXCEPTION, + FAILED + } + + /** + * Initialize. + * + * @param configuration + */ + public void init(String location, long timeout) { + this.healthCheckScript = location; + this.scriptTimeout = timeout; + ArrayList execScript = new ArrayList(); + execScript.add(healthCheckScript); + shexec = new ShellCommandExecutor(execScript.toArray(new String[execScript.size()]), null, + null, scriptTimeout); + LOG.info("RegionServerHealthChecker initialized."); + } + + + public HealthReport checkHealth() { + HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS; + try { + shexec.execute(); + } catch (ExitCodeException e) { + // ignore the exit code of the script + LOG.warn("Caught exception : " + e); + status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE; + } catch (IOException e) { + LOG.warn("Caught exception : " + e); + if (!shexec.isTimedOut()) { + status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION; + exceptionStackTrace = org.apache.hadoop.util.StringUtils.stringifyException(e); + } else { + status = HealthCheckerExitStatus.TIMED_OUT; + } + } finally { + if (status == HealthCheckerExitStatus.SUCCESS) { + if (hasErrors(shexec.getOutput())) { + status = HealthCheckerExitStatus.FAILED; + } + } + } + return new HealthReport(status, getHealthReport(status)); + } + + + private boolean hasErrors(String output) { + String[] splits = output.split("\n"); + for (String split : splits) { + if (split.startsWith(ERROR_PATTERN)) { + return true; + } + } + return false; + } + + + private String getHealthReport(HealthCheckerExitStatus status){ + String healthReport = null; + switch (status) { + case SUCCESS: + healthReport = "Server is healthy."; + break; + case TIMED_OUT: + healthReport = "Health script timed out"; + break; + case FAILED_WITH_EXCEPTION: + healthReport = exceptionStackTrace; + break; + case FAILED_WITH_EXIT_CODE: + healthReport = "Health script failed with exit code."; + break; + case FAILED: + healthReport = shexec.getOutput(); + break; + } + return healthReport; + } + +} diff --git src/main/java/org/apache/hadoop/hbase/HealthReport.java src/main/java/org/apache/hadoop/hbase/HealthReport.java new file mode 100644 index 0000000..5292929 --- /dev/null +++ src/main/java/org/apache/hadoop/hbase/HealthReport.java @@ -0,0 +1,89 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase; + +import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus; + +/** + * The Class RegionServerHealthReport containing information about + * health of the region server. + */ +class HealthReport { + + private HealthCheckerExitStatus status; + private String healthReport; + + HealthReport(HealthCheckerExitStatus status, String healthReport) { + super(); + this.status = status; + this.healthReport = healthReport; + } + + /** + * Gets the status of the region server. + * + * @return HealthCheckerExitStatus + */ + HealthCheckerExitStatus getStatus() { + return status; + } + + /** + * Gets the health report of the region server. + * + * @return String + */ + String getHealthReport() { + return healthReport; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((healthReport == null) ? 0 : healthReport.hashCode()); + result = prime * result + ((status == null) ? 0 : status.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof HealthReport)) { + return false; + } + HealthReport other = (HealthReport) obj; + if (healthReport == null) { + if (other.healthReport != null) { + return false; + } + } else if (!healthReport.equals(other.healthReport)) { + return false; + } + if (status != other.status) { + return false; + } + return true; + } + +} diff --git src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 6f52758..d6938d9 100644 --- src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -66,6 +66,7 @@ import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HConstants.OperationStatusCode; import org.apache.hadoop.hbase.HDFSBlocksDistribution; +import org.apache.hadoop.hbase.HealthCheckChore; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HServerAddress; import org.apache.hadoop.hbase.HServerInfo; @@ -366,6 +367,9 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, */ private ClusterId clusterId = null; + /** The health check chore. */ + private HealthCheckChore healthCheckChore; + /** * Starts a HRegionServer at the default location * @@ -654,6 +658,13 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, this.compactionChecker = new CompactionChecker(this, this.threadWakeFrequency * multiplier, this); + // Health checker thread. + int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ, + HConstants.DEFAULT_THREAD_WAKE_FREQUENCY); + if (isHealthCheckerConfigured()) { + healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration()); + } + this.leases = new Leases((int) conf.getLong( HConstants.HBASE_REGIONSERVER_LEASE_PERIOD_KEY, HConstants.DEFAULT_HBASE_REGIONSERVER_LEASE_PERIOD), @@ -769,6 +780,9 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, if (this.hlogRoller != null) this.hlogRoller.interruptIfNecessary(); if (this.compactionChecker != null) this.compactionChecker.interrupt(); + if (this.healthCheckChore != null) { + this.healthCheckChore.interrupt(); + } if (this.killed) { // Just skip out w/o closing regions. Used when testing. @@ -1551,6 +1565,10 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, handler); Threads.setDaemonThreadRunning(this.compactionChecker.getThread(), n + ".compactionChecker", handler); + if (this.healthCheckChore != null) { + Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker", + handler); + } // Leases is not a Thread. Internally it runs a daemon thread. If it gets // an unhandled exception, it will just exit. @@ -1775,6 +1793,9 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, protected void join() { Threads.shutdown(this.compactionChecker.getThread()); Threads.shutdown(this.cacheFlusher.getThread()); + if (this.healthCheckChore != null) { + Threads.shutdown(this.healthCheckChore.getThread()); + } if (this.hlogRoller != null) { Threads.shutdown(this.hlogRoller.getThread()); } @@ -3824,4 +3845,9 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler, } return 0; } + + private boolean isHealthCheckerConfigured() { + String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC); + return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation); + } }