From f626c8c597ddc0a079c7f4c1a3c8a1a9a15d048c Mon Sep 17 00:00:00 2001 From: Andrew Purtell Date: Wed, 29 Nov 2017 10:38:52 -0800 Subject: [PATCH] HBASE-19238 Port Hadoop's new GcTimeMonitor to HBase --- .../hadoop/hbase/metrics/GcTimeMonitorSource.java | 34 +++ .../regionserver/MetricsRegionServerSource.java | 4 +- .../hadoop/hbase/rest/MetricsRESTSource.java | 4 +- .../hbase/thrift/MetricsThriftServerSource.java | 4 +- .../MetricsRegionServerSourceImpl.java | 9 + .../hadoop/hbase/rest/MetricsRESTSourceImpl.java | 8 + .../thrift/MetricsThriftServerSourceImpl.java | 8 +- .../org/apache/hadoop/hbase/rest/RESTServlet.java | 5 + .../hadoop/hbase/regionserver/HRegionServer.java | 7 + .../apache/hadoop/hbase/util/GcTimeMonitor.java | 310 +++++++++++++++++++++ .../hadoop/hbase/thrift/ThriftServerRunner.java | 8 + .../apache/hadoop/hbase/thrift2/ThriftServer.java | 4 + 12 files changed, 401 insertions(+), 4 deletions(-) create mode 100644 hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/metrics/GcTimeMonitorSource.java create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/util/GcTimeMonitor.java diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/metrics/GcTimeMonitorSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/metrics/GcTimeMonitorSource.java new file mode 100644 index 0000000000..55fdb29219 --- /dev/null +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/metrics/GcTimeMonitorSource.java @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.metrics; + +/** + * Interface for sources that will export JvmPauseMonitor metrics + */ +public interface GcTimeMonitorSource { + String GC_TIME_PERCENTAGE_KEY = "gcTimePercentage"; + String GC_TIME_PRECENTAGE_DESC = "Percentage of time the JVM was paused in GC"; + + /** + * Update the percentage of time the JVM was paused in GC + * + * @param v value + */ + void updateGcPauseTimePercentage(long v); +} diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java index b72deb8070..7ddf10388b 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSource.java @@ -19,12 +19,14 @@ package org.apache.hadoop.hbase.regionserver; import org.apache.hadoop.hbase.metrics.BaseSource; +import org.apache.hadoop.hbase.metrics.GcTimeMonitorSource; import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource; /** * Interface for classes that expose metrics about the regionserver. */ -public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSource { +public interface MetricsRegionServerSource + extends BaseSource, JvmPauseMonitorSource, GcTimeMonitorSource { /** * The name of the metrics diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/rest/MetricsRESTSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/rest/MetricsRESTSource.java index a5101e0ad6..00d10f9888 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/rest/MetricsRESTSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/rest/MetricsRESTSource.java @@ -19,12 +19,14 @@ package org.apache.hadoop.hbase.rest; import org.apache.hadoop.hbase.metrics.BaseSource; +import org.apache.hadoop.hbase.metrics.GcTimeMonitorSource; import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource; /** * Interface of the Metrics Source that will export data to Hadoop's Metrics2 system. */ -public interface MetricsRESTSource extends BaseSource, JvmPauseMonitorSource { +public interface MetricsRESTSource + extends BaseSource, JvmPauseMonitorSource, GcTimeMonitorSource { String METRICS_NAME = "REST"; diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/thrift/MetricsThriftServerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/thrift/MetricsThriftServerSource.java index 276a40ce26..2c9022cbc7 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/thrift/MetricsThriftServerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/thrift/MetricsThriftServerSource.java @@ -19,12 +19,14 @@ package org.apache.hadoop.hbase.thrift; import org.apache.hadoop.hbase.metrics.ExceptionTrackingSource; +import org.apache.hadoop.hbase.metrics.GcTimeMonitorSource; import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource; /** * Interface of a class that will export metrics about Thrift to hadoop's metrics2. */ -public interface MetricsThriftServerSource extends ExceptionTrackingSource, JvmPauseMonitorSource { +public interface MetricsThriftServerSource + extends ExceptionTrackingSource, JvmPauseMonitorSource, GcTimeMonitorSource { String BATCH_GET_KEY = "batchGet"; String BATCH_MUTATE_KEY = "batchMutate"; diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java index 208188e5c8..097a41a795 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionServerSourceImpl.java @@ -25,6 +25,7 @@ import org.apache.hadoop.metrics2.MetricHistogram; import org.apache.hadoop.metrics2.MetricsCollector; import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.metrics2.lib.MutableFastCounter; +import org.apache.hadoop.metrics2.lib.MutableGaugeLong; /** * Hadoop2 implementation of MetricsRegionServerSource. @@ -88,6 +89,7 @@ public class MetricsRegionServerSourceImpl private final MutableFastCounter warnPauseThresholdExceeded; private final MetricHistogram pausesWithGc; private final MetricHistogram pausesWithoutGc; + private final MutableGaugeLong pauseTimePercentage; public MetricsRegionServerSourceImpl(MetricsRegionServerWrapper rsWrap) { this(METRICS_NAME, METRICS_DESCRIPTION, METRICS_CONTEXT, METRICS_JMX_CONTEXT, rsWrap); @@ -176,6 +178,8 @@ public class MetricsRegionServerSourceImpl WARN_THRESHOLD_COUNT_DESC, 0L); pausesWithGc = getMetricsRegistry().newTimeHistogram(PAUSE_TIME_WITH_GC_KEY); pausesWithoutGc = getMetricsRegistry().newTimeHistogram(PAUSE_TIME_WITHOUT_GC_KEY); + + pauseTimePercentage = getMetricsRegistry().getGauge(GC_TIME_PERCENTAGE_KEY, 0l); } @Override @@ -545,4 +549,9 @@ public class MetricsRegionServerSourceImpl public void updatePutBatch(long t) { putBatchHisto.add(t); } + + @Override + public void updateGcPauseTimePercentage(long v) { + pauseTimePercentage.set(v); + } } diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/rest/MetricsRESTSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/rest/MetricsRESTSourceImpl.java index 9360e32e5e..24f0ff6bed 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/rest/MetricsRESTSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/rest/MetricsRESTSourceImpl.java @@ -22,6 +22,7 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.metrics.BaseSourceImpl; import org.apache.hadoop.metrics2.MetricHistogram; import org.apache.hadoop.metrics2.lib.MutableFastCounter; +import org.apache.hadoop.metrics2.lib.MutableGaugeLong; /** * Hadoop Two implementation of a metrics2 source that will export metrics from the Rest server to @@ -51,6 +52,7 @@ public class MetricsRESTSourceImpl extends BaseSourceImpl implements MetricsREST private final MutableFastCounter warnPauseThresholdExceeded; private final MetricHistogram pausesWithGc; private final MetricHistogram pausesWithoutGc; + private final MutableGaugeLong pauseTimePercentage; public MetricsRESTSourceImpl() { this(METRICS_NAME, METRICS_DESCRIPTION, CONTEXT, JMX_CONTEXT); @@ -69,6 +71,7 @@ public class MetricsRESTSourceImpl extends BaseSourceImpl implements MetricsREST WARN_THRESHOLD_COUNT_DESC, 0L); pausesWithGc = getMetricsRegistry().newTimeHistogram(PAUSE_TIME_WITH_GC_KEY); pausesWithoutGc = getMetricsRegistry().newTimeHistogram(PAUSE_TIME_WITHOUT_GC_KEY); + pauseTimePercentage = getMetricsRegistry().getGauge(GC_TIME_PERCENTAGE_KEY, 0l); } @Override @@ -175,4 +178,9 @@ public class MetricsRESTSourceImpl extends BaseSourceImpl implements MetricsREST public void updatePauseTimeWithoutGc(long t) { pausesWithoutGc.add(t); } + + @Override + public void updateGcPauseTimePercentage(long v) { + pauseTimePercentage.set(v); + } } diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/thrift/MetricsThriftServerSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/thrift/MetricsThriftServerSourceImpl.java index d47dbe7d6c..bac4e89b3e 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/thrift/MetricsThriftServerSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/thrift/MetricsThriftServerSourceImpl.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hbase.thrift; import org.apache.hadoop.hbase.classification.InterfaceAudience; -import org.apache.hadoop.hbase.metrics.BaseSourceImpl; import org.apache.hadoop.hbase.metrics.ExceptionTrackingSourceImpl; import org.apache.hadoop.metrics2.MetricHistogram; import org.apache.hadoop.metrics2.lib.MutableFastCounter; @@ -49,6 +48,7 @@ public class MetricsThriftServerSourceImpl extends ExceptionTrackingSourceImpl i private final MutableFastCounter warnPauseThresholdExceeded; private final MetricHistogram pausesWithGc; private final MetricHistogram pausesWithoutGc; + private MutableGaugeLong pauseTimePercentage; private MutableGaugeLong activeWorkerCountGauge; @@ -77,6 +77,7 @@ public class MetricsThriftServerSourceImpl extends ExceptionTrackingSourceImpl i thriftSlowCallStat = getMetricsRegistry().newTimeHistogram(SLOW_THRIFT_CALL_KEY); callQueueLenGauge = getMetricsRegistry().getGauge(CALL_QUEUE_LEN_KEY, 0); activeWorkerCountGauge = getMetricsRegistry().getGauge(ACTIVE_WORKER_COUNT_KEY, 0); + pauseTimePercentage = getMetricsRegistry().getGauge(GC_TIME_PERCENTAGE_KEY, 0l); } @Override @@ -143,4 +144,9 @@ public class MetricsThriftServerSourceImpl extends ExceptionTrackingSourceImpl i public void updatePauseTimeWithoutGc(long t) { pausesWithoutGc.add(t); } + + @Override + public void updateGcPauseTimePercentage(long v) { + pauseTimePercentage.set(v); + } } diff --git a/hbase-rest/src/main/java/org/apache/hadoop/hbase/rest/RESTServlet.java b/hbase-rest/src/main/java/org/apache/hadoop/hbase/rest/RESTServlet.java index 411ced8543..da3bae04c0 100644 --- a/hbase-rest/src/main/java/org/apache/hadoop/hbase/rest/RESTServlet.java +++ b/hbase-rest/src/main/java/org/apache/hadoop/hbase/rest/RESTServlet.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.filter.ParseFilter; import org.apache.hadoop.hbase.security.UserProvider; import org.apache.hadoop.hbase.util.ConnectionCache; +import org.apache.hadoop.hbase.util.GcTimeMonitor; import org.apache.hadoop.hbase.util.JvmPauseMonitor; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.ProxyUsers; @@ -45,6 +46,7 @@ public class RESTServlet implements Constants { private final ConnectionCache connectionCache; private final UserGroupInformation realUser; private final JvmPauseMonitor pauseMonitor; + private final GcTimeMonitor gcTimeMonitor; static final String CLEANUP_INTERVAL = "hbase.rest.connection.cleanup-interval"; static final String MAX_IDLETIME = "hbase.rest.connection.max-idletime"; @@ -107,6 +109,9 @@ public class RESTServlet implements Constants { pauseMonitor = new JvmPauseMonitor(conf, metrics.getSource()); pauseMonitor.start(); + gcTimeMonitor = new GcTimeMonitor(conf, metrics.getSource()); + gcTimeMonitor.start(); + } Admin getAdmin() throws IOException { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 4853b2b050..c4ea6f3b76 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -163,6 +163,7 @@ import org.apache.hadoop.hbase.util.ConfigUtil; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.FSTableDescriptors; import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.GcTimeMonitor; import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.JSONBean; import org.apache.hadoop.hbase.util.JvmPauseMonitor; @@ -348,6 +349,7 @@ public class HRegionServer extends HasThread implements // into web context. protected InfoServer infoServer; private JvmPauseMonitor pauseMonitor; + private GcTimeMonitor gcTimeMonitor; /** region server process name */ public static final String REGIONSERVER = "regionserver"; @@ -1141,6 +1143,9 @@ public class HRegionServer extends HasThread implements if (this.pauseMonitor != null) { this.pauseMonitor.stop(); } + if (this.gcTimeMonitor != null) { + this.gcTimeMonitor.stop(); + } if (!killed) { stopServiceThreads(); @@ -1460,6 +1465,8 @@ public class HRegionServer extends HasThread implements // Now that we have a metrics source, start the pause monitor this.pauseMonitor = new JvmPauseMonitor(conf, getMetrics().getMetricsSource()); pauseMonitor.start(); + this.gcTimeMonitor = new GcTimeMonitor(conf, getMetrics().getMetricsSource()); + gcTimeMonitor.start(); startServiceThreads(); startHeapMemoryManager(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/GcTimeMonitor.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/GcTimeMonitor.java new file mode 100644 index 0000000000..a384585a91 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/GcTimeMonitor.java @@ -0,0 +1,310 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.util; + +import com.google.common.base.Preconditions; + +import java.lang.management.GarbageCollectorMXBean; +import java.lang.management.ManagementFactory; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.classification.InterfaceAudience; +import org.apache.hadoop.hbase.metrics.GcTimeMonitorSource; + +/** + * This class monitors the percentage of time the JVM is paused in GC within + * the specified observation window, say 1 minute. The user can provide a + * hook which will be called whenever this percentage exceeds the specified + * threshold. + */ +@InterfaceAudience.Private +public class GcTimeMonitor { + + /** The interval over which the percentage of GC time should be calculated. + * A practical value would be somewhere between 30 sec and several minutes. */ + public static final String OBSERVATION_WINDOW_MS_KEY = "gc.time.observationWindowMs"; + public static final long DEFAULT_OBSERVATION_WINDOW_MS = 30L * 1000; + + /** sleepIntervalMs how frequently this thread should wake up to check GC timings. + * This is also a frequency with which alertHandler will be invoked if GC time + * percentage exceeds the specified limit. A practical + * value would likely be 500..1000 ms. */ + public static final String SLEEP_INTERVAL_MS_KEY = "gc.time.sleepIntervalMs"; + public static final long DEFAULT_SLEEP_INTERVAL_MS = 1000; + + /** maxGcTimePercentage A GC time percentage limit (0..100) within observationWindowMs. + * Once this is exceeded, alertHandler will be invoked every sleepIntervalMs milliseconds + * until GC time percentage falls below this limit. */ + public static final String MAX_GC_TIME_PERCENTAGE_KEY = "gc.time.maxGcTimePercentage"; + public static final int DEFAULT_MAX_GC_TIME_PERCENTAGE = 50; + + private final long maxGcTimePercentage; + private final long observationWindowMs, sleepIntervalMs; + private final GcTimeAlertHandler alertHandler; + + private final List gcBeans = + ManagementFactory.getGarbageCollectorMXBeans(); + // Ring buffers containing GC timings and timestamps when timings were taken + private final TsAndData[] gcDataBuf; + private int bufSize, startIdx, endIdx; + + private long startTime; + private final GcData curData = new GcData(); + private Thread monitorThread; + private volatile boolean shouldRun = true; + private GcTimeMonitorSource metricsSource; + + /** + * Create an instance of GCTimeMonitor. Once it's started, it will stay alive + * and monitor GC time percentage until shutdown() is called. If you don't + * put a limit on the number of GCTimeMonitor instances that you create, and + * alertHandler != null, you should necessarily call shutdown() once the given + * instance is not needed. Otherwise, you may create a memory leak, because + * each running GCTimeMonitor will keep its alertHandler object in memory, + * which in turn may reference and keep in memory many more other objects. + * + * @param conf + * @param metricsSource + */ + public GcTimeMonitor(Configuration conf, GcTimeMonitorSource metricsSource) { + this(conf, metricsSource, null); + } + + /** + * Create an instance of GCTimeMonitor. Once it's started, it will stay alive + * and monitor GC time percentage until shutdown() is called. If you don't + * put a limit on the number of GCTimeMonitor instances that you create, and + * alertHandler != null, you should necessarily call shutdown() once the given + * instance is not needed. Otherwise, you may create a memory leak, because + * each running GCTimeMonitor will keep its alertHandler object in memory, + * which in turn may reference and keep in memory many more other objects. + * + * @param conf + * @param metricsSource + * @param alertHandler a single method in this interface is invoked when GC + * time percentage exceeds the specified limit. + */ + public GcTimeMonitor(Configuration conf, GcTimeMonitorSource metricsSource, + GcTimeAlertHandler alertHandler) { + final long observationWindowMs = conf.getLong(OBSERVATION_WINDOW_MS_KEY, + DEFAULT_OBSERVATION_WINDOW_MS); + final long sleepIntervalMs = conf.getLong(SLEEP_INTERVAL_MS_KEY, + DEFAULT_SLEEP_INTERVAL_MS); + final int maxGcTimePercentage = conf.getInt(MAX_GC_TIME_PERCENTAGE_KEY, + DEFAULT_MAX_GC_TIME_PERCENTAGE); + Preconditions.checkArgument(observationWindowMs > 0); + Preconditions.checkArgument( + sleepIntervalMs > 0 && sleepIntervalMs < observationWindowMs); + Preconditions.checkArgument( + maxGcTimePercentage >= 0 && maxGcTimePercentage <= 100); + + this.observationWindowMs = observationWindowMs; + this.sleepIntervalMs = sleepIntervalMs; + this.maxGcTimePercentage = maxGcTimePercentage; + this.alertHandler = alertHandler; + this.metricsSource = metricsSource; + + bufSize = (int) (observationWindowMs / sleepIntervalMs + 2); + // Prevent the user from accidentally creating an abnormally big buffer, + // which will result in slow calculations and likely inaccuracy. + Preconditions.checkArgument(bufSize <= 128 * 1024); + gcDataBuf = new TsAndData[bufSize]; + for (int i = 0; i < bufSize; i++) { + gcDataBuf[i] = new TsAndData(); + } + } + + public void start() { + Preconditions.checkState(monitorThread == null, "Already started"); + monitorThread = new Thread(new Monitor(), "GcTimeMonitor"); + monitorThread.setDaemon(true); + monitorThread.start(); + } + + public void stop() { + shouldRun = false; + monitorThread.interrupt(); + try { + monitorThread.join(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private class Monitor implements Runnable { + @Override + public void run() { + startTime = System.currentTimeMillis(); + curData.timestamp = startTime; + gcDataBuf[startIdx].setValues(startTime, 0); + + while (shouldRun) { + try { + Thread.sleep(sleepIntervalMs); + } catch (InterruptedException ie) { + return; + } + + calculateGCTimePercentageWithinObservedInterval(); + updateMetrics(curData.gcTimePercentage); + if (alertHandler != null && + curData.gcTimePercentage > maxGcTimePercentage) { + alertHandler.alert(curData.clone()); + } + } + } + } + + public void shutdown() { + shouldRun = false; + } + + /** Returns a copy of the most recent data measured by this monitor. */ + public GcData getLatestGcData() { + return curData.clone(); + } + + private void calculateGCTimePercentageWithinObservedInterval() { + long prevTotalGcTime = curData.totalGcTime; + long totalGcTime = 0; + long totalGcCount = 0; + for (GarbageCollectorMXBean gcBean : gcBeans) { + totalGcTime += gcBean.getCollectionTime(); + totalGcCount += gcBean.getCollectionCount(); + } + long gcTimeWithinSleepInterval = totalGcTime - prevTotalGcTime; + + long ts = System.currentTimeMillis(); + long gcMonitorRunTime = ts - startTime; + + endIdx = (endIdx + 1) % bufSize; + gcDataBuf[endIdx].setValues(ts, gcTimeWithinSleepInterval); + + // Move startIdx forward until we reach the first buffer entry with + // timestamp within the observation window. + long startObsWindowTs = ts - observationWindowMs; + while (gcDataBuf[startIdx].ts < startObsWindowTs && startIdx != endIdx) { + startIdx = (startIdx + 1) % bufSize; + } + + // Calculate total GC time within observationWindowMs. + // We should be careful about GC time that passed before the first timestamp + // in our observation window. + long gcTimeWithinObservationWindow = Math.min( + gcDataBuf[startIdx].gcPause, gcDataBuf[startIdx].ts - startObsWindowTs); + if (startIdx != endIdx) { + for (int i = (startIdx + 1) % bufSize; i != endIdx; + i = (i + 1) % bufSize) { + gcTimeWithinObservationWindow += gcDataBuf[i].gcPause; + } + } + + curData.update(ts, gcMonitorRunTime, totalGcTime, totalGcCount, + (int) (gcTimeWithinObservationWindow * 100 / + Math.min(observationWindowMs, gcMonitorRunTime))); + } + + /** + * The user can provide an instance of a class implementing this interface + * when initializing a GcTimeMonitor to receive alerts when GC time + * percentage exceeds the specified threshold. + */ + public interface GcTimeAlertHandler { + void alert(GcData gcData); + } + + /** Encapsulates data about GC pauses measured at the specific timestamp. */ + public static class GcData implements Cloneable { + private long timestamp; + private long gcMonitorRunTime, totalGcTime, totalGcCount; + private int gcTimePercentage; + + /** Returns the absolute timestamp when this measurement was taken. */ + public long getTimestamp() { + return timestamp; + } + + /** Returns the time since the start of the associated GcTimeMonitor. */ + public long getGcMonitorRunTime() { + return gcMonitorRunTime; + } + + /** Returns accumulated GC time since this JVM started. */ + public long getAccumulatedGcTime() { + return totalGcTime; + } + + /** Returns the accumulated number of GC pauses since this JVM started. */ + public long getAccumulatedGcCount() { + return totalGcCount; + } + + /** + * Returns the percentage (0..100) of time that the JVM spent in GC pauses + * within the observation window of the associated GcTimeMonitor. + */ + public int getGcTimePercentage() { + return gcTimePercentage; + } + + private synchronized void update(long inTimestamp, long inGcMonitorRunTime, + long inTotalGcTime, long inTotalGcCount, int inGcTimePercentage) { + this.timestamp = inTimestamp; + this.gcMonitorRunTime = inGcMonitorRunTime; + this.totalGcTime = inTotalGcTime; + this.totalGcCount = inTotalGcCount; + this.gcTimePercentage = inGcTimePercentage; + } + + @Override + public synchronized GcData clone() { + try { + return (GcData) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + } + + private static class TsAndData { + private long ts; // Timestamp when this measurement was taken + private long gcPause; // Total GC pause time within the interval between ts + // and the timestamp of the previous measurement. + + void setValues(long inTs, long inGcPause) { + this.ts = inTs; + this.gcPause = inGcPause; + } + } + + public void updateMetrics(int gcTimePercentage) { + if (metricsSource != null) { + metricsSource.updateGcPauseTimePercentage(gcTimePercentage); + } + } + + public GcTimeMonitorSource getMetricsSource() { + return metricsSource; + } + + public void setMetricsSource(GcTimeMonitorSource metricsSource) { + this.metricsSource = metricsSource; + } +} diff --git a/hbase-thrift/src/main/java/org/apache/hadoop/hbase/thrift/ThriftServerRunner.java b/hbase-thrift/src/main/java/org/apache/hadoop/hbase/thrift/ThriftServerRunner.java index 7208a7bb41..2f6a96e6bc 100644 --- a/hbase-thrift/src/main/java/org/apache/hadoop/hbase/thrift/ThriftServerRunner.java +++ b/hbase-thrift/src/main/java/org/apache/hadoop/hbase/thrift/ThriftServerRunner.java @@ -100,6 +100,7 @@ import org.apache.hadoop.hbase.thrift.generated.TScan; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.ConnectionCache; import org.apache.hadoop.hbase.util.DNS; +import org.apache.hadoop.hbase.util.GcTimeMonitor; import org.apache.hadoop.hbase.util.JvmPauseMonitor; import org.apache.hadoop.hbase.util.Strings; import org.apache.hadoop.security.SaslRpcServer.SaslGssCallbackHandler; @@ -204,6 +205,7 @@ public class ThriftServerRunner implements Runnable { private final boolean doAsEnabled; private final JvmPauseMonitor pauseMonitor; + private final GcTimeMonitor gcTimeMonitor; /** An enum of server implementation selections */ enum ImplType { @@ -319,6 +321,8 @@ public class ThriftServerRunner implements Runnable { this.listenPort = conf.getInt(PORT_CONF_KEY, DEFAULT_LISTEN_PORT); this.metrics = new ThriftMetrics(conf, ThriftMetrics.ThriftServerType.ONE); this.pauseMonitor = new JvmPauseMonitor(conf, this.metrics.getSource()); + this.gcTimeMonitor = new GcTimeMonitor(conf, metrics.getSource()); + this.hbaseHandler = new HBaseHandler(conf, userProvider); this.hbaseHandler.initMetrics(metrics); this.handler = HbaseHandlerMetricsProxy.newInstance( @@ -361,6 +365,7 @@ public class ThriftServerRunner implements Runnable { public Object run() { try { pauseMonitor.start(); + gcTimeMonitor.start(); if (conf.getBoolean(USE_HTTP_CONF_KEY, false)) { setupHTTPServer(); httpServer.start(); @@ -384,6 +389,9 @@ public class ThriftServerRunner implements Runnable { if (pauseMonitor != null) { pauseMonitor.stop(); } + if (gcTimeMonitor != null) { + gcTimeMonitor.stop(); + } if (tserver != null) { tserver.stop(); tserver = null; diff --git a/hbase-thrift/src/main/java/org/apache/hadoop/hbase/thrift2/ThriftServer.java b/hbase-thrift/src/main/java/org/apache/hadoop/hbase/thrift2/ThriftServer.java index 24bff56e75..71ca2a1a18 100644 --- a/hbase-thrift/src/main/java/org/apache/hadoop/hbase/thrift2/ThriftServer.java +++ b/hbase-thrift/src/main/java/org/apache/hadoop/hbase/thrift2/ThriftServer.java @@ -61,6 +61,7 @@ import org.apache.hadoop.hbase.thrift.THBaseThreadPoolExecutor; import org.apache.hadoop.hbase.thrift.ThriftMetrics; import org.apache.hadoop.hbase.thrift2.generated.THBaseService; import org.apache.hadoop.hbase.util.DNS; +import org.apache.hadoop.hbase.util.GcTimeMonitor; import org.apache.hadoop.hbase.util.JvmPauseMonitor; import org.apache.hadoop.hbase.util.Strings; import org.apache.hadoop.security.SaslRpcServer.SaslGssCallbackHandler; @@ -473,6 +474,7 @@ public class ThriftServer { ThriftMetrics metrics = new ThriftMetrics(conf, ThriftMetrics.ThriftServerType.TWO); final JvmPauseMonitor pauseMonitor = new JvmPauseMonitor(conf, metrics.getSource()); + final GcTimeMonitor gcTimeMonitor = new GcTimeMonitor(conf, metrics.getSource()); String implType = "threadpool"; if (nonblocking) { @@ -593,10 +595,12 @@ public class ThriftServer { @Override public Object run() { pauseMonitor.start(); + gcTimeMonitor.start(); try { tserver.serve(); return null; } finally { + gcTimeMonitor.stop(); pauseMonitor.stop(); } } -- 2.13.4