Index: conf/hadoop-metrics.properties =================================================================== --- conf/hadoop-metrics.properties (revision 1021242) +++ conf/hadoop-metrics.properties (working copy) @@ -16,6 +16,10 @@ # hbase.period=10 # hbase.fileName=/tmp/metrics_hbase.log +# HBase-specific configuration. How long before resetting long-running stats +# If this variable is left out, then the default is no expiration. +hbase.marathon=3600 + # Configuration of the "hbase" context for ganglia # Pick one: Ganglia 3.0 (former) or Ganglia 3.1 (latter) # hbase.class=org.apache.hadoop.metrics.ganglia.GangliaContext Index: src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java (revision 1021242) +++ src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java (working copy) @@ -25,6 +25,7 @@ import org.apache.hadoop.hbase.metrics.MetricsRate; import org.apache.hadoop.hbase.regionserver.wal.HLog; import org.apache.hadoop.hbase.util.Strings; +import org.apache.hadoop.metrics.ContextFactory; import org.apache.hadoop.metrics.MetricsContext; import org.apache.hadoop.metrics.MetricsRecord; import org.apache.hadoop.metrics.MetricsUtil; @@ -35,6 +36,7 @@ import org.apache.hadoop.metrics.util.MetricsRegistry; import org.apache.hadoop.metrics.util.MetricsTimeVaryingRate; +import java.io.IOException; import java.lang.management.ManagementFactory; import java.lang.management.MemoryUsage; @@ -50,6 +52,8 @@ private final Log LOG = LogFactory.getLog(this.getClass()); private final MetricsRecord metricsRecord; private long lastUpdate = System.currentTimeMillis(); + private long lastMarathon = System.currentTimeMillis(); + private long marathonPeriod = 0; private static final int MB = 1024*1024; private MetricsRegistry registry = new MetricsRegistry(); private final RegionServerStatistics statistics; @@ -134,6 +138,12 @@ public final MetricsTimeVaryingRate fsSyncLatency = new MetricsTimeVaryingRate("fsSyncLatency", registry); + /** + * time each scheduled compaction takes + */ + public final MetricsTimeVaryingRate compaction = + new MetricsTimeVaryingRate("compaction", registry); + public RegionServerMetrics() { MetricsContext context = MetricsUtil.getContext("hbase"); metricsRecord = MetricsUtil.createRecord(context, "regionserver"); @@ -145,6 +155,16 @@ // export for JMX statistics = new RegionServerStatistics(this.registry, name); + + // get custom attributes + try { + Object m = ContextFactory.getFactory().getAttribute("hbase.marathon"); + if (m instanceof String) { + this.marathonPeriod = Long.parseLong((String) m)*1000; + } + } catch (IOException ioe) { + LOG.info("Couldn't load ContextFactory for Metrics config info"); + } LOG.info("Initialized"); } @@ -157,10 +177,20 @@ /** * Since this object is a registered updater, this method will be called * periodically, e.g. every 5 seconds. - * @param unused unused argument + * @param caller the metrics context that this responsible for calling us */ - public void doUpdates(MetricsContext unused) { + public void doUpdates(MetricsContext caller) { synchronized (this) { + this.lastUpdate = System.currentTimeMillis(); + + // has the marathon period for long-living stats elapsed? + boolean marathonExpired = (this.marathonPeriod > 0) && + (this.lastUpdate - this.lastMarathon >= this.marathonPeriod); + if (marathonExpired) { + this.lastMarathon = this.lastUpdate; + this.resetAllMinMax(); + } + this.stores.pushMetric(this.metricsRecord); this.storefiles.pushMetric(this.metricsRecord); this.storefileIndexSizeMB.pushMetric(this.metricsRecord); @@ -196,15 +226,39 @@ this.fsReadLatency.pushMetric(this.metricsRecord); this.fsWriteLatency.pushMetric(this.metricsRecord); this.fsSyncLatency.pushMetric(this.metricsRecord); + this.compaction.pushMetric(this.metricsRecord); + + if (!marathonExpired) { + maintainStats(this.compaction); + } } this.metricsRecord.update(); - this.lastUpdate = System.currentTimeMillis(); } + + /* MetricsTimeVaryingRate will reset every time pushMetric() is called + * This is annoying for long-running stats that might not get a single + * operation in the polling period. This function ensures that values + * for those stat entries don't get reset. + */ + protected void maintainStats(MetricsTimeVaryingRate stat) { + int curOps = stat.getPreviousIntervalNumOps(); + if (curOps > 0) { + long curTime = stat.getPreviousIntervalAverageTime(); + long totalTime = curTime * curOps; + if (totalTime / curTime == curOps) { + stat.inc(curOps, totalTime); + } else { + LOG.info("Stats for " + stat.getName() + " overflowed! resetting"); + } + } + } public void resetAllMinMax() { this.atomicIncrementTime.resetMinMax(); this.fsReadLatency.resetMinMax(); this.fsWriteLatency.resetMinMax(); + this.fsSyncLatency.resetMinMax(); + this.compaction.resetMinMax(); } /** @@ -213,6 +267,15 @@ public float getRequests() { return this.requests.getPreviousIntervalValue(); } + + /** + * @param periodSec seconds that last compaction took + */ + public void addCompaction(final long periodSec) { + synchronized (this) { + this.compaction.inc(periodSec); + } + } /** * @param inc How much to add to requests. Index: src/main/java/org/apache/hadoop/hbase/regionserver/CompactSplitThread.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/regionserver/CompactSplitThread.java (revision 1021242) +++ src/main/java/org/apache/hadoop/hbase/regionserver/CompactSplitThread.java (working copy) @@ -102,6 +102,10 @@ if(!this.server.isStopped()) { // Don't interrupt us while we are working byte [] midKey = r.compactStores(); + long lastCompactPeriod = r.getLastCompactPeriod(); + if (lastCompactPeriod > 0) { // 0 == compaction aborted + this.server.getMetrics().addCompaction(lastCompactPeriod); + } if (shouldSplitRegion() && midKey != null && !this.server.isStopped()) { split(r, midKey); Index: src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java (revision 1021242) +++ src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java (working copy) @@ -172,6 +172,7 @@ * major compaction. Cleared each time through compaction code. */ private volatile boolean forceMajorCompaction = false; + private long lastCompactPeriod = 0; /* * Data structure of write state flags used coordinating flushes, @@ -603,6 +604,11 @@ return this.fs; } + /** @return how long the last compaction took */ + public long getLastCompactPeriod() { + return this.lastCompactPeriod; + } + /** @return the last time the region was flushed */ public long getLastFlushTime() { return this.lastFlushTime; @@ -741,6 +747,7 @@ LOG.info(((completed) ? "completed" : "aborted") + " compaction on region " + this + " after " + StringUtils.formatTimeDiff(now, startTime)); + this.lastCompactPeriod = (completed) ? (now - startTime) / 1000 : 0; } } finally { synchronized (writestate) {