From 37a974eeeb57855182a69e5b2368f340da3a5131 Mon Sep 17 00:00:00 2001 From: Caroline Zhou Date: Tue, 8 Dec 2020 15:59:23 -0800 Subject: [PATCH] HBASE-25329 Dump region hashes in logs for the regions that are stuck in transition for more than a configured amount of time --- .../MetricsAssignmentManagerSource.java | 6 +++- .../MetricsAssignmentManagerSourceImpl.java | 10 ++++++ .../hbase/master/AssignmentManager.java | 20 ++++++++++- .../master/MetricsAssignmentManager.java | 9 +++++ .../hadoop/hbase/master/RegionStates.java | 36 ++++++++++++++++--- .../master/TestAssignmentManagerMetrics.java | 10 +++++- 6 files changed, 83 insertions(+), 8 deletions(-) diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java index 08b4ac5dc1..ecf140ef3e 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.master; +import java.util.Set; import org.apache.hadoop.hbase.metrics.BaseSource; public interface MetricsAssignmentManagerSource extends BaseSource { @@ -40,7 +41,7 @@ public interface MetricsAssignmentManagerSource extends BaseSource { /** * Description */ - String METRICS_DESCRIPTION = "Metrics about HBase master assingment manager."; + String METRICS_DESCRIPTION = "Metrics about HBase master assignment manager."; String RIT_COUNT_NAME = "ritCount"; String RIT_COUNT_OVER_THRESHOLD_NAME = "ritCountOverThreshold"; @@ -48,6 +49,7 @@ public interface MetricsAssignmentManagerSource extends BaseSource { String RIT_DURATION_NAME = "ritDuration"; String ASSIGN_TIME_NAME = "assign"; String BULK_ASSIGN_TIME_NAME = "bulkAssign"; + String RIT_HASHES_AND_STATES_NAME = "ritHashesAndStates"; void 
updateAssignmentTime(long time); @@ -75,4 +77,6 @@ public interface MetricsAssignmentManagerSource extends BaseSource { void setRITOldestAge(long age); void updateRitDuration(long duration); + + void updateRITHashesAndStates(Set ritHashesAndStates); } diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java index ab504f5605..432ccb54c8 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hbase.master; +import java.util.Set; +import org.apache.commons.lang.StringUtils; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.metrics.BaseSourceImpl; import org.apache.hadoop.metrics2.MetricHistogram; @@ -80,4 +82,12 @@ public class MetricsAssignmentManagerSourceImpl public void updateRitDuration(long duration) { ritDurationHisto.add(duration); } + + @Override + public void updateRITHashesAndStates(Set ritHashesAndStates) { + metricsRegistry.tag(RIT_HASHES_AND_STATES_NAME, "", + StringUtils.join(ritHashesAndStates, ";"), true); + } + + } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 842ce85687..53dcbd7e64 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -44,6 +44,7 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; +import com.google.common.collect.Sets; import 
org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -160,6 +161,8 @@ public class AssignmentManager extends ZooKeeperListener { Set replicasToClose = Collections.synchronizedSet(new HashSet()); + private long lastRITHashMetricUpdate = EnvironmentEdgeManager.currentTime(); + /** * Map of regions to reopen after the schema of a table is changed. Key - * encoded region name, value - HRegionInfo @@ -3543,9 +3546,11 @@ public class AssignmentManager extends ZooKeeperListener { int totalRITs = 0; int totalRITsOverThreshold = 0; long oldestRITTime = 0; + Set oldestRITHashesAndStates = Sets.newHashSet(); // set of rit_hash:rit_state int ritThreshold = this.server.getConfiguration(). getInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 60000); - for (RegionState state: regionStates.getRegionsInTransition()) { + int counter = 0; + for (RegionState state: regionStates.getRegionsInTransitionOrderedByDuration()) { totalRITs++; long ritTime = currentTime - state.getStamp(); if (ritTime > ritThreshold) { // more than the threshold @@ -3554,11 +3559,24 @@ public class AssignmentManager extends ZooKeeperListener { if (oldestRITTime < ritTime) { oldestRITTime = ritTime; } + if (counter < 500) { // Record 500 oldest RITs + oldestRITHashesAndStates.add( + state.getRegion().getRegionNameAsString() + ":" + state.getState().name() + ); + } + counter += 1; } if (this.metricsAssignmentManager != null) { this.metricsAssignmentManager.updateRITOldestAge(oldestRITTime); this.metricsAssignmentManager.updateRITCount(totalRITs); this.metricsAssignmentManager.updateRITCountOverThreshold(totalRITsOverThreshold); + + LOG.debug("Oldest RIT hashes and states: " + oldestRITHashesAndStates.toString()); + long time = EnvironmentEdgeManager.currentTime(); + if ((time - ritThreshold / 2) >= this.lastRITHashMetricUpdate) { + this.metricsAssignmentManager.updateRITHashesAndStates(oldestRITHashesAndStates); + this.lastRITHashMetricUpdate = time; + }
} } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java index 40e79aeff4..72cf75cdda 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.master; +import java.util.Set; import org.apache.hadoop.hbase.CompatibilitySingletonFactory; public class MetricsAssignmentManager { @@ -72,4 +73,12 @@ public class MetricsAssignmentManager { public void updateRitDuration(long duration) { assignmentManagerSource.updateRitDuration(duration); } + + /** + * update the region hashes and states for regions in transition + * @param ritHashesAndStates set of rit_hash:rit_state + */ + public void updateRITHashesAndStates(Set ritHashesAndStates) { + assignmentManagerSource.updateRITHashesAndStates(ritHashesAndStates); + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java index b8b49d70d3..eb6f5aa72e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java @@ -17,22 +17,22 @@ */ package org.apache.hadoop.hbase.master; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.Comparator; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; - -import 
com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -49,8 +49,8 @@ import org.apache.hadoop.hbase.client.Mutation; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.client.TableState; +import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.ConfigUtil; import org.apache.hadoop.hbase.util.FSUtils; @@ -73,6 +73,9 @@ public class RegionStates { public final static RegionStateStampComparator REGION_STATE_COMPARATOR = new RegionStateStampComparator(); + public final static RegionStateDurationComparator REGION_STATE_DURATION_COMPARATOR = + new RegionStateDurationComparator(); + // This comparator sorts the RegionStates by time stamp then Region name. // Comparing by timestamp alone can lead us to discard different RegionStates that happen // to share a timestamp. @@ -85,6 +88,18 @@ public class RegionStates { } } + // This comparator sorts the RegionStates by duration then Region name. + // Comparing by duration alone can lead us to discard different RegionStates that happen + // to share a duration. + private static class RegionStateDurationComparator implements Comparator { + @Override + public int compare(RegionState l, RegionState r) { + return Long.compare(l.getRitDuration(), r.getRitDuration()) == 0 ? + Bytes.compareTo(l.getRegion().getRegionName(), r.getRegion().getRegionName()) : + Long.compare(l.getRitDuration(), r.getRitDuration()); + } + } + /** * Regions currently in transition. 
*/ @@ -233,6 +248,17 @@ public class RegionStates { return new HashSet(regionsInTransition.values()); } + /** + * Get regions in transition and their states, sorted by duration desc + */ + public synchronized SortedSet getRegionsInTransitionOrderedByDuration() { + final TreeSet rit = new TreeSet(Collections.reverseOrder(REGION_STATE_DURATION_COMPARATOR)); + for (RegionState rs: regionsInTransition.values()) { + rit.add(rs); + } + return rit; + } + /** * Get all regions and their states */ diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java index 481745762b..51a82596c1 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java @@ -22,8 +22,8 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.CompatibilityFactory; import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MiniHBaseCluster; import org.apache.hadoop.hbase.TableName; @@ -109,6 +109,12 @@ public class TestAssignmentManagerMetrics { metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_NAME, 0, amSource); metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_OVER_THRESHOLD_NAME, 0, amSource); + metricsHelper.assertTag(MetricsAssignmentManagerSource.RIT_HASHES_AND_STATES_NAME, + "", amSource); + + // the region that should be in "FAILED_OPEN" state after altering table with non-existing coprocessor + String ritHashAndState = TEST_UTIL.getHBaseCluster().getRegions(TABLENAME).get(0) + 
.getRegionInfo().getRegionNameAsString() + ":" + "FAILED_OPEN"; // alter table with a non-existing coprocessor HTableDescriptor htd = new HTableDescriptor(TABLENAME); @@ -126,6 +132,8 @@ public class TestAssignmentManagerMetrics { metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_NAME, 1, amSource); metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_OVER_THRESHOLD_NAME, 1, amSource); + metricsHelper.assertTag(MetricsAssignmentManagerSource.RIT_HASHES_AND_STATES_NAME, + ritHashAndState, amSource); } finally { if (table != null) { -- 2.28.0