From 9978a2feabd5d6fae7ae6b237134b0b5c21aae16 Mon Sep 17 00:00:00 2001 From: Caroline Zhou Date: Wed, 9 Dec 2020 23:13:54 -0800 Subject: [PATCH] HBASE-25329 Dump region hashes in logs for the regions that are stuck in transition for more than a configured amount of time --- .../MetricsAssignmentManagerSource.java | 4 +++ .../MetricsAssignmentManagerSourceImpl.java | 8 ++++++ .../master/MetricsAssignmentManager.java | 9 +++++++ .../master/assignment/AssignmentManager.java | 18 ++++++++++++- .../hbase/master/assignment/RegionStates.java | 26 +++++++++++++++++++ .../master/TestAssignmentManagerMetrics.java | 11 +++++++- 6 files changed, 74 insertions(+), 2 deletions(-) diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java index 3a71c24b2d..83cdbc08ce 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.master; +import java.util.Set; import org.apache.hadoop.hbase.metrics.BaseSource; import org.apache.hadoop.hbase.metrics.OperationMetrics; import org.apache.yetus.audience.InterfaceAudience; @@ -52,6 +53,7 @@ public interface MetricsAssignmentManagerSource extends BaseSource { String RIT_DURATION_NAME = "ritDuration"; String DEAD_SERVER_OPEN_REGIONS = "deadServerOpenRegions"; String UNKNOWN_SERVER_OPEN_REGIONS = "unknownServerOpenRegions"; + String RIT_HASHES_AND_STATES_NAME = "ritHashesAndStates"; String RIT_COUNT_DESC = "Current number of Regions In Transition (Gauge)."; String RIT_COUNT_OVER_THRESHOLD_DESC = @@ -99,6 +101,8 @@ public interface MetricsAssignmentManagerSource extends BaseSource { void updateUnknownServerOpenRegions(int unknownRegions); + void updateRITHashesAndStates(Set ritHashesAndStates); + /** * TODO: Remove. This may not be needed now as assign and unassign counts are tracked separately * Increment the count of operations (assign/unassign). diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java index e8bc67825d..7b880c7997 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hbase.master; +import java.util.Set; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hbase.metrics.BaseSourceImpl; import org.apache.hadoop.hbase.metrics.OperationMetrics; import org.apache.hadoop.metrics2.MetricHistogram; @@ -120,6 +122,12 @@ public class MetricsAssignmentManagerSourceImpl unknownServerOpenRegions.set(unknownRegions); } + @Override + public void updateRITHashesAndStates(Set ritHashesAndStates) { + metricsRegistry.tag(RIT_HASHES_AND_STATES_NAME, "", + StringUtils.join(ritHashesAndStates, ";"), true); + } + @Override public OperationMetrics getAssignMetrics() { return assignMetrics; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java index 38aeef218e..0852ac8879 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.master; import static org.apache.hadoop.hbase.master.MetricsMaster.convertToProcedureMetrics; +import java.util.Set; import org.apache.hadoop.hbase.CompatibilitySingletonFactory; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.yetus.audience.InterfaceAudience; @@ -104,6 +105,14 @@ public class MetricsAssignmentManager { assignmentManagerSource.updateUnknownServerOpenRegions(unknownRegions); } + /** + * update the region hashes and states for regions in transition + * @param ritHashesAndStates set of "rit_hash:rit_state" + */ + public void updateRITHashesAndStates(Set ritHashesAndStates) { + assignmentManagerSource.updateRITHashesAndStates(ritHashesAndStates); + } + /** * @return Set of common metrics for assign procedure */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 355dfde013..709d5b27ba 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1367,6 +1367,7 @@ public class AssignmentManager { private long oldestRITTime = 0; private int totalRITsTwiceThreshold = 0; private int totalRITs = 0; + private Set oldestRITHashesAndStates = new HashSet<>(); @VisibleForTesting public RegionInTransitionStat(final Configuration conf) { @@ -1395,6 +1396,10 @@ public class AssignmentManager { return m != null ? m.size() : 0; } + public Set getOldestRITHashesAndStates() { + return oldestRITHashesAndStates; + } + public boolean hasRegionsTwiceOverThreshold() { return totalRITsTwiceThreshold > 0; } @@ -1429,11 +1434,12 @@ public class AssignmentManager { protected void update(final AssignmentManager am) { final RegionStates regionStates = am.getRegionStates(); this.statTimestamp = EnvironmentEdgeManager.currentTime(); - update(regionStates.getRegionsStateInTransition(), statTimestamp); + update(regionStates.getRegionsInTransitionOrderedByDuration(), statTimestamp); update(regionStates.getRegionFailedOpen(), statTimestamp); } private void update(final Collection regions, final long currentTime) { + int counter = 0; for (RegionState state: regions) { totalRITs++; final long ritStartedMs = state.getStamp(); @@ -1453,6 +1459,12 @@ public class AssignmentManager { if (oldestRITTime < ritTime) { oldestRITTime = ritTime; } + if (counter < 500) { // Record 500 oldest RITs + oldestRITHashesAndStates.add( + state.getRegion().getRegionNameAsString() + ":" + state.getState().name() + ); + } + counter += 1; } } } @@ -1461,6 +1473,10 @@ public class AssignmentManager { metrics.updateRITOldestAge(ritStat.getOldestRITTime()); metrics.updateRITCount(ritStat.getTotalRITs()); metrics.updateRITCountOverThreshold(ritStat.getTotalRITsOverThreshold()); + if ((EnvironmentEdgeManager.currentTime() - ritStat.ritThreshold / 2) >= ritStat.statTimestamp){ + LOG.debug("Oldest RIT hashes and states: "+ritStat.getOldestRITHashesAndStates().toString()); + metrics.updateRITHashesAndStates(ritStat.getOldestRITHashesAndStates()); + } } private void updateDeadServerRegionMetrics(int deadRegions, int unknownRegions) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index 06378002ec..ebe16f47d8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -70,6 +70,21 @@ public class RegionStates { public final static RegionStateStampComparator REGION_STATE_STAMP_COMPARATOR = new RegionStateStampComparator(); + // This comparator sorts the RegionStates by duration then Region name. + // Comparing by duration alone can lead us to discard different RegionStates that happen + // to share a duration. + private static class RegionStateDurationComparator implements Comparator { + @Override + public int compare(RegionState l, RegionState r) { + return Long.compare(l.getRitDuration(), r.getRitDuration()) == 0 ? + Bytes.compareTo(l.getRegion().getRegionName(), r.getRegion().getRegionName()) : + Long.compare(l.getRitDuration(), r.getRitDuration()); + } + } + + public final static RegionStateDurationComparator REGION_STATE_DURATION_COMPARATOR = + new RegionStateDurationComparator(); + private final Object regionsMapLock = new Object(); // TODO: Replace the ConcurrentSkipListMaps @@ -666,6 +681,17 @@ public class RegionStates { return rit; } + /** + * Get regions in transition and their states, sorted by duration desc + */ + public synchronized SortedSet getRegionsInTransitionOrderedByDuration() { + final SortedSet rit = new TreeSet(Collections.reverseOrder(REGION_STATE_DURATION_COMPARATOR)); + for (RegionStateNode node: regionInTransition.values()) { + rit.add(node.toRegionState()); + } + return rit; + } + // ========================================================================== // Region offline helpers // ========================================================================== diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java index c1e16bec9a..fc6100cbae 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java @@ -78,7 +78,7 @@ public class TestAssignmentManagerMetrics { conf.setBoolean(TableDescriptorChecker.TABLE_SANITY_CHECKS, false); // set RIT stuck warning threshold to a small value - conf.setInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 20); + conf.setInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 1); // set msgInterval to 1 second conf.setInt("hbase.regionserver.msginterval", MSG_INTERVAL); @@ -131,6 +131,13 @@ public class TestAssignmentManagerMetrics { METRICS_HELPER.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_NAME, 0, amSource); METRICS_HELPER.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_OVER_THRESHOLD_NAME, 0, amSource); + METRICS_HELPER.assertTag(MetricsAssignmentManagerSource.RIT_HASHES_AND_STATES_NAME, + "", amSource); + + // the region that should be in "FAILED_OPEN" state after altering table with non-existing + // coprocessor + String ritHashAndState = TEST_UTIL.getHBaseCluster().getRegions(TABLENAME).get(0) + .getRegionInfo().getRegionNameAsString() + ":" + "FAILED_OPEN"; // alter table with a non-existing coprocessor @@ -163,6 +170,8 @@ public class TestAssignmentManagerMetrics { amSource); METRICS_HELPER.assertCounter(MetricsAssignmentManagerSource.ASSIGN_METRIC_PREFIX + "SubmittedCount", 2, amSource); + METRICS_HELPER.assertTag(MetricsAssignmentManagerSource.RIT_HASHES_AND_STATES_NAME, + ritHashAndState, amSource); } } } -- 2.28.0