From 3eba2ab2946c2625b14a32e6df345db08e94c1f1 Mon Sep 17 00:00:00 2001 From: Caroline Zhou Date: Wed, 9 Dec 2020 20:07:57 -0800 Subject: [PATCH] HBASE-25329 Dump region hashes in logs for the regions that are stuck in transition for more than a configured amount of time --- .../MetricsAssignmentManagerSource.java | 4 +++ .../MetricsAssignmentManagerSourceImpl.java | 8 ++++++ .../master/MetricsAssignmentManager.java | 10 ++++++- .../master/assignment/AssignmentManager.java | 18 ++++++++++++- .../hbase/master/assignment/RegionStates.java | 26 +++++++++++++++++++ .../master/TestAssignmentManagerMetrics.java | 11 +++++++- 6 files changed, 74 insertions(+), 3 deletions(-) diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java index 822e407abd..7396d89dd8 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hbase.master; +import java.util.Set; import org.apache.hadoop.hbase.metrics.BaseSource; import org.apache.hadoop.hbase.metrics.OperationMetrics; import org.apache.yetus.audience.InterfaceAudience; @@ -52,6 +53,7 @@ public interface MetricsAssignmentManagerSource extends BaseSource { String RIT_DURATION_NAME = "ritDuration"; String DEAD_SERVER_OPEN_REGIONS = "deadServerOpenRegions"; String UNKNOWN_SERVER_OPEN_REGIONS = "unknownServerOpenRegions"; + String RIT_HASHES_AND_STATES_NAME = "ritHashesAndStates"; String RIT_COUNT_DESC = "Current number of Regions In Transition (Gauge)."; String RIT_COUNT_OVER_THRESHOLD_DESC = @@ -98,6 +100,8 @@ public interface MetricsAssignmentManagerSource extends BaseSource { void updateUnknownServerOpenRegions(int unknownRegions); + void updateRITHashesAndStates(Set ritHashesAndStates); + /** * TODO: Remove. This may not be needed now as assign and unassign counts are tracked separately * Increment the count of operations (assign/unassign). diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java index 59b364725c..d5b8eb952c 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hbase.master; +import java.util.Set; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hbase.metrics.BaseSourceImpl; import org.apache.hadoop.hbase.metrics.OperationMetrics; import org.apache.hadoop.metrics2.MetricHistogram; @@ -120,6 +122,12 @@ public class MetricsAssignmentManagerSourceImpl unknownServerOpenRegions.set(unknownRegions); } + @Override + public void updateRITHashesAndStates(Set ritHashesAndStates) { + metricsRegistry.tag(RIT_HASHES_AND_STATES_NAME, "", + StringUtils.join(ritHashesAndStates, ";"), true); + } + @Override public OperationMetrics getAssignMetrics() { return assignMetrics; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java index 38aeef218e..27ce716015 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java @@ -19,7 +19,7 @@ package org.apache.hadoop.hbase.master; import static org.apache.hadoop.hbase.master.MetricsMaster.convertToProcedureMetrics; - +import java.util.Set; import org.apache.hadoop.hbase.CompatibilitySingletonFactory; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.yetus.audience.InterfaceAudience; @@ -88,6 +88,14 @@ public class MetricsAssignmentManager { assignmentManagerSource.updateRitDuration(duration); } + /** + * update the region hashes and states for regions in transition + * @param ritHashesAndStates set of "rit_hash:rit_state" + */ + public void updateRITHashesAndStates(Set ritHashesAndStates) { + assignmentManagerSource.updateRITHashesAndStates(ritHashesAndStates); + } + /* * TODO: Remove. This may not be required as assign and unassign operations are tracked separately * Increment the count of assignment operation (assign/unassign). diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 47b81f5d26..aa3cbab3e9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1358,6 +1358,7 @@ public class AssignmentManager { private long oldestRITTime = 0; private int totalRITsTwiceThreshold = 0; private int totalRITs = 0; + private Set oldestRITHashesAndStates = new HashSet<>(); @VisibleForTesting public RegionInTransitionStat(final Configuration conf) { @@ -1386,6 +1387,10 @@ public class AssignmentManager { return m != null ? m.size() : 0; } + public Set getOldestRITHashesAndStates() { + return oldestRITHashesAndStates; + } + public boolean hasRegionsTwiceOverThreshold() { return totalRITsTwiceThreshold > 0; } @@ -1420,11 +1425,12 @@ public class AssignmentManager { protected void update(final AssignmentManager am) { final RegionStates regionStates = am.getRegionStates(); this.statTimestamp = EnvironmentEdgeManager.currentTime(); - update(regionStates.getRegionsStateInTransition(), statTimestamp); + update(regionStates.getRegionsInTransitionOrderedByDuration(), statTimestamp); update(regionStates.getRegionFailedOpen(), statTimestamp); } private void update(final Collection regions, final long currentTime) { + int counter = 0; for (RegionState state: regions) { totalRITs++; final long ritTime = currentTime - state.getStamp(); @@ -1438,6 +1444,12 @@ public class AssignmentManager { if (oldestRITTime < ritTime) { oldestRITTime = ritTime; } + if (counter < 500) { // Record 500 oldest RITs + oldestRITHashesAndStates.add( + state.getRegion().getRegionNameAsString() + ":" + state.getState().name() + ); + } + counter += 1; } } } @@ -1446,6 +1458,10 @@ public class AssignmentManager { metrics.updateRITOldestAge(ritStat.getOldestRITTime()); metrics.updateRITCount(ritStat.getTotalRITs()); metrics.updateRITCountOverThreshold(ritStat.getTotalRITsOverThreshold()); + if ((EnvironmentEdgeManager.currentTime() - ritStat.ritThreshold / 2) >= ritStat.statTimestamp){ + LOG.debug("Oldest RIT hashes and states: "+ritStat.getOldestRITHashesAndStates().toString()); + metrics.updateRITHashesAndStates(ritStat.getOldestRITHashesAndStates()); + } } private void updateDeadServerRegionMetrics(int deadRegions, int unknownRegions) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index 54765402fb..9d12dfc87e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -70,6 +70,21 @@ public class RegionStates { public final static RegionStateStampComparator REGION_STATE_STAMP_COMPARATOR = new RegionStateStampComparator(); + // This comparator sorts the RegionStates by duration then Region name. + // Comparing by duration alone can lead us to discard different RegionStates that happen + // to share a duration. + private static class RegionStateDurationComparator implements Comparator { + @Override + public int compare(RegionState l, RegionState r) { + return Long.compare(l.getRitDuration(), r.getRitDuration()) == 0 ? + Bytes.compareTo(l.getRegion().getRegionName(), r.getRegion().getRegionName()) : + Long.compare(l.getRitDuration(), r.getRitDuration()); + } + } + + public final static RegionStateDurationComparator REGION_STATE_DURATION_COMPARATOR = + new RegionStateDurationComparator(); + // TODO: Replace the ConcurrentSkipListMaps /** * RegionName -- i.e. RegionInfo.getRegionName() -- as bytes to {@link RegionStateNode} @@ -648,6 +663,17 @@ public class RegionStates { return rit; } + /** + * Get regions in transition and their states, sorted by duration desc + */ + public synchronized SortedSet getRegionsInTransitionOrderedByDuration() { + final SortedSet rit = new TreeSet(Collections.reverseOrder(REGION_STATE_DURATION_COMPARATOR)); + for (RegionStateNode node: regionInTransition.values()) { + rit.add(node.toRegionState()); + } + return rit; + } + // ========================================================================== // Region offline helpers // ========================================================================== diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java index 079bf93e79..c1c25f546b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java @@ -78,7 +78,7 @@ public class TestAssignmentManagerMetrics { conf.setBoolean(TableDescriptorChecker.TABLE_SANITY_CHECKS, false); // set RIT stuck warning threshold to a small value - conf.setInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 20); + conf.setInt(HConstants.METRICS_RIT_STUCK_WARNING_THRESHOLD, 1); // set msgInterval to 1 second conf.setInt("hbase.regionserver.msginterval", MSG_INTERVAL); @@ -131,6 +131,13 @@ public class TestAssignmentManagerMetrics { METRICS_HELPER.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_NAME, 0, amSource); METRICS_HELPER.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_OVER_THRESHOLD_NAME, 0, amSource); + METRICS_HELPER.assertTag(MetricsAssignmentManagerSource.RIT_HASHES_AND_STATES_NAME, + "", amSource); + + // the region that should be in "FAILED_OPEN" state after altering table with non-existing + // coprocessor + String ritHashAndState = TEST_UTIL.getHBaseCluster().getRegions(TABLENAME).get(0) + .getRegionInfo().getRegionNameAsString() + ":" + "FAILED_OPEN"; // alter table with a non-existing coprocessor @@ -163,6 +170,8 @@ public class TestAssignmentManagerMetrics { amSource); METRICS_HELPER.assertCounter(MetricsAssignmentManagerSource.ASSIGN_METRIC_PREFIX + "SubmittedCount", 3, amSource); + METRICS_HELPER.assertTag(MetricsAssignmentManagerSource.RIT_HASHES_AND_STATES_NAME, + ritHashAndState, amSource); } } } -- 2.28.0