From 74325f66cac035d21c89fdff5815c3f297926558 Mon Sep 17 00:00:00 2001 From: Thiruvel Thirumoolan Date: Tue, 22 May 2018 18:28:31 -0700 Subject: [PATCH] HBASE-20548 Master fails to startup on large clusters, refreshing block distribution --- .../hbase/rsgroup/RSGroupBasedLoadBalancer.java | 5 +++++ .../org/apache/hadoop/hbase/master/HMaster.java | 11 +++++++++ .../apache/hadoop/hbase/master/LoadBalancer.java | 5 +++++ .../hbase/master/balancer/BaseLoadBalancer.java | 26 ++++++++++++++-------- 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java index 9f8f427dfc..c73116d27c 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupBasedLoadBalancer.java @@ -439,4 +439,9 @@ public class RSGroupBasedLoadBalancer implements RSGroupableBalancer, LoadBalanc public boolean isStopped() { return false; } + + @Override + public void postMasterStartupInitialize() { + this.internalBalancer.postMasterStartupInitialize(); + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 80bda8d26b..59e4437354 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -932,6 +932,17 @@ public class HMaster extends HRegionServer implements MasterServices, Server { } zombieDetector.interrupt(); + + /* + * After master has started up, lets do balancer post startup initialization. Since this runs + * in activeMasterManager thread, it should be fine. + */ + long start = System.currentTimeMillis(); + this.balancer.postMasterStartupInitialize(); + if (LOG.isDebugEnabled()) { + LOG.debug("Balancer post startup initialization complete, took " + ( + (System.currentTimeMillis() - start) / 1000) + " seconds"); + } } private void initQuotaManager() throws IOException { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java index e387f5950d..8531ff7642 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java @@ -152,4 +152,9 @@ public interface LoadBalancer extends Configurable, Stoppable, ConfigurationObse */ @Override void onConfigurationChange(Configuration conf); + + /** + * If balancer needs to do initialization after Master has started up, lets do that here. + */ + void postMasterStartupInitialize(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java index 41cbeaa83b..01df8a2b3a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java @@ -1267,6 +1267,19 @@ public abstract class BaseLoadBalancer implements LoadBalancer { } } + @Override + public void postMasterStartupInitialize() { + if (services != null && regionFinder != null) { + try { + Set regions = + services.getAssignmentManager().getRegionStates().getRegionAssignments().keySet(); + regionFinder.refreshAndWait(regions); + } catch (Exception e) { + LOG.warn("Refreshing region HDFS Block dist failed with exception, ignoring", e); + } + } + } + public void setRackManager(RackManager rackManager) { this.rackManager = rackManager; } @@ -1367,7 +1380,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { return assignments; } - Cluster cluster = createCluster(servers, regions, false); + Cluster cluster = createCluster(servers, regions); List unassignedRegions = new ArrayList(); roundRobinAssignment(cluster, regions, unassignedRegions, @@ -1413,11 +1426,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { return assignments; } - protected Cluster createCluster(List servers, - Collection regions, boolean forceRefresh) { - if (forceRefresh && useRegionFinder) { - regionFinder.refreshAndWait(regions); - } + protected Cluster createCluster(List servers, Collection regions) { // Get the snapshot of the current assignments for the regions in question, and then create // a cluster out of it. Note that we might have replicas already assigned to some servers // earlier. So we want to get the snapshot to see those assignments, but this will only contain @@ -1491,7 +1500,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { } List regions = Lists.newArrayList(regionInfo); - Cluster cluster = createCluster(servers, regions, false); + Cluster cluster = createCluster(servers, regions); return randomAssignment(cluster, regionInfo, servers); } @@ -1569,8 +1578,6 @@ public abstract class BaseLoadBalancer implements LoadBalancer { int numRandomAssignments = 0; int numRetainedAssigments = 0; - Cluster cluster = createCluster(servers, regions.keySet(), true); - for (Map.Entry entry : regions.entrySet()) { HRegionInfo region = entry.getKey(); ServerName oldServerName = entry.getValue(); @@ -1613,6 +1620,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { // If servers from prior assignment aren't present, then lets do randomAssignment on regions. if (randomAssignRegions.size() > 0) { + Cluster cluster = createCluster(servers, regions.keySet()); for (Map.Entry> entry : assignments.entrySet()) { ServerName sn = entry.getKey(); for (HRegionInfo region : entry.getValue()) { -- 2.15.1 (Apple Git-101)