diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 0e6f13d7b2..542642c2ea 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -162,6 +162,9 @@ import org.apache.hadoop.hbase.replication.master.TableCFsUpdater; import org.apache.hadoop.hbase.replication.regionserver.Replication; import org.apache.hadoop.hbase.security.AccessDeniedException; import org.apache.hadoop.hbase.security.UserProvider; +import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.hbase.shaded.com.google.common.collect.Lists; +import org.apache.hadoop.hbase.shaded.com.google.common.collect.Maps; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState; import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.RegionServerInfo; @@ -198,9 +201,6 @@ import org.eclipse.jetty.server.ServerConnector; import org.eclipse.jetty.servlet.ServletHolder; import org.eclipse.jetty.webapp.WebAppContext; -import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting; -import org.apache.hadoop.hbase.shaded.com.google.common.collect.Lists; -import org.apache.hadoop.hbase.shaded.com.google.common.collect.Maps; import com.google.protobuf.Descriptors; import com.google.protobuf.Service; @@ -1340,7 +1340,7 @@ public class HMaster extends HRegionServer implements MasterServices { public boolean balance(boolean force) throws IOException { // if master not initialized, don't run balancer. if (!isInitialized()) { - LOG.debug("Master has not been initialized, don't run balancer."); + LOG.info("Master has not been initialized, don't run balancer."); return false; } @@ -1352,7 +1352,10 @@ public class HMaster extends HRegionServer implements MasterServices { int maxRegionsInTransition = getMaxRegionsInTransition(); synchronized (this.balancer) { // If balance not true, don't run balancer. - if (!this.loadBalancerTracker.isBalancerOn()) return false; + if (!this.loadBalancerTracker.isBalancerOn()) { + LOG.info("Not runnning balancer, because balancer is off."); + return false; + } // Only allow one balance run at at time. if (this.assignmentManager.hasRegionsInTransition()) { List regionsInTransition = assignmentManager.getRegionsInTransition(); @@ -1369,7 +1372,10 @@ public class HMaster extends HRegionServer implements MasterServices { } LOG.info(prefix + "unning balancer because " + regionsInTransition.size() + " region(s) in transition: " + toPrint + (truncated? "(truncated list)": "")); - if (!force || metaInTransition) return false; + if (!force || metaInTransition) { + LOG.info("Not running balancer, Meta is in transition."); + return false; + } } if (this.serverManager.areDeadServersInProgress()) { LOG.info("Not running balancer because processing dead regionserver(s): " + @@ -1380,7 +1386,7 @@ public class HMaster extends HRegionServer implements MasterServices { if (this.cpHost != null) { try { if (this.cpHost.preBalance()) { - LOG.debug("Coprocessor bypassing balancer request"); + LOG.info("Coprocessor bypassing balancer request"); return false; } } catch (IOException ioe) { @@ -1391,13 +1397,15 @@ public class HMaster extends HRegionServer implements MasterServices { Map>> assignmentsByTable = this.assignmentManager.getRegionStates().getAssignmentsByTable(); + augmentAssignments(assignmentsByTable); List plans = new ArrayList<>(); - //Give the balancer the current cluster state. + // Give the balancer the current cluster state. + // It should handle dead servers, which can be present + // assignmentsByTable this.balancer.setClusterStatus(getClusterStatus()); - this.balancer.setClusterLoad( - this.assignmentManager.getRegionStates().getAssignmentsByTable()); + this.balancer.setClusterLoad(assignmentsByTable); for (Entry>> e : assignmentsByTable.entrySet()) { List partialPlans = this.balancer.balanceCluster(e.getKey(), e.getValue()); @@ -1442,11 +1450,50 @@ public class HMaster extends HRegionServer implements MasterServices { } } } + //Remove dead servers from AM.regionStates + refreshAssignmentManager(); // If LoadBalancer did not generate any plans, it means the cluster is already balanced. // Return true indicating a success. return true; } + private void refreshAssignmentManager() throws InterruptedIOException { + RegionStates regionStates = assignmentManager.getRegionStates(); + ClusterStatus clusterStatus = getClusterStatus(); + regionStates.update(clusterStatus); + } + + /** + * Adds newly added servers (with zero assigned regions) + * @param assignmentsByTable + * @throws InterruptedIOException + */ + private void augmentAssignments( + Map>> assignmentsByTable) + throws InterruptedIOException { + + ClusterStatus status = null; + try { + status = getClusterStatus(); + } catch (InterruptedIOException e) { + Thread.currentThread().interrupt(); + throw e; + } + Collection servers = status.getServers(); + // For every table + for (TableName table : assignmentsByTable.keySet()) { + Map> serverMap = assignmentsByTable.get(table); + Set keySet = serverMap.keySet(); + for (ServerName sn : servers) { + if (keySet.contains(sn)) { + continue; + } + List list = new ArrayList(); + serverMap.put(sn, list); + } + } + } + @Override @VisibleForTesting public RegionNormalizer getRegionNormalizer() { diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index df55c94b19..e97dd5291e 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -24,12 +24,13 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; @@ -37,6 +38,7 @@ import java.util.concurrent.ConcurrentSkipListMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.ClusterStatus; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.ServerName; @@ -46,11 +48,10 @@ import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.procedure2.ProcedureEvent; +import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; -import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting; - /** * RegionStates contains a set of Maps that describes the in-memory state of the AM, with * the regions available in the system, the region in transition, the offline regions and @@ -185,7 +186,7 @@ public class RegionStates { this.openSeqNum = seqId; } - + public ServerName setRegionLocation(final ServerName serverName) { ServerName lastRegionLocation = this.regionLocation; if (LOG.isTraceEnabled() && serverName == null) { @@ -277,7 +278,7 @@ public class RegionStates { public String toString() { return toDescriptiveString(); } - + public String toShortString() { // rit= is the current Region-In-Transition State -- see State enum. return String.format("rit=%s, location=%s", getState(), getRegionLocation()); @@ -613,6 +614,20 @@ public class RegionStates { } } + + public void update(ClusterStatus status) { + Collection servers = status.getServers(); + List toDelete = new ArrayList(); + for (ServerName name: serverMap.keySet()) { + if(!servers.contains(name)) { + toDelete.add(name); + } + } + for(ServerName name: toDelete) { + serverMap.remove(name); + } + } + // ============================================================================================ // TODO: // ============================================================================================ @@ -731,6 +746,10 @@ public class RegionStates { public Map>> getAssignmentsByTable() { final Map>> result = new HashMap<>(); + Enumeration en = serverMap.keys(); + while (en.hasMoreElements()) { + LOG.debug("server: "+ en.nextElement()); + } for (RegionStateNode node: regionsMap.values()) { Map> tableResult = result.get(node.getTable()); if (tableResult == null) { @@ -931,12 +950,27 @@ public class RegionStates { int numServers = 0; int totalLoad = 0; for (ServerStateNode node: serverMap.values()) { + if (isMaster(node)) { + // Skip Master when calculate load + continue; + } totalLoad += node.getRegionCount(); numServers++; } + return numServers == 0 ? 0.0: (double)totalLoad / (double)numServers; } + private boolean isMaster(ServerStateNode node) { + Set nodes = node.getRegions(); + for (RegionStateNode rn : nodes) { + if (rn.getRegionInfo().isMetaRegion()) { + return true; + } + } + return false; + } + public ServerStateNode addRegionToServer(final ServerName serverName, final RegionStateNode regionNode) { ServerStateNode serverNode = getOrCreateServer(serverName); diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java hbase-server/src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java index cff1a8d744..e8c913f7bd 100644 --- hbase-server/src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java +++ hbase-server/src/test/java/org/apache/hadoop/hbase/TestRegionRebalancing.java @@ -42,7 +42,6 @@ import org.apache.hadoop.hbase.util.JVMClusterUtil; import org.apache.hadoop.hbase.util.Threads; import org.junit.After; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.runner.RunWith; @@ -52,8 +51,7 @@ import org.junit.runners.Parameterized.Parameters; /** * Test whether region re-balancing works. (HBASE-71) */ -@Ignore // This is broken since new RegionServers does proper average of regions -// and because Master is treated as a regionserver though it hosts two regions only. + @Category({FlakeyTests.class, LargeTests.class}) @RunWith(value = Parameterized.class) public class TestRegionRebalancing { @@ -61,7 +59,7 @@ public class TestRegionRebalancing { @Parameters public static Collection data() { Object[][] balancers = - new String[][] { { "org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer" }, + new String[][] {{ "org.apache.hadoop.hbase.master.balancer.SimpleLoadBalancer" }, { "org.apache.hadoop.hbase.master.balancer.StochasticLoadBalancer" } }; return Arrays.asList(balancers); } @@ -86,6 +84,7 @@ public class TestRegionRebalancing { @Before public void before() throws Exception { UTIL.getConfiguration().set("hbase.master.loadbalancer.class", this.balancerName); + UTIL.getConfiguration().setFloat("hbase.regions.slop", 0.1f); // set minCostNeedBalance to 0, make sure balancer run UTIL.startMiniCluster(1); this.desc = new HTableDescriptor(TableName.valueOf("test")); @@ -119,24 +118,28 @@ public class TestRegionRebalancing { // add a region server - total of 2 LOG.info("Started second server=" + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); - UTIL.getHBaseCluster().getMaster().balance(); + + assert(balance() == true); assertRegionsAreBalanced(); // On a balanced cluster, calling balance() should return true - assert(UTIL.getHBaseCluster().getMaster().balance() == true); + assert(balance() == true); + // if we add a server, then the balance() call should return true // add a region server - total of 3 LOG.info("Started third server=" + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); - assert(UTIL.getHBaseCluster().getMaster().balance() == true); + //assert(UTIL.getHBaseCluster().getMaster().balance() == true); + assert(balance() == true); assertRegionsAreBalanced(); // kill a region server - total of 2 LOG.info("Stopped third server=" + UTIL.getHBaseCluster().stopRegionServer(2, false)); UTIL.getHBaseCluster().waitOnRegionServer(2); waitOnCrashProcessing(); - UTIL.getHBaseCluster().getMaster().balance(); + assert(balance() == true); + assertRegionsAreBalanced(); // start two more region servers - total of 4 @@ -145,18 +148,30 @@ public class TestRegionRebalancing { LOG.info("Added fourth server=" + UTIL.getHBaseCluster().startRegionServer().getRegionServer().getServerName()); waitOnCrashProcessing(); - assert(UTIL.getHBaseCluster().getMaster().balance() == true); + assert(balance() == true); assertRegionsAreBalanced(); for (int i = 0; i < 6; i++){ LOG.info("Adding " + (i + 5) + "th region server"); UTIL.getHBaseCluster().startRegionServer(); } - assert(UTIL.getHBaseCluster().getMaster().balance() == true); + assert(balance() == true); assertRegionsAreBalanced(); regionLocator.close(); } } + private boolean balance() throws InterruptedException, IOException { + long startTime = System.currentTimeMillis(); + long timeout = 60000 ; // 60 sec + while (System.currentTimeMillis() - startTime < timeout) { + if (UTIL.getHBaseCluster().getMaster().balance()) { + return true; + } + Thread.sleep(100); + } + return false; + } + /** * Wait on crash processing. Balancer won't run if processing a crashed server. */ @@ -171,12 +186,13 @@ public class TestRegionRebalancing { * Determine if regions are balanced. Figure out the total, divide by the * number of online servers, then test if each server is +/- 1 of average * rounded up. + * @throws InterruptedException */ - private void assertRegionsAreBalanced() throws IOException { + private void assertRegionsAreBalanced() throws IOException, InterruptedException { // TODO: Fix this test. Old balancer used to run with 'slop'. New // balancer does not. boolean success = false; - float slop = (float)UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f); + float slop = UTIL.getConfiguration().getFloat("hbase.regions.slop", 0.1f); if (slop <= 0) slop = 1; for (int i = 0; i < 5; i++) { @@ -185,8 +201,13 @@ public class TestRegionRebalancing { waitForAllRegionsAssigned(); long regionCount = UTIL.getMiniHBaseCluster().countServedRegions(); + // Less two system regions meta and ns + regionCount -= 2; List servers = getOnlineRegionServers(); double avg = UTIL.getHBaseCluster().getMaster().getAverageLoad(); + if (avg > regionCount) { + throw new IOException("Unexpected avg load: "+ avg); + } int avgLoadPlusSlop = (int)Math.ceil(avg * (1 + slop)); int avgLoadMinusSlop = (int)Math.floor(avg * (1 - slop)) - 1; LOG.debug("There are " + servers.size() + " servers and " + regionCount @@ -202,7 +223,6 @@ public class TestRegionRebalancing { for (HRegionInfo hri : ProtobufUtil.getOnlineRegions(server.getRSRpcServices())) { if (hri.isMetaRegion()) serverLoad--; - // LOG.debug(hri.getRegionNameAsString()); } if (!(serverLoad <= avgLoadPlusSlop && serverLoad >= avgLoadMinusSlop)) { LOG.debug(server.getServerName() + " Isn't balanced!!! Avg: " + avg + @@ -220,7 +240,7 @@ public class TestRegionRebalancing { Thread.sleep(10000); } catch (InterruptedException e) {} - UTIL.getHBaseCluster().getMaster().balance(); + balance(); continue; }