diff --git a/bin/hbase-daemon.sh b/bin/hbase-daemon.sh
index e45054b..034375e 100755
--- a/bin/hbase-daemon.sh
+++ b/bin/hbase-daemon.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/usr/bin/env bash -x
#
#/**
# * Copyright 2007 The Apache Software Foundation
diff --git a/conf/hbase-site.xml b/conf/hbase-site.xml
index c516ac7..2d0518d 100644
--- a/conf/hbase-site.xml
+++ b/conf/hbase-site.xml
@@ -21,4 +21,20 @@
*/
-->
<configuration>
+  <property>
+    <name>hbase.cluster.distributed</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>hbase.rootdir</name>
+    <value>hdfs://10.11.2.132:8020/apps/hbase</value>
+  </property>
+  <property>
+    <name>hbase.zookeeper.quorum</name>
+    <value>10.11.2.132</value>
+  </property>
+  <property>
+    <name>zookeeper.session.timeout</name>
+    <value>60000</value>
+  </property>
</configuration>
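
For reference, the four properties added above switch this checkout to a fully distributed setup: hbase.cluster.distributed turns off standalone mode, hbase.rootdir points HBase at an HDFS namenode, and hbase.zookeeper.quorum plus zookeeper.session.timeout control the ZooKeeper connection. A minimal sketch of how these values surface at runtime follows; the class name and output format are illustrative only, not part of the patch.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;

    public class PrintSiteConfig {
      public static void main(String[] args) {
        // HBaseConfiguration.create() layers hbase-site.xml from the classpath
        // on top of hbase-default.xml, so the values added above show up here.
        Configuration conf = HBaseConfiguration.create();
        System.out.println("distributed = " + conf.getBoolean("hbase.cluster.distributed", false));
        System.out.println("rootdir     = " + conf.get("hbase.rootdir"));
        System.out.println("zk quorum   = " + conf.get("hbase.zookeeper.quorum"));
        // The second argument is only a local fallback here, not HBase's default.
        System.out.println("zk timeout  = " + conf.getInt("zookeeper.session.timeout", 60000));
      }
    }
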
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/FavoredNodeAssignmentHelper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/FavoredNodeAssignmentHelper.java
index e0b8153..ba5a243 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/FavoredNodeAssignmentHelper.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/FavoredNodeAssignmentHelper.java
@@ -72,19 +72,19 @@ public class FavoredNodeAssignmentHelper {
public final static short FAVORED_NODES_NUM = 3;
public FavoredNodeAssignmentHelper(final List<ServerName> servers, Configuration conf) {
+ this(servers, new RackManager(conf));
+ }
+
+ public FavoredNodeAssignmentHelper(final List<ServerName> servers,
+ final RackManager rackManager) {
this.servers = servers;
- this.rackManager = new RackManager(conf);
+ this.rackManager = rackManager;
this.rackToRegionServerMap = new HashMap<String, List<ServerName>>();
this.regionServerToRackMap = new HashMap<ServerName, String>();
this.uniqueRackList = new ArrayList<String>();
this.random = new Random();
}
- // For unit tests
- void setRackManager(RackManager rackManager) {
- this.rackManager = rackManager;
- }
-
/**
* Perform full scan of the meta table similar to
* {@link MetaReader#fullScan(CatalogTracker, Set, boolean)} except that this is
@@ -204,25 +204,47 @@ public class FavoredNodeAssignmentHelper {
}
// Place the regions round-robin across the racks picking one server from each
- // rack at a time. For example, if 2 racks (r1 and r2) with 8 servers (s1..s8) each, it will
- // choose s1 from r1, s1 from r2, s2 from r1, s2 from r2, ...
+ // rack at a time. Start with a random rack, and a random server index that is
+ // reused across all racks, when choosing a primary. If a rack does not have a
+ // server at the current index, move on to the next rack, and so on.
+ // For example, with 4 racks (r1..r4) of 8 servers (s1..s8) each, one possible
+ // placement is r2:s5, r3:s5, r4:s5, r1:s5, r2:s6, r3:s6, ...
+ // If one rack had fewer servers, say r3 with only 3 servers, a possible
+ // placement is r2:s5, r4:s5, r1:s5, r2:s6, ... (r3 is skipped since it has no s5).
+ // The regions should end up distributed proportionally to the rack sizes.
void placePrimaryRSAsRoundRobin(Map<ServerName, List<HRegionInfo>> assignmentMap,
    Map<HRegionInfo, ServerName> primaryRSMap, List<HRegionInfo> regions) {
List<String> rackList = new ArrayList<String>(rackToRegionServerMap.size());
rackList.addAll(rackToRegionServerMap.keySet());
- Map<String, Integer> currentProcessIndexMap = new HashMap<String, Integer>();
- int rackIndex = 0;
+ int rackIndex = random.nextInt(rackList.size());
+ int maxRackSize = 0;
+ for (Map.Entry<String, List<ServerName>> r : rackToRegionServerMap.entrySet()) {
+ if (r.getValue().size() > maxRackSize) {
+ maxRackSize = r.getValue().size();
+ }
+ }
+ int numIterations = 0;
+ int firstServerIndex = random.nextInt(maxRackSize);
+ // Initialize the current processing host index.
+ int serverIndex = firstServerIndex;
for (HRegionInfo regionInfo : regions) {
- String rackName = rackList.get(rackIndex);
- // Initialize the current processing host index.
- int serverIndex = 0;
- // Restore the current process index from the currentProcessIndexMap
- Integer currentProcessIndex = currentProcessIndexMap.get(rackName);
- if (currentProcessIndex != null) {
- serverIndex = currentProcessIndex.intValue();
+ List<ServerName> currentServerList;
+ String rackName;
+ while (true) {
+ rackName = rackList.get(rackIndex);
+ numIterations++;
+ // Get the server list for the current rack
+ currentServerList = rackToRegionServerMap.get(rackName);
+
+ if (serverIndex >= currentServerList.size()) { //not enough machines in this rack
+ if (numIterations % rackList.size() == 0) {
+ if (++serverIndex >= maxRackSize) serverIndex = 0;
+ }
+ if ((++rackIndex) >= rackList.size()) {
+ rackIndex = 0; // reset the rack index to 0
+ }
+ } else break;
}
- // Get the server list for the current rack
- List<ServerName> currentServerList = rackToRegionServerMap.get(rackName);
// Get the current process region server
ServerName currentServer = currentServerList.get(serverIndex);
@@ -237,12 +259,9 @@ public class FavoredNodeAssignmentHelper {
regionsForServer.add(regionInfo);
// Set the next processing index
- if ((++serverIndex) >= currentServerList.size()) {
- // Reset the server index for the current rack
- serverIndex = 0;
+ if (numIterations % rackList.size() == 0) {
+ ++serverIndex;
}
- // Keep track of the next processing index
- currentProcessIndexMap.put(rackName, serverIndex);
if ((++rackIndex) >= rackList.size()) {
rackIndex = 0; // reset the rack index to 0
}
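
The loop added above is easier to see in isolation. The following self-contained sketch mirrors only the index arithmetic of the new placePrimaryRSAsRoundRobin: a random starting rack, a single server index shared by all racks, racks skipped while they have no server at that index, and the server index advanced once per full sweep of the rack list. The RoundRobinSketch class and the rack/server names are made up for illustration and are not part of the patch.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.Random;

    public class RoundRobinSketch {
      public static void main(String[] args) {
        // Three racks of unequal size; the first one plays the "small rack" role.
        List<List<String>> racks = Arrays.asList(
            Arrays.asList("r1s1", "r1s2", "r1s3"),
            Arrays.asList("r2s1", "r2s2", "r2s3", "r2s4", "r2s5"),
            Arrays.asList("r3s1", "r3s2", "r3s3", "r3s4", "r3s5"));

        Random random = new Random();
        int rackIndex = random.nextInt(racks.size());
        int maxRackSize = 0;
        for (List<String> rack : racks) {
          maxRackSize = Math.max(maxRackSize, rack.size());
        }
        int serverIndex = random.nextInt(maxRackSize);
        int numIterations = 0;

        List<String> placementOrder = new ArrayList<String>();
        for (int region = 0; region < 12; region++) {
          // Find the next rack that actually has a server at the current index.
          while (true) {
            List<String> currentRack = racks.get(rackIndex);
            numIterations++;
            if (serverIndex >= currentRack.size()) { // not enough machines in this rack
              if (numIterations % racks.size() == 0) {
                if (++serverIndex >= maxRackSize) serverIndex = 0;
              }
              if (++rackIndex >= racks.size()) rackIndex = 0;
            } else {
              break;
            }
          }
          placementOrder.add(racks.get(rackIndex).get(serverIndex));
          // Advance the shared server index once per full sweep over the racks.
          if (numIterations % racks.size() == 0) {
            ++serverIndex;
          }
          if (++rackIndex >= racks.size()) rackIndex = 0;
        }
        System.out.println(placementOrder);
      }
    }

Running it a few times shows the start point moving with the random choices while larger racks contribute roughly in proportion to their size, which is the behaviour the new comment aims for.
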
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/FavoredNodeLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/FavoredNodeLoadBalancer.java
index 6309779..c76ce44 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/FavoredNodeLoadBalancer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/FavoredNodeLoadBalancer.java
@@ -30,9 +30,13 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.LoadBalancer;
+import org.apache.hadoop.hbase.master.RackManager;
import org.apache.hadoop.hbase.master.RegionPlan;
+import org.apache.hadoop.hbase.master.balancer.FavoredNodes.Position;
+import org.apache.hadoop.hbase.util.Pair;
/**
* An implementation of the {@link LoadBalancer} that assigns favored nodes for
@@ -52,12 +56,12 @@ public class FavoredNodeLoadBalancer extends BaseLoadBalancer {
private static final Log LOG = LogFactory.getLog(FavoredNodeLoadBalancer.class);
private FavoredNodes globalFavoredNodesAssignmentPlan;
- private Configuration configuration;
+ private RackManager rackManager;
@Override
public void setConf(Configuration conf) {
- this.configuration = conf;
globalFavoredNodesAssignmentPlan = new FavoredNodes();
+ this.rackManager = new RackManager(conf);
}
@Override
@@ -76,13 +80,36 @@ public class FavoredNodeLoadBalancer extends BaseLoadBalancer {
Map<ServerName, List<HRegionInfo>> assignmentMap;
try {
FavoredNodeAssignmentHelper assignmentHelper =
- new FavoredNodeAssignmentHelper(servers, configuration);
+ new FavoredNodeAssignmentHelper(servers, rackManager);
assignmentHelper.initialize();
if (!assignmentHelper.canPlaceFavoredNodes()) {
return super.roundRobinAssignment(regions, servers);
}
+ // Segregate the regions into two types:
+ // 1. Regions that have a favored node assignment where at least one of the
+ //    favored nodes is still alive. In this case, try to adhere to the current
+ //    favored node assignment as much as possible - i.e., if the current
+ //    primary is gone, make the secondary or tertiary the new host for the
+ //    region (based on their current load). Note that we don't change the
+ //    favored node assignments here, even though one or more favored nodes
+ //    may currently be down. It is up to balanceCluster to do that hard work.
+ //    HDFS can handle the fact that some nodes in the favored nodes hint are
+ //    down; it will allocate some other DataNodes instead. In combination
+ //    with the stale-node settings for HDFS, we should be just fine.
+ // 2. Regions that currently don't have a favored node assignment. We will
+ //    need to come up with favored node assignments for them. The corner
+ //    case in (1) above is that all of a region's favored nodes are
+ //    unavailable; in that case we note that the region doesn't have favored
+ //    nodes.
+ Pair
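
The diff is truncated above, but the segregation described in the comment can be sketched on its own: split the regions into those that still have at least one live favored node and those that need a brand-new favored node assignment. The class, method, and parameter names below are illustrative and are not taken from the patch.

    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    import org.apache.hadoop.hbase.HRegionInfo;
    import org.apache.hadoop.hbase.ServerName;

    public class FavoredNodeSegregationSketch {
      // Returns the regions that fall into bucket (2): no favored nodes recorded,
      // or none of the recorded favored nodes is currently online.
      public static List<HRegionInfo> regionsNeedingNewFavoredNodes(
          Map<HRegionInfo, List<ServerName>> favoredNodesByRegion,
          List<ServerName> onlineServers) {
        // Note: ServerName equality includes the start code, so a production
        // version might want to match on host and port only.
        Set<ServerName> online = new HashSet<ServerName>(onlineServers);
        List<HRegionInfo> needNewAssignment = new ArrayList<HRegionInfo>();
        for (Map.Entry<HRegionInfo, List<ServerName>> entry : favoredNodesByRegion.entrySet()) {
          boolean anyFavoredNodeAlive = false;
          List<ServerName> favored = entry.getValue();
          if (favored != null) {
            for (ServerName favoredNode : favored) {
              if (online.contains(favoredNode)) {
                anyFavoredNodeAlive = true; // bucket (1): keep the existing assignment
                break;
              }
            }
          }
          if (!anyFavoredNodeAlive) {
            needNewAssignment.add(entry.getKey());
          }
        }
        return needNewAssignment;
      }
    }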