Index: src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (revision 1040296) +++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (working copy) @@ -93,6 +93,11 @@ private TimeoutMonitor timeoutMonitor; + /* + * Maximum number of times a failed assignment is retried via recursion in assign() below. + */ + private final int maximumAssignmentRecursions; + /** * Regions currently in transition. Map of encoded region names to the master * in-memory state for that region. @@ -158,6 +163,8 @@ Threads.setDaemonThreadRunning(timeoutMonitor, master.getServerName() + ".timeoutMonitor"); this.zkTable = new ZKTable(this.master.getZooKeeper()); + this.maximumAssignmentRecursions = + this.master.getConfiguration().getInt("hbase.assignment.maximum.recursions", 10); } /** @@ -811,9 +818,23 @@ /** * Caller must hold lock on the passed state object. * @param state + * @param setOfflineInZK + * @param forceNewPlan */ private void assign(final RegionState state, final boolean setOfflineInZK, final boolean forceNewPlan) { + assign(state, setOfflineInZK, forceNewPlan, new AtomicInteger(0)); + } + + /** + * Caller must hold lock on the passed state object. + * @param state + * @param setOfflineInZK + * @param forceNewPlan + * @param recursions Count of retries so far; capped by maximumAssignmentRecursions. + */ + private void assign(final RegionState state, final boolean setOfflineInZK, + final boolean forceNewPlan, final AtomicInteger recursions) { if (setOfflineInZK && !setOfflineInZooKeeper(state)) return; if (this.master.isStopped()) { LOG.debug("Server stopped; skipping assign of " + state); @@ -829,20 +850,26 @@ // Send OPEN RPC. This can fail if the server on other end is is not up. serverManager.sendRegionOpen(plan.getDestination(), state.getRegion()); } catch (Throwable t) { + // Up our assignment recursion count.
+ int retry = recursions.getAndIncrement(); LOG.warn("Failed assignment of " + state.getRegion().getRegionNameAsString() + " to " + - plan.getDestination() + ", trying to assign elsewhere instead", t); + plan.getDestination() + ", trying to assign elsewhere instead; retry=" + + retry, t); // Clean out plan we failed execute and one that doesn't look like it'll // succeed anyways; we need a new plan! // Transition back to OFFLINE state.update(RegionState.State.OFFLINE); - // Force a new plan and reassign. + // Force a new plan and reassign. Will return null if no servers. if (getRegionPlan(state, plan.getDestination(), true) == null) { LOG.warn("Unable to find a viable location to assign region " + - state.getRegion().getRegionNameAsString()); + state.getRegion().getRegionNameAsString()); return; } - assign(state, false, false); + if (recursions.get() < this.maximumAssignmentRecursions) { + assign(state, false, false, recursions); + } + // Otherwise just leave the region in RIT; on timeout, we'll retry later. } } @@ -926,7 +953,7 @@ // The remove below hinges on the fact that the call to // serverManager.getOnlineServersList() returns a copy if (serverToExclude != null) servers.remove(serverToExclude); - if (servers.size() <= 0) return null; + if (servers.isEmpty()) return null; RegionPlan randomPlan = new RegionPlan(state.getRegion(), null, LoadBalancer.randomAssignment(servers)); synchronized (this.regionPlans) {