NOTE(review): line breaks, indentation and generic type arguments were lost in the
extracted copy of this patch and are reconstructed below; hunk ranges are recomputed
for the added comments. Confirm context lines against the target tree before applying.

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 255ea5e..acb1617 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -135,6 +135,11 @@ public class AssignmentManager implements ServerListener {
       "hbase.assignment.maximum.attempts";
   private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = 10;
 
+  /** Config key: max unassign attempts per region (values below 1 are treated as 1). */
+  public static final String UNASSIGN_MAX_ATTEMPTS =
+      "hbase.unassignment.maximum.attempts";
+  private static final int DEFAULT_UNASSIGN_MAX_ATTEMPTS = 10;
+
   /** Region in Transition metrics threshold time */
   public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
       "hbase.metrics.rit.stuck.warning.threshold";
@@ -172,6 +177,7 @@ public class AssignmentManager implements ServerListener {
   private final int assignDispatchWaitQueueMaxSize;
   private final int assignDispatchWaitMillis;
   private final int assignMaxAttempts;
+  private final int unassignMaxAttempts;
 
   private final Object checkIfShouldMoveSystemRegionLock = new Object();
 
@@ -199,6 +205,8 @@ public class AssignmentManager implements ServerListener {
 
     this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS,
         DEFAULT_ASSIGN_MAX_ATTEMPTS));
+    this.unassignMaxAttempts = Math.max(1, conf.getInt(UNASSIGN_MAX_ATTEMPTS,
+        DEFAULT_UNASSIGN_MAX_ATTEMPTS));
 
     int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
         DEFAULT_RIT_CHORE_INTERVAL_MSEC);
@@ -289,6 +297,13 @@ public class AssignmentManager implements ServerListener {
     return assignMaxAttempts;
   }
 
+  /**
+   * @return the configured maximum number of unassign attempts (always at least 1)
+   */
+  protected int getUnassignMaxAttempts() {
+    return unassignMaxAttempts;
+  }
+
   /**
    * Add the listener to the notification list.
    * @param listener The AssignmentListener to register
@@ -1102,6 +1117,7 @@ public class AssignmentManager implements ServerListener {
       this.statTimestamp = EnvironmentEdgeManager.currentTime();
       update(regionStates.getRegionsStateInTransition(), statTimestamp);
       update(regionStates.getRegionFailedOpen(), statTimestamp);
+      update(regionStates.getRegionFailedClose(), statTimestamp);
     }
 
     private void update(final Collection<RegionState> regions, final long currentTime) {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
index df55c94..d961ca5 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java
@@ -417,8 +417,10 @@ public class RegionStates {
   private final ConcurrentSkipListMap<HRegionInfo, RegionStateNode> regionOffline =
     new ConcurrentSkipListMap<HRegionInfo, RegionStateNode>();
 
-  private final ConcurrentSkipListMap<byte[], RegionFailedOpen> regionFailedOpen =
-    new ConcurrentSkipListMap<byte[], RegionFailedOpen>(Bytes.BYTES_COMPARATOR);
+  private final ConcurrentSkipListMap<byte[], RegionFailedOperation> regionFailedOpen =
+    new ConcurrentSkipListMap<byte[], RegionFailedOperation>(Bytes.BYTES_COMPARATOR);
+  private final ConcurrentSkipListMap<byte[], RegionFailedOperation> regionFailedClose =
+    new ConcurrentSkipListMap<byte[], RegionFailedOperation>(Bytes.BYTES_COMPARATOR);
 
   private final ConcurrentHashMap<ServerName, ServerStateNode> serverMap =
       new ConcurrentHashMap<ServerName, ServerStateNode>();
@@ -842,13 +844,14 @@ public class RegionStates {
   // ==========================================================================
   //  Region FAIL_OPEN helpers
   // ==========================================================================
-  public static final class RegionFailedOpen {
+  /** Holds the exception and retry count recorded for a region whose open or close failed. */
+  public static final class RegionFailedOperation {
     private final RegionStateNode regionNode;
 
     private volatile Exception exception = null;
     private volatile int retries = 0;
 
-    public RegionFailedOpen(final RegionStateNode regionNode) {
+    public RegionFailedOperation(final RegionStateNode regionNode) {
       this.regionNode = regionNode;
     }
 
@@ -877,18 +880,18 @@ public class RegionStates {
     }
   }
 
-  public RegionFailedOpen addToFailedOpen(final RegionStateNode regionNode) {
+  public RegionFailedOperation addToFailedOpen(final RegionStateNode regionNode) {
     final byte[] key = regionNode.getRegionInfo().getRegionName();
-    RegionFailedOpen node = regionFailedOpen.get(key);
+    RegionFailedOperation node = regionFailedOpen.get(key);
     if (node == null) {
-      RegionFailedOpen newNode = new RegionFailedOpen(regionNode);
-      RegionFailedOpen oldNode = regionFailedOpen.putIfAbsent(key, newNode);
+      RegionFailedOperation newNode = new RegionFailedOperation(regionNode);
+      RegionFailedOperation oldNode = regionFailedOpen.putIfAbsent(key, newNode);
       node = oldNode != null ? oldNode : newNode;
     }
     return node;
   }
 
-  public RegionFailedOpen getFailedOpen(final HRegionInfo regionInfo) {
+  public RegionFailedOperation getFailedOpen(final HRegionInfo regionInfo) {
     return regionFailedOpen.get(regionInfo.getRegionName());
   }
 
@@ -900,7 +903,47 @@ public class RegionStates {
     if (regionFailedOpen.isEmpty()) return Collections.emptyList();
 
     ArrayList<RegionState> regions = new ArrayList<RegionState>(regionFailedOpen.size());
-    for (RegionFailedOpen r: regionFailedOpen.values()) {
+    for (RegionFailedOperation r: regionFailedOpen.values()) {
+      regions.add(createRegionState(r.getRegionNode()));
+    }
+    return regions;
+  }
+
+  // ==========================================================================
+  //  Region FAILED_CLOSE helpers
+  // ==========================================================================
+  /**
+   * Returns the failed-close tracker for the region, registering a new one if none exists.
+   * @param regionNode the region whose close failed
+   * @return the tracker already registered for the region, or the newly created one
+   */
+  public RegionFailedOperation addToFailedClose(final RegionStateNode regionNode) {
+    final byte[] key = regionNode.getRegionInfo().getRegionName();
+    RegionFailedOperation node = regionFailedClose.get(key);
+    if (node == null) {
+      RegionFailedOperation newNode = new RegionFailedOperation(regionNode);
+      RegionFailedOperation oldNode = regionFailedClose.putIfAbsent(key, newNode);
+      node = oldNode != null ? oldNode : newNode;
+    }
+    return node;
+  }
+
+  /**
+   * Drops the failed-close tracker of the region, if any.
+   * @param regionInfo the region to forget
+   */
+  public void removeFromFailedClose(final HRegionInfo regionInfo) {
+    regionFailedClose.remove(regionInfo.getRegionName());
+  }
+
+  /**
+   * @return the state of every region currently tracked as failed to close
+   */
+  public List<RegionState> getRegionFailedClose() {
+    if (regionFailedClose.isEmpty()) return Collections.emptyList();
+
+    ArrayList<RegionState> regions = new ArrayList<RegionState>(regionFailedClose.size());
+    for (RegionFailedOperation r: regionFailedClose.values()) {
       regions.add(createRegionState(r.getRegionNode()));
     }
     return regions;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
index c6b7e4b..42e688e 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.NotServingRegionException;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.client.RetriesExhaustedException;
 import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
 import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
 import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
@@ -151,6 +152,23 @@ public class UnassignProcedure extends RegionTransitionProcedure {
     // by jumping to REGION_TRANSITION_DISPATCH
     throw new UnsupportedOperationException();
   }
+
+  /**
+   * Records one more failed attempt to close the region and reports whether the
+   * configured maximum number of unassign attempts has been reached.
+   * @param env the master procedure environment
+   * @param regionNode the region being unassigned
+   * @return true if the number of attempts so far is at or above the configured maximum
+   */
+  private boolean incrementAndCheckMaxAttempts(final MasterProcedureEnv env,
+      final RegionStateNode regionNode) {
+    final int retries = env.getAssignmentManager().getRegionStates().
+        addToFailedClose(regionNode).incrementAndGetRetries();
+    int max = env.getAssignmentManager().getUnassignMaxAttempts();
+    LOG.info("Retry=" + retries + " of max=" + max + "; " +
+        this + "; " + regionNode.toShortString());
+    return retries >= max;
+  }
 
   @Override
   protected boolean updateTransition(final MasterProcedureEnv env, final RegionStateNode regionNode)
@@ -172,7 +190,14 @@ public class UnassignProcedure extends RegionTransitionProcedure {
 
     // if we haven't started the operation yet, we can abort
     if (aborted.get() && regionNode.isInState(State.OPEN)) {
-      setAbortFailure(getClass().getSimpleName(), "abort requested");
+      if (incrementAndCheckMaxAttempts(env, regionNode)) {
+        // Out of attempts: mark the region FAILED_CLOSE and fail the procedure.
+        regionNode.setState(State.FAILED_CLOSE);
+        setFailure(getClass().getSimpleName(),
+          new RetriesExhaustedException("Max attempts exceeded"));
+      } else {
+        setAbortFailure(getClass().getSimpleName(), "abort requested");
+      }
       return false;
     }
 
@@ -195,6 +220,8 @@ public class UnassignProcedure extends RegionTransitionProcedure {
   protected void finishTransition(final MasterProcedureEnv env, final RegionStateNode regionNode)
       throws IOException {
     env.getAssignmentManager().markRegionAsClosed(regionNode);
+    // The region closed; forget any failed-close attempts recorded for it.
+    env.getAssignmentManager().getRegionStates().removeFromFailedClose(regionNode.getRegionInfo());
   }
 
   @Override