### Eclipse Workspace Patch 1.0
#P 0.94
Index: src/main/java/org/apache/hadoop/hbase/master/handler/OpenedRegionHandler.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/handler/OpenedRegionHandler.java	(revision 1559913)
+++ src/main/java/org/apache/hadoop/hbase/master/handler/OpenedRegionHandler.java	(working copy)
@@ -101,6 +101,7 @@
     if (regionState != null
         && regionState.getState().equals(RegionState.State.OPEN)) {
       openedNodeDeleted = deleteOpenedNode(expectedVersion);
+      this.assignmentManager.removeAssignmentCounter(regionInfo.getRegionNameAsString());
       if (!openedNodeDeleted) {
         LOG.error("The znode of region " + regionInfo.getRegionNameAsString()
             + " could not be deleted.");
Index: src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java	(revision 1559913)
+++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java	(working copy)
@@ -42,6 +42,7 @@
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 
+import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -122,8 +123,20 @@
    * Maximum times we recurse an assignment. See below in {@link #assign()}.
    */
   private final int maximumAssignmentAttempts;
+
+  /**
+   * Per-region assignment attempt counter.
+   * Key: region name.
+   * Value: number of assignment attempts recorded for that region.
+   */
+  private final Map<String, Integer> assignmentCounter;
 
   /**
+   * Assign attempts allowed per server; past servers x this value a region is set OFFLINE.
+   */
+  private final int maxAssignFailedCount;
+
+  /**
    * Regions currently in transition. Map of encoded region names to the master
    * in-memory state for that region.
    */
@@ -211,7 +224,9 @@
     this.executorService = service;
     this.regionsToReopen = Collections.synchronizedMap
                            (new HashMap<String, HRegionInfo> ());
+    this.assignmentCounter = Collections.synchronizedMap(new HashMap<String, Integer>());
     Configuration conf = master.getConfiguration();
+    this.maxAssignFailedCount = conf.getInt("hbase.master.assignment.maxassignnum", 10);
     this.timeoutMonitor = new TimeoutMonitor(
       conf.getInt("hbase.master.assignment.timeoutmonitor.period", 10000),
       master, serverManager,
@@ -1725,6 +1740,8 @@
       try {
         LOG.debug("Assigning region " + state.getRegion().getRegionNameAsString() +
           " to " + plan.getDestination().toString());
+        // Count this assignment attempt for the region.
+        incAssignmentCounter(state.getRegion().getRegionNameAsString());
         long currentOfflineTimeStamp = state.getStamp();
         RegionOpeningState regionOpenState = serverManager.sendRegionOpen(plan.getDestination(),
             state.getRegion(), versionOfOfflineNode);
@@ -1848,6 +1865,38 @@
       }
     }
   }
+
+  /**
+   * Records one more assignment attempt for the given region.
+   * Blank region names are ignored.
+   * @param regionName region name (callers pass getRegionNameAsString())
+   */
+  private void incAssignmentCounter(String regionName) {
+    if (StringUtils.isBlank(regionName)) {
+      return;
+    }
+    synchronized (this.assignmentCounter) {
+      Integer previousCount = assignmentCounter.get(regionName);
+      int currentAssignCount = (previousCount == null) ? 1 : previousCount + 1;
+      assignmentCounter.put(regionName, currentAssignCount);
+      LOG.info("Try assigning region " + regionName + " at "
+        + currentAssignCount + " times. ");
+    }
+  }
+
+  /**
+   * Clears the assignment attempt counter of a region once it is online.
+   * Blank or untracked region names are ignored.
+   * @param regionName region name (callers pass getRegionNameAsString())
+   */
+  public void removeAssignmentCounter(String regionName) {
+    if (StringUtils.isBlank(regionName)) {
+      return;
+    }
+    if (assignmentCounter.remove(regionName) != null) {
+      LOG.info("remove assign counter for " + regionName);
+    }
+  }
 
   private static boolean isAssigningSplitParentRegion(final HRegionInfo region) {
     if (region.isSplitParent()) {
@@ -3157,6 +3206,20 @@
               actOnTimeOut(regionState);
             }
           }
+          String regionName = regionState.getRegion().getRegionNameAsString();
+          if (StringUtils.isNotBlank(regionName) && !regionState.getRegion().isMetaTable()) {
+            // Single lookup: a concurrent remove between two get() calls would unbox null.
+            Integer currentAssignNum = assignmentCounter.get(regionName);
+            // Use at least one server so an empty server list does not make the limit zero.
+            int maxAssign = Math.max(1, servers.size()) * maxAssignFailedCount;
+            if (currentAssignNum != null && currentAssignNum >= maxAssign) {
+              assignmentCounter.remove(regionName);
+              regionOffline(regionState.getRegion());
+              LOG.error("Assign region too long, give up to assign region "
+                + regionState);
+              return;
+            }
+          }
         }
       }
       setAllRegionServersOffline(allRSsOffline);