Index: src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (revision 1547390) +++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (working copy) @@ -190,6 +190,8 @@ // populated during master failover. private Map failoverProcessedRegions = new HashMap(); + + private final Set failedOpenRegions = new HashSet(); /** * Constructs a new assignment manager. @@ -872,8 +874,16 @@ // When there are more than one region server a new RS is selected as the // destination and the same is updated in the regionplan. (HBASE-5546) getRegionPlan(regionState, sn, true); - this.executorService.submit(new ClosedRegionHandler(master, - this, regionState.getRegion())); + synchronized (failedOpenRegions) { + if (!failedOpenRegions.contains(hri)) { + failedOpenRegions.add(hri); + } else { + LOG.debug("Already processing region "+hri+" from FAILED_OPEN state"); + break; + } + } + new ClosedRegionHandler(master, + this, regionState.getRegion()).process(); break; case RS_ZK_REGION_OPENING: @@ -1682,169 +1692,180 @@ boolean regionAlreadyInTransitionException = false; boolean serverNotRunningYet = false; long maxRegionServerStartupWaitTime = -1; - for (int i = 0; i < this.maximumAssignmentAttempts; i++) { - int versionOfOfflineNode = -1; - if (setOfflineInZK) { - // get the version of the znode after setting it to OFFLINE. - // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE - versionOfOfflineNode = setOfflineInZooKeeper(state, hijack, - regionAlreadyInTransitionException); - if(versionOfOfflineNode != -1){ - if (isDisabledorDisablingRegionInRIT(region)) { - return; - } - // In case of assign from EnableTableHandler table state is ENABLING. Any how - // EnableTableHandler will set ENABLED after assigning all the table regions. If we - // try to set to ENABLED directly then client api may think ENABLE table is completed. - // When we have a case like all the regions are added directly into META and we call - // assignRegion then we need to make the table ENABLED. Hence in such case the table - // will not be in ENABLING or ENABLED state. - String tableName = region.getTableNameAsString(); - if (!zkTable.isEnablingTable(tableName) && !zkTable.isEnabledTable(tableName)) { - LOG.debug("Setting table " + tableName + " to ENABLED state."); - setEnabledTable(region); - } - } - } - - if (setOfflineInZK && versionOfOfflineNode == -1) { - return; - } - - if (this.master.isStopped()) { - LOG.debug("Server stopped; skipping assign of " + state); - return; - } - RegionPlan plan = getRegionPlan(state, !regionAlreadyInTransitionException - && !serverNotRunningYet && forceNewPlan); - if (plan == null) { - LOG.debug("Unable to determine a plan to assign " + state); - this.timeoutMonitor.setAllRegionServersOffline(true); - return; // Should get reassigned later when RIT times out. - } - try { - LOG.debug("Assigning region " + state.getRegion().getRegionNameAsString() + - " to " + plan.getDestination().toString()); - long currentOfflineTimeStamp = state.getStamp(); - RegionOpeningState regionOpenState = serverManager.sendRegionOpen(plan.getDestination(), - state.getRegion(), versionOfOfflineNode); - if (regionOpenState == RegionOpeningState.OPENED) { - // Transition RegionState to PENDING_OPEN - // Check if already the offline state has been updated due to a - // failure in prev assign - if (state.isOffline() && currentOfflineTimeStamp != state.getStamp()) { - return; - } - if (state.isOffline() && !state.isOpening()) { - state.update(RegionState.State.PENDING_OPEN, - System.currentTimeMillis(), plan.getDestination()); - } - if (state.isOpening()) return; - if (state.isOpened()) return; - } else if (regionOpenState == RegionOpeningState.ALREADY_OPENED) { - // Remove region from in-memory transition and unassigned node from ZK - // While trying to enable the table the regions of the table were - // already enabled. - LOG.debug("ALREADY_OPENED region " + state.getRegion().getRegionNameAsString() + - " to " + plan.getDestination().toString()); - String encodedRegionName = state.getRegion() - .getEncodedName(); - try { - ZKAssign.deleteOfflineNode(master.getZooKeeper(), encodedRegionName); - } catch (KeeperException.NoNodeException e) { - if(LOG.isDebugEnabled()){ - LOG.debug("The unassigned node "+encodedRegionName+" doesnot exist."); + try { + for (int i = 0; i < this.maximumAssignmentAttempts; i++) { + int versionOfOfflineNode = -1; + if (setOfflineInZK) { + // get the version of the znode after setting it to OFFLINE. + // versionOfOfflineNode will be -1 if the znode was not set to OFFLINE + versionOfOfflineNode = setOfflineInZooKeeper(state, hijack, + regionAlreadyInTransitionException); + if (versionOfOfflineNode != -1) { + if (isDisabledorDisablingRegionInRIT(region)) { + return; } - } catch (KeeperException e) { - master.abort( - "Error deleting OFFLINED node in ZK for transition ZK node (" - + encodedRegionName + ")", e); + // In case of assign from EnableTableHandler table state is + // ENABLING. Any how + // EnableTableHandler will set ENABLED after assigning all the table + // regions. If we + // try to set to ENABLED directly then client api may think ENABLE + // table is completed. + // When we have a case like all the regions are added directly into + // META and we call + // assignRegion then we need to make the table ENABLED. Hence in + // such case the table + // will not be in ENABLING or ENABLED state. + String tableName = region.getTableNameAsString(); + if (!zkTable.isEnablingTable(tableName) && !zkTable.isEnabledTable(tableName)) { + LOG.debug("Setting table " + tableName + " to ENABLED state."); + setEnabledTable(region); + } } - synchronized (this.regionsInTransition) { - this.regionsInTransition.remove(plan.getRegionInfo() - .getEncodedName()); - } - synchronized (this.regions) { - this.regions.put(plan.getRegionInfo(), plan.getDestination()); - } } - break; - } catch (Throwable t) { - if (t instanceof RemoteException) { - t = ((RemoteException) t).unwrapRemoteException(); + + if (setOfflineInZK && versionOfOfflineNode == -1) { + return; } - regionAlreadyInTransitionException = false; - serverNotRunningYet = false; - if (t instanceof RegionAlreadyInTransitionException) { - regionAlreadyInTransitionException = true; - if (LOG.isDebugEnabled()) { - LOG.debug("Failed assignment in: " + plan.getDestination() + " due to " - + t.getMessage()); + + if (this.master.isStopped()) { + LOG.debug("Server stopped; skipping assign of " + state); + return; + } + RegionPlan plan = getRegionPlan(state, !regionAlreadyInTransitionException + && !serverNotRunningYet && forceNewPlan); + if (plan == null) { + LOG.debug("Unable to determine a plan to assign " + state); + this.timeoutMonitor.setAllRegionServersOffline(true); + return; // Should get reassigned later when RIT times out. + } + try { + LOG.debug("Assigning region " + state.getRegion().getRegionNameAsString() + " to " + + plan.getDestination().toString()); + long currentOfflineTimeStamp = state.getStamp(); + RegionOpeningState regionOpenState = serverManager.sendRegionOpen(plan.getDestination(), + state.getRegion(), versionOfOfflineNode); + if (regionOpenState == RegionOpeningState.OPENED) { + // Transition RegionState to PENDING_OPEN + // Check if already the offline state has been updated due to a + // failure in prev assign + if (state.isOffline() && currentOfflineTimeStamp != state.getStamp()) { + return; + } + if (state.isOffline() && !state.isOpening()) { + state.update(RegionState.State.PENDING_OPEN, System.currentTimeMillis(), + plan.getDestination()); + } + if (state.isOpening()) + return; + if (state.isOpened()) + return; + } else if (regionOpenState == RegionOpeningState.ALREADY_OPENED) { + // Remove region from in-memory transition and unassigned node from + // ZK + // While trying to enable the table the regions of the table were + // already enabled. + LOG.debug("ALREADY_OPENED region " + state.getRegion().getRegionNameAsString() + " to " + + plan.getDestination().toString()); + String encodedRegionName = state.getRegion().getEncodedName(); + try { + ZKAssign.deleteOfflineNode(master.getZooKeeper(), encodedRegionName); + } catch (KeeperException.NoNodeException e) { + if (LOG.isDebugEnabled()) { + LOG.debug("The unassigned node " + encodedRegionName + " doesnot exist."); + } + } catch (KeeperException e) { + master.abort("Error deleting OFFLINED node in ZK for transition ZK node (" + + encodedRegionName + ")", e); + } + synchronized (this.regionsInTransition) { + this.regionsInTransition.remove(plan.getRegionInfo().getEncodedName()); + } + synchronized (this.regions) { + this.regions.put(plan.getRegionInfo(), plan.getDestination()); + } } - } else if (t instanceof ServerNotRunningYetException) { - if (maxRegionServerStartupWaitTime < 0) { - maxRegionServerStartupWaitTime = System.currentTimeMillis() - + this.master.getConfiguration().getLong("hbase.regionserver.rpc.startup.waittime", - 60000); + break; + } catch (Throwable t) { + if (t instanceof RemoteException) { + t = ((RemoteException) t).unwrapRemoteException(); } - try { - long now = System.currentTimeMillis(); - if (now < maxRegionServerStartupWaitTime) { - LOG.debug("Server is not yet up; waiting up to " - + (maxRegionServerStartupWaitTime - now) + "ms", t); - serverNotRunningYet = true; - Thread.sleep(100); - i--; // reset the try count - } else { - LOG.debug("Server is not up for a while; try a new one", t); + regionAlreadyInTransitionException = false; + serverNotRunningYet = false; + if (t instanceof RegionAlreadyInTransitionException) { + regionAlreadyInTransitionException = true; + if (LOG.isDebugEnabled()) { + LOG.debug("Failed assignment in: " + plan.getDestination() + " due to " + + t.getMessage()); } - } catch (InterruptedException ie) { - LOG.warn("Failed to assign " + state.getRegion().getRegionNameAsString() - + " since interrupted", ie); - Thread.currentThread().interrupt(); + } else if (t instanceof ServerNotRunningYetException) { + if (maxRegionServerStartupWaitTime < 0) { + maxRegionServerStartupWaitTime = System.currentTimeMillis() + + this.master.getConfiguration().getLong( + "hbase.regionserver.rpc.startup.waittime", 60000); + } + try { + long now = System.currentTimeMillis(); + if (now < maxRegionServerStartupWaitTime) { + LOG.debug("Server is not yet up; waiting up to " + + (maxRegionServerStartupWaitTime - now) + "ms", t); + serverNotRunningYet = true; + Thread.sleep(100); + i--; // reset the try count + } else { + LOG.debug("Server is not up for a while; try a new one", t); + } + } catch (InterruptedException ie) { + LOG.warn("Failed to assign " + state.getRegion().getRegionNameAsString() + + " since interrupted", ie); + Thread.currentThread().interrupt(); + return; + } + } else if (t instanceof java.net.SocketTimeoutException + && this.serverManager.isServerOnline(plan.getDestination())) { + LOG.warn("Call openRegion() to " + plan.getDestination() + + " has timed out when trying to assign " + region.getRegionNameAsString() + + ", but the region might already be opened on " + plan.getDestination() + ".", t); return; } - } else if (t instanceof java.net.SocketTimeoutException - && this.serverManager.isServerOnline(plan.getDestination())) { - LOG.warn("Call openRegion() to " + plan.getDestination() - + " has timed out when trying to assign " - + region.getRegionNameAsString() - + ", but the region might already be opened on " - + plan.getDestination() + ".", t); - return; + LOG.warn( + "Failed assignment of " + + state.getRegion().getRegionNameAsString() + + " to " + + plan.getDestination() + + ", trying to assign " + + (regionAlreadyInTransitionException || serverNotRunningYet ? "to the same region server because of " + + "RegionAlreadyInTransitionException/ServerNotRunningYetException;" + : "elsewhere instead; ") + "retry=" + i, t); + // Clean out plan we failed execute and one that doesn't look like + // it'll + // succeed anyways; we need a new plan! + // Transition back to OFFLINE + state.update(RegionState.State.OFFLINE); + // If region opened on destination of present plan, reassigning to new + // RS may cause double assignments. In case of + // RegionAlreadyInTransitionException + // reassigning to same RS. + RegionPlan newPlan = plan; + if (!regionAlreadyInTransitionException && !serverNotRunningYet) { + // Force a new plan and reassign. Will return null if no servers. + // The new plan could be the same as the existing plan since we + // don't + // exclude the server of the original plan, which should not be + // excluded since it could be the only server up now. + newPlan = getRegionPlan(state, true); + } + if (newPlan == null) { + this.timeoutMonitor.setAllRegionServersOffline(true); + LOG.warn("Unable to find a viable location to assign region " + + state.getRegion().getRegionNameAsString()); + return; + } } - LOG.warn("Failed assignment of " - + state.getRegion().getRegionNameAsString() - + " to " - + plan.getDestination() - + ", trying to assign " - + (regionAlreadyInTransitionException || serverNotRunningYet - ? "to the same region server because of " - + "RegionAlreadyInTransitionException/ServerNotRunningYetException;" - : "elsewhere instead; ") - + "retry=" + i, t); - // Clean out plan we failed execute and one that doesn't look like it'll - // succeed anyways; we need a new plan! - // Transition back to OFFLINE - state.update(RegionState.State.OFFLINE); - // If region opened on destination of present plan, reassigning to new - // RS may cause double assignments. In case of RegionAlreadyInTransitionException - // reassigning to same RS. - RegionPlan newPlan = plan; - if (!regionAlreadyInTransitionException && !serverNotRunningYet) { - // Force a new plan and reassign. Will return null if no servers. - // The new plan could be the same as the existing plan since we don't - // exclude the server of the original plan, which should not be - // excluded since it could be the only server up now. - newPlan = getRegionPlan(state, true); - } - if (newPlan == null) { - this.timeoutMonitor.setAllRegionServersOffline(true); - LOG.warn("Unable to find a viable location to assign region " + - state.getRegion().getRegionNameAsString()); - return; - } } + } finally { + synchronized (failedOpenRegions) { + failedOpenRegions.remove(region); + } } }