From 2a8eed9d6333dc5a03edbb3d5184ac55c84c66fa Mon Sep 17 00:00:00 2001 From: Andrew Purtell Date: Fri, 23 Feb 2018 16:05:49 -0800 Subject: [PATCH] Port the RIT FAILED_OPEN state hack from RSGroups to everywhere Because RSGroups can cause permanent RIT with regions in FAILED_OPEN state, we added logic to the master to enumerate RITs and retry assignment of regions in FAILED_OPEN state. This strategy can be applied generally to reduce operator involvement in cluster operations. Now an operator has to manually resolve FAILED_OPEN assignments but there is little risk in automatically retrying them after a while. If the reason the assignment failed has not cleared, the assignment will just fail again. --- .../hbase/rsgroup/RSGroupInfoManagerImpl.java | 73 ------------------- .../hadoop/hbase/master/FailedOpenRetryChore.java | 82 ++++++++++++++++++++++ .../org/apache/hadoop/hbase/master/HMaster.java | 11 +++ .../apache/hadoop/hbase/master/RegionStates.java | 5 ++ 4 files changed, 98 insertions(+), 73 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/FailedOpenRetryChore.java diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java index fb4b91d78d..e0d1cce47b 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java @@ -66,7 +66,6 @@ import org.apache.hadoop.hbase.constraint.ConstraintException; import org.apache.hadoop.hbase.coprocessor.MultiRowMutationEndpoint; import org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel; import org.apache.hadoop.hbase.master.MasterServices; -import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.ServerListener; import org.apache.hadoop.hbase.master.procedure.CreateTableProcedure; import org.apache.hadoop.hbase.master.procedure.ProcedurePrepareLatch; @@ -121,7 +120,6 @@ public class RSGroupInfoManagerImpl implements RSGroupInfoManager, ServerListene private volatile Set prevRSGroups; private RSGroupSerDe rsGroupSerDe; private DefaultServerUpdater defaultServerUpdater; - private FailedOpenUpdater failedOpenUpdater; private boolean isInit = false; public RSGroupInfoManagerImpl(MasterServices master) throws IOException { @@ -139,8 +137,6 @@ public class RSGroupInfoManagerImpl implements RSGroupInfoManager, ServerListene refresh(); defaultServerUpdater = new DefaultServerUpdater(this); Threads.setDaemonThreadRunning(defaultServerUpdater); - failedOpenUpdater = new FailedOpenUpdater(this); - Threads.setDaemonThreadRunning(failedOpenUpdater); master.getServerManager().registerListener(this); isInit = true; } @@ -492,7 +488,6 @@ public class RSGroupInfoManagerImpl implements RSGroupInfoManager, ServerListene @Override public void serverAdded(ServerName serverName) { defaultServerUpdater.serverChanged(); - failedOpenUpdater.serverChanged(); } @Override @@ -560,74 +555,6 @@ public class RSGroupInfoManagerImpl implements RSGroupInfoManager, ServerListene } } - private static class FailedOpenUpdater extends Thread { - private static final Log LOG = LogFactory.getLog(FailedOpenUpdater.class); - - private final RSGroupInfoManagerImpl mgr; - private final long waitInterval; - private volatile boolean hasChanged = false; - - public FailedOpenUpdater(RSGroupInfoManagerImpl mgr) { - this.mgr = mgr; - this.waitInterval = mgr.master.getConfiguration().getLong(REASSIGN_WAIT_INTERVAL_KEY, - DEFAULT_REASSIGN_WAIT_INTERVAL); - setName(FailedOpenUpdater.class.getName()+"-" + mgr.master.getServerName()); - setDaemon(true); - } - - @Override - public void run() { - while (!mgr.master.isAborted() && !mgr.master.isStopped()) { - boolean interrupted = false; - try { - synchronized (this) { - while (!hasChanged) { - wait(); - } - hasChanged = false; - } - } catch (InterruptedException e) { - LOG.warn("Interrupted", e); - interrupted = true; - } - if (mgr.master.isAborted() || mgr.master.isStopped() || interrupted) { - continue; - } - - // First, wait a while in case more servers are about to rejoin the cluster - try { - Thread.sleep(waitInterval); - } catch (InterruptedException e) { - LOG.warn("Interrupted", e); - } - if (mgr.master.isAborted() || mgr.master.isStopped()) { - continue; - } - - // Kick all regions in FAILED_OPEN state - List failedAssignments = Lists.newArrayList(); - for (RegionState state: mgr.master.getAssignmentManager() - .getRegionStates().getRegionsInTransition().values()) { - if (state.isFailedOpen()) { - failedAssignments.add(state.getRegion()); - } - } - for (HRegionInfo region: failedAssignments) { - LOG.info("Retrying assignment of " + region); - mgr.master.getAssignmentManager().unassign(region); - } - } - } - - // Only called for server additions - public void serverChanged() { - synchronized (this) { - hasChanged = true; - this.notify(); - } - } - } - @Override public void waiting() { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/FailedOpenRetryChore.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/FailedOpenRetryChore.java new file mode 100644 index 0000000000..45baf4bb5c --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/FailedOpenRetryChore.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.master; + +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.ScheduledChore; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.classification.InterfaceAudience; +import org.apache.hadoop.hbase.master.HMaster; + +import com.google.common.collect.Lists; + +/** + * Chore that will attempt to redeploy regions in FAILED_OPEN state. + */ +@InterfaceAudience.Private +public class FailedOpenRetryChore extends ScheduledChore implements ServerListener { + + public static final String FAILED_OPEN_RETRY_KEY = "hbase.assignment.failed.open.retry.period"; + public static final int FAILED_OPEN_RETRY_DEFAULT = 60000; + + private static final Log LOG = LogFactory.getLog(FailedOpenRetryChore.class); + private final HMaster master; + + public FailedOpenRetryChore(HMaster master) { + super(master.getServerName() + "-FailedOpenRetryChore", master, + master.getConfiguration().getInt(FAILED_OPEN_RETRY_KEY, FAILED_OPEN_RETRY_DEFAULT)); + this.master = master; + } + + @Override + protected void chore() { + if (master.isAborted() || master.isStopped()) { + return; + } + // Kick all regions in FAILED_OPEN state + List failedOpenRegions = Lists.newArrayList(); + for (RegionState s: master.getAssignmentManager().getRegionStates().getAllRegions()) { + if (s.isFailedOpen()) { + failedOpenRegions.add(s); + } + } + for (RegionState s: failedOpenRegions) { + LOG.info("Retrying assignment of " + s.toDescriptiveString()); + master.getAssignmentManager().unassign(s.getRegion()); + } + } + + @Override + public void serverAdded(ServerName serverName) { + // A server was added, run our chore right away as this may have healed a + // constraint (like rsgroups) preventing assignment + triggerNow(); + } + + @Override + public void serverRemoved(ServerName serverName) { + } + + @Override + public void waiting() { + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 6e7ff7b900..2f9c58a633 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -325,6 +325,8 @@ public class HMaster extends HRegionServer implements MasterServices, Server { private LogCleaner logCleaner; private HFileCleaner hfileCleaner; + private FailedOpenRetryChore failedOpenUpdaterChore; + MasterCoprocessorHost cpHost; private final boolean preLoadTableDescriptors; @@ -848,6 +850,12 @@ public class HMaster extends HRegionServer implements MasterServices, Server { getChoreService().scheduleChore(normalizerChore); this.catalogJanitorChore = new CatalogJanitor(this, this); getChoreService().scheduleChore(catalogJanitorChore); + if (getConfiguration().getInt(FailedOpenRetryChore.FAILED_OPEN_RETRY_KEY, + FailedOpenRetryChore.FAILED_OPEN_RETRY_DEFAULT) > 0) { + this.failedOpenUpdaterChore = new FailedOpenRetryChore(this); + serverManager.registerListener(failedOpenUpdaterChore); + getChoreService().scheduleChore(failedOpenUpdaterChore); + } // Do Metrics periodically periodicDoMetricsChore = new PeriodicDoMetrics(msgInterval, this); @@ -1311,6 +1319,9 @@ public class HMaster extends HRegionServer implements MasterServices, Server { if (this.periodicDoMetricsChore != null) { periodicDoMetricsChore.cancel(); } + if (this.failedOpenUpdaterChore != null) { + failedOpenUpdaterChore.cancel(true); + } } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java index 661efdc613..4984b06612 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.master; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -1141,6 +1142,10 @@ public class RegionStates { return getRegionState(hri.getEncodedName()); } + protected Collection getAllRegions() { + return Collections.unmodifiableCollection(regionStates.values()); + } + /** * Returns a clone of region assignments per server * @return a Map of ServerName to a List of HRegionInfo's -- 2.15.1