From 7987c7897ce3d5894b70b9226eef6a158784ae15 Mon Sep 17 00:00:00 2001
From: stack
Date: Tue, 17 Mar 2020 22:09:14 -0700
Subject: [PATCH] HBASE-23984 [Flakey Tests] TestMasterAbortAndRSGotKilled
fails in teardown

Remove Region from RegionServer#regionsInTransitionInRS Map
even on exception (because server is aborted).
hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Only dump metrics once on abort. Refactor the closeRegion method, removing overlap with
CloseRegionHandler (i.e. the coprocessor preClose call).
hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java
Refactor so we remove Region from regionsInTransitionInRS on success or failure.
hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java
Cleanup, but also made it so UnassignRegionHandler can use it to close Regions.
hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java
Use CloseRegionHandler to close Region rather than dupe code.
hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java
Bit more info when thread dumping...
TestRegionServerNoMaster was changed because it presumed a behavior that is
no longer present (that CloseRegionHandler would leave a dangling entry in
regionsInTransitionInRS as it used to).
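The pattern the handler changes converge on, shown as a standalone sketch (illustrative
only, not HBase code; the class and method names below are made up): register the
transition with putIfAbsent before any work starts, and deregister in a finally block so
the entry cannot dangle even when the work throws (for example because the server is
aborting):

  import java.util.concurrent.ConcurrentHashMap;
  import java.util.concurrent.ConcurrentMap;

  public class RegionsInTransitionSketch {
    // TRUE means an open is in progress, FALSE means a close is in progress.
    private final ConcurrentMap<String, Boolean> regionsInTransition = new ConcurrentHashMap<>();

    boolean closeRegion(String encodedRegionName, Runnable closeWork) {
      // Register the close before doing any work.
      Boolean previous = regionsInTransition.putIfAbsent(encodedRegionName, Boolean.FALSE);
      if (previous != null) {
        // An open (TRUE) or close (FALSE) is already in progress; the real handlers treat
        // each case differently, this sketch just declines to double-register.
        return false;
      }
      try {
        closeWork.run();
        return true;
      } finally {
        // Deregister whether the close succeeded or threw; nothing dangles on abort.
        regionsInTransition.remove(encodedRegionName, Boolean.FALSE);
      }
    }
  }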
---
.../hbase/regionserver/HRegionServer.java | 127 +++++++--------
.../handler/AssignRegionHandler.java | 71 ++++-----
.../handler/CloseMetaHandler.java | 11 +-
.../handler/CloseRegionHandler.java | 138 +++++++++--------
.../handler/UnassignRegionHandler.java | 145 +++++++++---------
.../hadoop/hbase/util/JVMClusterUtil.java | 9 +-
.../master/TestMasterAbortAndRSGotKilled.java | 22 +--
.../TestRegionServerNoMaster.java | 7 -
8 files changed, 276 insertions(+), 254 deletions(-)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index fc0b69345c..dbee9e40fa 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -22,6 +22,7 @@ import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_SPLIT_WAL_MAX_SPL
import static org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_COORDINATED_BY_ZK;
import static org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_MAX_SPLITTER;
import static org.apache.hadoop.hbase.util.DNS.RS_HOSTNAME_KEY;
+import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.IOException;
import java.lang.Thread.UncaughtExceptionHandler;
import java.lang.management.MemoryType;
@@ -53,6 +54,7 @@ import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReentrantReadWriteLock;
+import java.util.stream.Collectors;
import javax.management.MalformedObjectNameException;
import javax.servlet.http.HttpServlet;
import org.apache.commons.lang3.RandomUtils;
@@ -247,9 +249,8 @@ public class HRegionServer extends HasThread implements
public static boolean TEST_SKIP_REPORTING_TRANSITION = false;
/**
- * A map from RegionName to current action in progress. Boolean value indicates:
- * true - if open region action in progress
- * false - if close region action in progress
+ * A map from encoded region name as bytes to current action in progress. Boolean value indicates:
+ * TRUE if open region action in progress, and FALSE if close region action in progress
*/
private final ConcurrentMap<byte[], Boolean> regionsInTransitionInRS =
new ConcurrentSkipListMap<>(Bytes.BYTES_COMPARATOR);
@@ -1010,7 +1011,7 @@ public class HRegionServer extends HasThread implements
} else if (!this.stopping) {
this.stopping = true;
LOG.info("Closing user regions");
- closeUserRegions(this.abortRequested);
+ closeUserRegions(isAborted());
} else {
boolean allUserRegionsOffline = areAllUserRegionsOffline();
if (allUserRegionsOffline) {
@@ -1026,7 +1027,7 @@ public class HRegionServer extends HasThread implements
// Make sure all regions have been closed -- some regions may
// have not got it because we were splitting at the time of
// the call to closeUserRegions.
- closeUserRegions(this.abortRequested);
+ closeUserRegions(isAborted());
}
LOG.debug("Waiting on " + getOnlineRegionsAsPrintableString());
}
@@ -1081,18 +1082,18 @@ public class HRegionServer extends HasThread implements
// Stop the snapshot and other procedure handlers, forcefully killing all running tasks
if (rspmHost != null) {
- rspmHost.stop(this.abortRequested || this.killed);
+ rspmHost.stop(isAborted() || this.killed);
}
if (this.killed) {
// Just skip out w/o closing regions. Used when testing.
- } else if (abortRequested) {
+ } else if (isAborted()) {
if (this.dataFsOk) {
- closeUserRegions(abortRequested); // Don't leave any open file handles
+ closeUserRegions(isAborted()); // Don't leave any open file handles
}
LOG.info("aborting server " + this.serverName);
} else {
- closeUserRegions(abortRequested);
+ closeUserRegions(isAborted());
LOG.info("stopping server " + this.serverName);
}
@@ -1108,17 +1109,17 @@ public class HRegionServer extends HasThread implements
// Closing the compactSplit thread before closing meta regions
if (!this.killed && containsMetaTableRegions()) {
- if (!abortRequested || this.dataFsOk) {
+ if (!isAborted() || this.dataFsOk) {
if (this.compactSplitThread != null) {
this.compactSplitThread.join();
this.compactSplitThread = null;
}
- closeMetaTableRegions(abortRequested);
+ closeMetaTableRegions(isAborted());
}
}
if (!this.killed && this.dataFsOk) {
- waitOnAllRegionsToClose(abortRequested);
+ waitOnAllRegionsToClose(isAborted());
LOG.info("stopping server " + this.serverName + "; all regions closed.");
}
@@ -1133,7 +1134,7 @@ public class HRegionServer extends HasThread implements
// flag may be changed when closing regions throws exception.
if (this.dataFsOk) {
- shutdownWAL(!abortRequested);
+ shutdownWAL(!isAborted());
}
// Make sure the proxy is down.
@@ -1457,6 +1458,9 @@ public class HRegionServer extends HasThread implements
" because some regions failed closing");
}
break;
+ } else {
+ LOG.trace("Waiting on {}", this.regionsInTransitionInRS.entrySet().stream().
+ map(e -> Bytes.toString(e.getKey())).collect(Collectors.joining(", ")));
}
if (sleep(200)) {
interrupted = true;
@@ -2446,6 +2450,12 @@ public class HRegionServer extends HasThread implements
*/
@Override
public void abort(String reason, Throwable cause) {
+ if (isAborted()) {
+ // Don't run multiple aborts; fills logs w/ spew.
+ LOG.info("Abort called but server already aborted; by-passing; abort invoked because {}",
+ reason, cause);
+ return;
+ }
String msg = "***** ABORTING region server " + this + ": " + reason + " *****";
if (cause != null) {
LOG.error(HBaseMarkers.FATAL, msg, cause);
@@ -3185,71 +3195,61 @@ public class HRegionServer extends HasThread implements
* If a close was in progress, this new request will be ignored, and an exception thrown.
*
*
- * @param encodedName Region to close
* @param abort True if we are aborting
+ * @param destination Where the Region is being moved to; maybe null if unknown.
* @return True if closed a region.
* @throws NotServingRegionException if the region is not online
*/
- protected boolean closeRegion(String encodedName, final boolean abort, final ServerName sn)
- throws NotServingRegionException {
- //Check for permissions to close.
- HRegion actualRegion = this.getRegion(encodedName);
- // Can be null if we're calling close on a region that's not online
- if ((actualRegion != null) && (actualRegion.getCoprocessorHost() != null)) {
- try {
- actualRegion.getCoprocessorHost().preClose(false);
- } catch (IOException exp) {
- LOG.warn("Unable to close region: the coprocessor launched an error ", exp);
- return false;
- }
- }
-
- // previous can come back 'null' if not in map.
- final Boolean previous = this.regionsInTransitionInRS.putIfAbsent(Bytes.toBytes(encodedName),
- Boolean.FALSE);
-
+ protected boolean closeRegion(String encodedRegionName, final boolean abort,
+ @Nullable final ServerName destination) throws NotServingRegionException {
+ // TODO: Check for perms to close.
+ // Ideally we'd shove a bunch of the below logic into CloseRegionHandler only we have to do
+ // some handling inline w/ the request such as throwing NotServingRegionException if we
+ // are not hosting so client gets the message directly; can't do this from inside an
+ // executor service.
+ Region region = getRegion(encodedRegionName);
+ if (region == null) {
+ // Changed behavior: previously we'd proceed even though the Region was not online.
+ // The master deletes the znode when it receives this exception.
+ throw new NotServingRegionException(encodedRegionName);
+ }
+ byte [] encodedRegionNameAsBytes = Bytes.toBytes(encodedRegionName);
+ // This getting from this.regionsInTransitionInRS and checking the return is done in a few
+ // places, mostly over in Handlers; this.regionsInTransitionInRS is how we ensure we don't
+ // clash opens/closes or duplicate closes when one is ongoing. FYI.
+ final Boolean previous = this.regionsInTransitionInRS.
+ putIfAbsent(encodedRegionNameAsBytes, Boolean.FALSE);
if (Boolean.TRUE.equals(previous)) {
- LOG.info("Received CLOSE for the region:" + encodedName + " , which we are already " +
- "trying to OPEN. Cancelling OPENING.");
- if (!regionsInTransitionInRS.replace(Bytes.toBytes(encodedName), previous, Boolean.FALSE)) {
+ LOG.info("Received CLOSE for {} which is OPENING. Cancelling OPENING.", encodedRegionName);
+ if (!this.regionsInTransitionInRS.replace(encodedRegionNameAsBytes, previous,
+ Boolean.FALSE)) {
// The replace failed. That should be an exceptional case, but theoretically it can happen.
- // We're going to try to do a standard close then.
- LOG.warn("The opening for region " + encodedName + " was done before we could cancel it." +
- " Doing a standard close now");
- return closeRegion(encodedName, abort, sn);
+ // We're going to try to rerun the close.
+ LOG.warn("The opening of {} was done before we could cancel it; running a new close now",
+ encodedRegionName);
+ return closeRegion(encodedRegionName, abort, destination);
}
// Let's get the region from the online region list again
- actualRegion = this.getRegion(encodedName);
- if (actualRegion == null) { // If already online, we still need to close it.
- LOG.info("The opening previously in progress has been cancelled by a CLOSE request.");
+ region = this.getRegion(encodedRegionName);
+ if (region == null) {
+ LOG.info("The opening of {} has been cancelled by a CLOSE request", encodedRegionName);
// The master deletes the znode when it receives this exception.
- throw new NotServingRegionException("The region " + encodedName +
- " was opening but not yet served. Opening is cancelled.");
+ throw new NotServingRegionException(encodedRegionName +
+ " was opening but not yet served; cancelled by close");
}
} else if (previous == null) {
- LOG.info("Received CLOSE for {}", encodedName);
+ LOG.info("Received CLOSE for {}", encodedRegionName);
} else if (Boolean.FALSE.equals(previous)) {
- LOG.info("Received CLOSE for the region: " + encodedName +
- ", which we are already trying to CLOSE, but not completed yet");
+ LOG.info("Received CLOSE for {} which we are already trying to CLOSE", encodedRegionName);
return true;
}
+ // Above we've registered our CLOSE with this.regionsInTransitionInRS. Now run close handler.
+ // The close handler will remove the entry from this.regionsInTransitionInRS when done
+ // (awkward but close handler runs in a different thread).
+ this.executorService.submit(region.getRegionInfo().isMetaRegion()?
+ new CloseMetaHandler(this, region.getRegionInfo(), abort, destination):
+ new CloseRegionHandler(this, region.getRegionInfo(), abort, destination));
- if (actualRegion == null) {
- LOG.debug("Received CLOSE for a region which is not online, and we're not opening.");
- this.regionsInTransitionInRS.remove(Bytes.toBytes(encodedName));
- // The master deletes the znode when it receives this exception.
- throw new NotServingRegionException("The region " + encodedName +
- " is not online, and is not opening.");
- }
-
- CloseRegionHandler crh;
- final RegionInfo hri = actualRegion.getRegionInfo();
- if (hri.isMetaRegion()) {
- crh = new CloseMetaHandler(this, this, hri, abort);
- } else {
- crh = new CloseRegionHandler(this, this, hri, abort, sn);
- }
- this.executorService.submit(crh);
return true;
}
@@ -3833,6 +3833,7 @@ public class HRegionServer extends HasThread implements
*
* @param procId the id of the open/close region procedure
* @return true if the procedure can be submitted.
+ * @see #finishRegionProcedure
*/
boolean submitRegionProcedure(long procId) {
if (procId == -1) {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java
index 76e8f80271..8cdd404760 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -34,7 +34,6 @@ import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
/**
@@ -74,10 +73,8 @@ public class AssignRegionHandler extends EventHandler {
}
private void cleanUpAndReportFailure(IOException error) throws IOException {
- LOG.warn("Failed to open region {}, will report to master", regionInfo.getRegionNameAsString(),
- error);
+ LOG.warn("Failed to open {}, will report master", regionInfo.getRegionNameAsString(), error);
HRegionServer rs = getServer();
- rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes(), Boolean.TRUE);
if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.FAILED_OPEN,
HConstants.NO_SEQNUM, openProcId, masterSystemTime, regionInfo))) {
throw new IOException(
@@ -93,7 +90,7 @@ public class AssignRegionHandler extends EventHandler {
String regionName = regionInfo.getRegionNameAsString();
Region onlineRegion = rs.getRegion(encodedName);
if (onlineRegion != null) {
- LOG.warn("Received OPEN for the region:{}, which is already online", regionName);
+ LOG.warn("Received OPEN for {}, which is already online", regionName);
// Just follow the old behavior, do we need to call reportRegionStateTransition? Maybe not?
// For normal case, it could happen that the rpc call to schedule this handler is succeeded,
// but before returning to master the connection is broken. And when master tries again, we
@@ -105,16 +102,14 @@ public class AssignRegionHandler extends EventHandler {
if (previous != null) {
if (previous) {
// The region is opening and this maybe a retry on the rpc call, it is safe to ignore it.
- LOG.info("Receiving OPEN for the region:{}, which we are already trying to OPEN" +
- " - ignoring this new request for this region.", regionName);
+ LOG.info("Receiving OPEN for {} which is OPENING; ignoring request", regionName);
} else {
// The region is closing. This is possible as we will update the region state to CLOSED when
// calling reportRegionStateTransition, so the HMaster will think the region is offline,
// before we actually close the region, as reportRegionStateTransition is part of the
// closing process.
long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
- LOG.info(
- "Receiving OPEN for the region:{}, which we are trying to close, try again after {}ms",
+ LOG.info("Receiving OPEN for {} which we are trying to CLOSE, try again after {}ms",
regionName, backoff);
rs.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS);
}
@@ -122,33 +117,39 @@ public class AssignRegionHandler extends EventHandler {
}
LOG.info("Open {}", regionName);
HRegion region;
+ // Start a try. Above we registered region with rs.getRegionsInTransitionInRS. Need to clear
+ // it on success or error at the end.
try {
- TableDescriptor htd =
- tableDesc != null ? tableDesc : rs.getTableDescriptors().get(regionInfo.getTable());
- if (htd == null) {
- throw new IOException("Missing table descriptor for " + regionName);
+ try {
+ TableDescriptor htd =
+ tableDesc != null ? tableDesc : rs.getTableDescriptors().get(regionInfo.getTable());
+ if (htd == null) {
+ throw new IOException("Missing table descriptor for " + regionName);
+ }
+ // pass null for the last parameter, which used to be a CancelableProgressable, as now the
+ // opening can not be interrupted by a close request any more.
+ region = HRegion
+ .openHRegion(regionInfo, htd, rs.getWAL(regionInfo), rs.getConfiguration(), rs, null);
+ } catch (IOException e) {
+ cleanUpAndReportFailure(e);
+ return;
}
- // pass null for the last parameter, which used to be a CancelableProgressable, as now the
- // opening can not be interrupted by a close request any more.
- region = HRegion.openHRegion(regionInfo, htd, rs.getWAL(regionInfo), rs.getConfiguration(),
- rs, null);
- } catch (IOException e) {
- cleanUpAndReportFailure(e);
- return;
- }
- rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime));
- rs.addRegion(region);
- LOG.info("Opened {}", regionName);
- // Cache the open region procedure id after report region transition succeed.
- rs.finishRegionProcedure(openProcId);
- Boolean current = rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes());
- if (current == null) {
- // Should NEVER happen, but let's be paranoid.
- LOG.error("Bad state: we've just opened a region that was NOT in transition. Region={}",
- regionName);
- } else if (!current) {
- // Should NEVER happen, but let's be paranoid.
- LOG.error("Bad state: we've just opened a region that was closing. Region={}", regionName);
+ rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime));
+ rs.addRegion(region);
+ LOG.info("Opened {}", regionName);
+ } finally {
+ // Remove from regionsInTransitionInRS whether success or failure.
+ Boolean current = rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes());
+ if (current == null) {
+ // Should NEVER happen, but let's be paranoid.
+ LOG.error("Bad state: we've just opened {} and it was NOT in transition", regionName);
+ } else if (!current) {
+ // Should NEVER happen, but let's be paranoid.
+ LOG.error("Bad state: we've just opened {} and it was closing", regionName);
+ }
+ // Cache the open region procedure id
+ rs.finishRegionProcedure(openProcId);
}
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseMetaHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseMetaHandler.java
index 38097bafd6..25c64f9ed7 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseMetaHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseMetaHandler.java
@@ -18,7 +18,8 @@
*/
package org.apache.hadoop.hbase.regionserver.handler;
-import org.apache.hadoop.hbase.Server;
+import edu.umd.cs.findbugs.annotations.Nullable;
+import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.executor.EventType;
import org.apache.hadoop.hbase.regionserver.RegionServerServices;
@@ -31,10 +32,8 @@ import org.apache.yetus.audience.InterfaceAudience;
public class CloseMetaHandler extends CloseRegionHandler {
// Called when regionserver determines its to go down; not master orchestrated
- public CloseMetaHandler(final Server server,
- final RegionServerServices rsServices,
- final RegionInfo regionInfo,
- final boolean abort) {
- super(server, rsServices, regionInfo, abort, EventType.M_RS_CLOSE_META, null);
+ public CloseMetaHandler(final RegionServerServices rsServices,
+ final RegionInfo regionInfo, final boolean abort, @Nullable ServerName destination) {
+ super(rsServices, regionInfo, abort, EventType.M_RS_CLOSE_META, destination);
}
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java
index d4ea004cb2..9d6222ac4e 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java
@@ -1,5 +1,4 @@
-/**
- *
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -18,10 +17,9 @@
*/
package org.apache.hadoop.hbase.regionserver.handler;
+import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.IOException;
-
import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.executor.EventHandler;
@@ -38,9 +36,13 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto
/**
* Handles closing of a region on a region server.
*
- * Now for regular close region request, we will use {@link UnassignRegionHandler} instead. But when
- * shutting down the region server, will also close regions and the related methods still use this
- * class so we keep it here.
+ * In normal operation, we use {@link UnassignRegionHandler} to close Regions, but when shutting
+ * down the region server and closing out Regions, we use this handler instead; it does not
+ * expect to be able to communicate the close back to the Master.
+ * Expects that the close has been registered in the hosting RegionServer before
+ * submitting this Handler; i.e. rss.getRegionsInTransitionInRS().putIfAbsent(
+ * this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE); has been called first.
+ * In here when done, we do the deregister.
* @see UnassignRegionHandler
*/
@InterfaceAudience.Private
@@ -52,85 +54,101 @@ public class CloseRegionHandler extends EventHandler {
// have a running queue of user regions to close?
private static final Logger LOG = LoggerFactory.getLogger(CloseRegionHandler.class);
- private final RegionServerServices rsServices;
+ private final RegionServerServices rss;
private final RegionInfo regionInfo;
// If true, the hosting server is aborting. Region close process is different
// when we are aborting.
private final boolean abort;
- private ServerName destination;
+
+ /**
+ * The target the Region is going to after successful close; usually null as it depends on
+ * context whether this is passed in or not.
+ */
+ private final ServerName destination;
/**
* This method used internally by the RegionServer to close out regions.
- * @param server
- * @param rsServices
- * @param regionInfo
+ *
* @param abort If the regionserver is aborting.
- * @param destination
*/
- public CloseRegionHandler(final Server server,
- final RegionServerServices rsServices,
- final RegionInfo regionInfo, final boolean abort,
- ServerName destination) {
- this(server, rsServices, regionInfo, abort,
- EventType.M_RS_CLOSE_REGION, destination);
+ public CloseRegionHandler(final RegionServerServices rsServices, final RegionInfo regionInfo,
+ final boolean abort, @Nullable ServerName destination) {
+ this(rsServices, regionInfo, abort, EventType.M_RS_CLOSE_REGION, destination);
}
- protected CloseRegionHandler(final Server server,
- final RegionServerServices rsServices, RegionInfo regionInfo,
- boolean abort, EventType eventType, ServerName destination) {
- super(server, eventType);
- this.server = server;
- this.rsServices = rsServices;
+ protected CloseRegionHandler(RegionServerServices rsServices, RegionInfo regionInfo,
+ boolean abort, EventType eventType, @Nullable ServerName destination) {
+ super(rsServices, eventType);
this.regionInfo = regionInfo;
this.abort = abort;
+ this.rss = rsServices;
this.destination = destination;
}
- public RegionInfo getRegionInfo() {
- return regionInfo;
- }
-
- @Override
- public void process() {
+ @Override public void process() throws IOException {
try {
- String name = regionInfo.getEncodedName();
- LOG.trace("Processing close of {}", name);
- String encodedRegionName = regionInfo.getEncodedName();
// Check that this region is being served here
- HRegion region = (HRegion)rsServices.getRegion(encodedRegionName);
+ HRegion region = (HRegion)this.rss.getRegion(this.regionInfo.getEncodedName());
if (region == null) {
- LOG.warn("Received CLOSE for region {} but currently not serving - ignoring", name);
+ LOG.warn("Received CLOSE for {} but not ONLINE; ignoring CLOSE request",
+ this.regionInfo.getRegionNameAsString());
// TODO: do better than a simple warning
return;
}
-
- // Close the region
- try {
- if (region.close(abort) == null) {
- // This region got closed. Most likely due to a split.
- // The split message will clean up the master state.
- LOG.warn("Can't close region {}, was already closed during close()", name);
- return;
- }
- } catch (IOException ioe) {
- // An IOException here indicates that we couldn't successfully flush the
- // memstore before closing. So, we need to abort the server and allow
- // the master to split our logs in order to recover the data.
- server.abort("Unrecoverable exception while closing region " +
- regionInfo.getRegionNameAsString() + ", still finishing close", ioe);
- throw new RuntimeException(ioe);
+ if (region.getCoprocessorHost() != null) {
+ // XXX: The behavior is a bit broken. At master side there is no FAILED_CLOSE state, so if
+ // there are exceptions thrown from the CP, we can not report the error to the master, and
+ // if here is, we just return without calling reportRegionStateTransition, the TRSP at
+ // master side will hang there for ever. So here if the CP throws an exception out, the only
+ // way is to abort the RS...
+ region.getCoprocessorHost().preClose(abort);
}
-
- this.rsServices.removeRegion(region, destination);
- rsServices.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED,
- HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo));
-
+ if (region.close(abort) == null) {
+ // This should not happen. Left over from old days when RegionServer
+ // ran the splits? Now splits are run by the Master.
+ LOG.warn("Can't close {}, already closed; split? (Unexpected state - investigate)",
+ this.regionInfo.getRegionNameAsString());
+ // Just return; we didn't do the close on the Region.
+ return;
+ }
+ this.rss.removeRegion(region, destination);
+ boolean b = reportRegionStateTransition();
// Done! Region is closed on this RS
- LOG.debug("Closed " + region.getRegionInfo().getRegionNameAsString());
+ LOG.debug("Closed {} (notified master={})", this.regionInfo.getRegionNameAsString(), b);
+ // We don't care if we are able to inform the Master successfully; try but ignore response.
} finally {
- this.rsServices.getRegionsInTransitionInRS().
- remove(this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE);
+ // Presumption is that before the Close was run, the CLOSE operation had been registered
+ // with rss.getRegionsInTransitionInRS(). Here we do cleanup whether success or failure.
+ this.rss.getRegionsInTransitionInRS().remove(this.regionInfo.getEncodedNameAsBytes(),
+ Boolean.FALSE);
}
}
+
+ /**
+ * @throws IOException Let subclasses throw IOE if they need to.
+ */
+ boolean reportRegionStateTransition() throws IOException {
+ return this.rss.reportRegionStateTransition(
+ new RegionStateTransitionContext(TransitionCode.CLOSED, HConstants.NO_SEQNUM,
+ getClosePID(), -1, this.regionInfo));
+ }
+
+ /**
+ * @return PID of Close to report to Master; default is Procedure.NO_PROC_ID.
+ */
+ long getClosePID() {
+ return Procedure.NO_PROC_ID;
+ }
+
+ @Override protected void handleException(Throwable t) {
+ LOG.warn("Fatal error occurred while closing region {}, aborting...",
+ this.regionInfo.getRegionNameAsString(), t);
+ this.rss.abort("Failed to close region " + this.regionInfo.getRegionNameAsString() +
+ " and can not recover", t);
+ }
+
+ RegionInfo getRegionInfo() {
+ return this.regionInfo;
+ }
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java
index 0fc8edc47a..096b9b80e2 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -20,22 +20,18 @@ package org.apache.hadoop.hbase.regionserver.handler;
import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
-import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.executor.EventHandler;
import org.apache.hadoop.hbase.executor.EventType;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.Region;
-import org.apache.hadoop.hbase.regionserver.RegionServerServices.RegionStateTransitionContext;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
-
/**
* Handles closing of a region on a region server.
*
@@ -44,11 +40,11 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto
* with the zk less assignment for 1.x, otherwise it is not possible to do rolling upgrade.
*/
@InterfaceAudience.Private
-public class UnassignRegionHandler extends EventHandler {
+public final class UnassignRegionHandler extends EventHandler {
private static final Logger LOG = LoggerFactory.getLogger(UnassignRegionHandler.class);
- private final String encodedName;
+ private final String encodedRegionName;
private final long closeProcId;
// If true, the hosting server is aborting. Region close process is different
@@ -56,18 +52,25 @@ public class UnassignRegionHandler extends EventHandler {
// TODO: not used yet, we still use the old CloseRegionHandler when aborting
private final boolean abort;
- private final ServerName destination;
-
private final RetryCounter retryCounter;
- public UnassignRegionHandler(HRegionServer server, String encodedName, long closeProcId,
- boolean abort, @Nullable ServerName destination, EventType eventType) {
+ /**
+ * The target the Region is going to after successful close; usually null as it depends on
+ * context whether this is passed in or not.
+ */
+ private final ServerName destination;
+
+ /**
+ * @see #create(HRegionServer, String, long, boolean, ServerName)
+ */
+ private UnassignRegionHandler(HRegionServer server, String encodedRegionName, long closeProcId,
+ boolean abort, @Nullable ServerName destination, EventType eventType) {
super(server, eventType);
- this.encodedName = encodedName;
+ this.encodedRegionName = encodedRegionName;
this.closeProcId = closeProcId;
this.abort = abort;
- this.destination = destination;
this.retryCounter = HandlerUtil.getRetryCounter();
+ this.destination = destination;
}
private HRegionServer getServer() {
@@ -76,75 +79,75 @@ public class UnassignRegionHandler extends EventHandler {
@Override
public void process() throws IOException {
- HRegionServer rs = getServer();
- byte[] encodedNameBytes = Bytes.toBytes(encodedName);
- Boolean previous = rs.getRegionsInTransitionInRS().putIfAbsent(encodedNameBytes, Boolean.FALSE);
- if (previous != null) {
- if (previous) {
- // This could happen as we will update the region state to OPEN when calling
- // reportRegionStateTransition, so the HMaster will think the region is online, before we
- // actually open the region, as reportRegionStateTransition is part of the opening process.
- long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
- LOG.warn("Received CLOSE for the region: {}, which we are already " +
- "trying to OPEN. try again after {}ms", encodedName, backoff);
- rs.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS);
- } else {
- LOG.info("Received CLOSE for the region: {}, which we are already trying to CLOSE," +
- " but not completed yet", encodedName);
+ HRegionServer rss = getServer();
+ try {
+ byte[] encodedRegionNameAsBytes = Bytes.toBytes(this.encodedRegionName);
+ Boolean previous = rss.getRegionsInTransitionInRS().
+ putIfAbsent(encodedRegionNameAsBytes, Boolean.FALSE);
+ if (previous != null) {
+ if (previous) {
+ // This could happen as we will update the region state to OPEN when calling
+ // reportRegionStateTransition, so the HMaster will think the region is online, before
+ // we actually open the region, as reportRegionStateTransition is part of the opening
+ // process.
+ long backoff = this.retryCounter.getBackoffTimeAndIncrementAttempts();
+ LOG.warn("Received CLOSE for {} which is being OPENED; try again after {}ms",
+ this.encodedRegionName, backoff);
+ rss.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS);
+ } else {
+ LOG.info("Received CLOSE for {}, which is CLOSING but not complete yet",
+ this.encodedRegionName);
+ }
+ return;
}
- return;
- }
- HRegion region = rs.getRegion(encodedName);
- if (region == null) {
- LOG.debug(
- "Received CLOSE for a region {} which is not online, and we're not opening/closing.",
- encodedName);
- rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE);
- return;
- }
- String regionName = region.getRegionInfo().getEncodedName();
- LOG.info("Close {}", regionName);
- if (region.getCoprocessorHost() != null) {
- // XXX: The behavior is a bit broken. At master side there is no FAILED_CLOSE state, so if
- // there are exception thrown from the CP, we can not report the error to master, and if here
- // we just return without calling reportRegionStateTransition, the TRSP at master side will
- // hang there for ever. So here if the CP throws an exception out, the only way is to abort
- // the RS...
- region.getCoprocessorHost().preClose(abort);
- }
- if (region.close(abort) == null) {
- // XXX: Is this still possible? The old comment says about split, but now split is done at
- // master side, so...
- LOG.warn("Can't close region {}, was already closed during close()", regionName);
- rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE);
- return;
- }
- rs.removeRegion(region, destination);
- if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED,
- HConstants.NO_SEQNUM, closeProcId, -1, region.getRegionInfo()))) {
- throw new IOException("Failed to report close to master: " + regionName);
+ HRegion region = rss.getRegion(this.encodedRegionName);
+ // Can't unassign a Region that is not online.
+ if (region == null) {
+ LOG.debug("Received CLOSE for {} which is not online, and we're not opening/closing.",
+ this.encodedRegionName);
+ // Remove what we added above.
+ rss.getRegionsInTransitionInRS().remove(encodedRegionNameAsBytes, Boolean.FALSE);
+ return;
+ }
+ CloseRegionHandler crh = new CloseRegionHandler(rss, region.getRegionInfo(), this.abort,
+ this.destination) {
+ @Override boolean reportRegionStateTransition() throws IOException {
+ boolean b = super.reportRegionStateTransition();
+ if (!b) {
+ String name = getRegionInfo().getRegionNameAsString();
+ throw new IOException("Failed report close of " + name + " to master");
+ }
+ return b;
+ }
+
+ @Override long getClosePID() {
+ return UnassignRegionHandler.this.closeProcId;
+ }
+ };
+ crh.process();
+ } finally {
+ // Cache the close region procedure id
+ rss.finishRegionProcedure(closeProcId);
}
- // Cache the close region procedure id after report region transition succeed.
- rs.finishRegionProcedure(closeProcId);
- rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE);
- LOG.info("Closed {}", regionName);
}
@Override
protected void handleException(Throwable t) {
- LOG.warn("Fatal error occurred while closing region {}, aborting...", encodedName, t);
- getServer().abort("Failed to close region " + encodedName + " and can not recover", t);
+ LOG.warn("Fatal error occurred while closing {}, aborting...", this.encodedRegionName, t);
+ getServer().abort("Failed to close " + this.encodedRegionName, t);
}
+ /**
+ * @return A handler to run the close of the named Region whether or not it is currently online.
+ */
public static UnassignRegionHandler create(HRegionServer server, String encodedName,
- long closeProcId, boolean abort, @Nullable ServerName destination) {
+ long closeProcId, boolean abort, @Nullable ServerName destination) {
// Just try our best to determine whether it is for closing meta. It is not the end of the world
// if we put the handler into a wrong executor.
Region region = server.getRegion(encodedName);
- EventType eventType =
- region != null && region.getRegionInfo().isMetaRegion() ? EventType.M_RS_CLOSE_META
- : EventType.M_RS_CLOSE_REGION;
- return new UnassignRegionHandler(server, encodedName, closeProcId, abort, destination,
- eventType);
+ EventType eventType = region != null && region.getRegionInfo().isMetaRegion()?
+ EventType.M_RS_CLOSE_META: EventType.M_RS_CLOSE_REGION;
+ return new UnassignRegionHandler(server, encodedName, closeProcId, abort,
+ destination, eventType);
}
}
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java
index cfa6f75fae..9efd738407 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java
@@ -308,18 +308,21 @@ public class JVMClusterUtil {
if (t.isAlive()) {
atLeastOneLiveServer = true;
try {
- LOG.warn("RegionServerThreads remaining, give one more chance before interrupting");
+ LOG.warn("RegionServerThreads remaining, give one more chance before "
+ + "interrupting; joining " + t.getRegionServer().getServerName() + " for a second");
t.join(1000);
} catch (InterruptedException e) {
wasInterrupted = true;
}
}
}
- if (!atLeastOneLiveServer) break;
+ if (!atLeastOneLiveServer) {
+ break;
+ }
for (RegionServerThread t : regionservers) {
if (t.isAlive()) {
LOG.warn("RegionServerThreads taking too long to stop, interrupting; thread dump " +
- "if > 3 attempts: i=" + i);
+ "if > 3 attempts: i=" + i + "; alive=" + t.getRegionServer().getServerName());
if (i > 3) {
Threads.printThreadInfo(System.out, "Thread dump " + t.getName());
}
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterAbortAndRSGotKilled.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterAbortAndRSGotKilled.java
index 3df49290f6..0e10968522 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterAbortAndRSGotKilled.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterAbortAndRSGotKilled.java
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hbase.master;
+import static org.junit.Assert.assertNotEquals;
import java.io.IOException;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
@@ -62,12 +63,13 @@ public class TestMasterAbortAndRSGotKilled {
private static CountDownLatch countDownLatch = new CountDownLatch(1);
private static byte[] CF = Bytes.toBytes("cf");
+ private static int NODES = 2;
@BeforeClass
public static void setUp() throws Exception {
UTIL.getConfiguration().setStrings(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY,
DelayCloseCP.class.getName());
- UTIL.startMiniCluster(3);
+ UTIL.startMiniCluster(NODES);
UTIL.getAdmin().balancerSwitch(false, true);
UTIL.createTable(TABLE_NAME, CF);
UTIL.waitTableAvailable(TABLE_NAME);
@@ -99,14 +101,16 @@ public class TestMasterAbortAndRSGotKilled {
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor()
.submitProcedure(moveRegionProcedure);
countDownLatch.await();
- UTIL.getMiniHBaseCluster().stopMaster(0);
- UTIL.getMiniHBaseCluster().startMaster();
+ JVMClusterUtil.MasterThread oldMaster = UTIL.getMiniHBaseCluster().stopMaster(0);
+ final JVMClusterUtil.MasterThread newMaster = UTIL.getMiniHBaseCluster().startMaster();
+ assertNotEquals(oldMaster.getMaster().getServerName().toString(),
+ newMaster.getMaster().getServerName().toString());
// wait until master initialized
- UTIL.waitFor(30000, () -> UTIL.getMiniHBaseCluster().getMaster() != null &&
- UTIL.getMiniHBaseCluster().getMaster().isInitialized());
- Assert.assertTrue("Should be 3 RS after master restart",
- UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 3);
-
+ UTIL.waitFor(30000, () -> newMaster.getMaster().isInitialized());
+ Assert.assertTrue("Should be " + NODES + " RS after master restart",
+ UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == NODES);
+ // We don't check that the move completes? Should it? Waiting on it here doesn't work.
+ // Is it supposed to? I'd think so? St.Ack <= 03/13/2020
}
public static class DelayCloseCP implements RegionCoprocessor, RegionObserver {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java
index 77e4079d09..dbbcc1cdd6 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java
@@ -249,9 +249,6 @@ public class TestRegionServerNoMaster {
closeRegionNoZK();
checkRegionIsClosed(HTU, getRS(), hri);
- // Let do the initial steps, without having a handler
- getRS().getRegionsInTransitionInRS().put(hri.getEncodedNameAsBytes(), Boolean.TRUE);
-
// That's a close without ZK.
AdminProtos.CloseRegionRequest crr =
ProtobufUtil.buildCloseRegionRequest(getRS().getServerName(), regionName);
@@ -261,10 +258,6 @@ public class TestRegionServerNoMaster {
} catch (org.apache.hbase.thirdparty.com.google.protobuf.ServiceException expected) {
}
- // The state in RIT should have changed to close
- Assert.assertEquals(Boolean.FALSE, getRS().getRegionsInTransitionInRS().get(
- hri.getEncodedNameAsBytes()));
-
// Let's start the open handler
TableDescriptor htd = getRS().tableDescriptors.get(hri.getTable());
--
2.19.1