From 7987c7897ce3d5894b70b9226eef6a158784ae15 Mon Sep 17 00:00:00 2001 From: stack Date: Tue, 17 Mar 2020 22:09:14 -0700 Subject: [PATCH] HBASE-23984 [Flakey Tests] TestMasterAbortAndRSGotKilled fails in teardown Remove Region from RegionServer#regionsInTransitionInRS Map even on exception (because server is aborted). hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Only dump metrics once on abort. Refactor closeRegion method removing overlap with CloseRegionHandler; i.e. call to CP. hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java Refactor so we remove Region from regionsInTransitionRS on success or failure. hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java Cleanup but also made it so UnassignRegionHandler can use it closing Regions. hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java Use CloseRegionHandler to close Region rather than dupe code. hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java Bit more info when thread dumping... TestRegionServerNoMaster was changed because it presumes a behavior not present anymore (that CRH will leave danglng mention in regionsInTransitionInRS as it used to) --- .../hbase/regionserver/HRegionServer.java | 127 +++++++-------- .../handler/AssignRegionHandler.java | 71 ++++----- .../handler/CloseMetaHandler.java | 11 +- .../handler/CloseRegionHandler.java | 138 +++++++++-------- .../handler/UnassignRegionHandler.java | 145 +++++++++--------- .../hadoop/hbase/util/JVMClusterUtil.java | 9 +- .../master/TestMasterAbortAndRSGotKilled.java | 22 +-- .../TestRegionServerNoMaster.java | 7 - 8 files changed, 276 insertions(+), 254 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index fc0b69345c..dbee9e40fa 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -22,6 +22,7 @@ import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_SPLIT_WAL_MAX_SPL import static org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_COORDINATED_BY_ZK; import static org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_MAX_SPLITTER; import static org.apache.hadoop.hbase.util.DNS.RS_HOSTNAME_KEY; +import edu.umd.cs.findbugs.annotations.Nullable; import java.io.IOException; import java.lang.Thread.UncaughtExceptionHandler; import java.lang.management.MemoryType; @@ -53,6 +54,7 @@ import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.stream.Collectors; import javax.management.MalformedObjectNameException; import javax.servlet.http.HttpServlet; import org.apache.commons.lang3.RandomUtils; @@ -247,9 +249,8 @@ public class HRegionServer extends HasThread implements public static boolean TEST_SKIP_REPORTING_TRANSITION = false; /** - * A map from RegionName to current action in progress. Boolean value indicates: - * true - if open region action in progress - * false - if close region action in progress + * A map from encoded region name as bytes to current action in progress. 
Boolean value indicates: + * TRUE if open region action in progress, and FALSE if close region action in progress */ private final ConcurrentMap regionsInTransitionInRS = new ConcurrentSkipListMap<>(Bytes.BYTES_COMPARATOR); @@ -1010,7 +1011,7 @@ public class HRegionServer extends HasThread implements } else if (!this.stopping) { this.stopping = true; LOG.info("Closing user regions"); - closeUserRegions(this.abortRequested); + closeUserRegions(isAborted()); } else { boolean allUserRegionsOffline = areAllUserRegionsOffline(); if (allUserRegionsOffline) { @@ -1026,7 +1027,7 @@ public class HRegionServer extends HasThread implements // Make sure all regions have been closed -- some regions may // have not got it because we were splitting at the time of // the call to closeUserRegions. - closeUserRegions(this.abortRequested); + closeUserRegions(isAborted()); } LOG.debug("Waiting on " + getOnlineRegionsAsPrintableString()); } @@ -1081,18 +1082,18 @@ public class HRegionServer extends HasThread implements // Stop the snapshot and other procedure handlers, forcefully killing all running tasks if (rspmHost != null) { - rspmHost.stop(this.abortRequested || this.killed); + rspmHost.stop(isAborted() || this.killed); } if (this.killed) { // Just skip out w/o closing regions. Used when testing. - } else if (abortRequested) { + } else if (isAborted()) { if (this.dataFsOk) { - closeUserRegions(abortRequested); // Don't leave any open file handles + closeUserRegions(isAborted()); // Don't leave any open file handles } LOG.info("aborting server " + this.serverName); } else { - closeUserRegions(abortRequested); + closeUserRegions(isAborted()); LOG.info("stopping server " + this.serverName); } @@ -1108,17 +1109,17 @@ public class HRegionServer extends HasThread implements // Closing the compactSplit thread before closing meta regions if (!this.killed && containsMetaTableRegions()) { - if (!abortRequested || this.dataFsOk) { + if (!isAborted() || this.dataFsOk) { if (this.compactSplitThread != null) { this.compactSplitThread.join(); this.compactSplitThread = null; } - closeMetaTableRegions(abortRequested); + closeMetaTableRegions(isAborted()); } } if (!this.killed && this.dataFsOk) { - waitOnAllRegionsToClose(abortRequested); + waitOnAllRegionsToClose(isAborted()); LOG.info("stopping server " + this.serverName + "; all regions closed."); } @@ -1133,7 +1134,7 @@ public class HRegionServer extends HasThread implements // flag may be changed when closing regions throws exception. if (this.dataFsOk) { - shutdownWAL(!abortRequested); + shutdownWAL(!isAborted()); } // Make sure the proxy is down. @@ -1457,6 +1458,9 @@ public class HRegionServer extends HasThread implements " because some regions failed closing"); } break; + } else { + LOG.trace("Waiting on {}", this.regionsInTransitionInRS.entrySet().stream(). + map(e -> Bytes.toString(e.getKey())).collect(Collectors.joining(", "))); } if (sleep(200)) { interrupted = true; @@ -2446,6 +2450,12 @@ public class HRegionServer extends HasThread implements */ @Override public void abort(String reason, Throwable cause) { + if (isAborted()) { + // Don't run multiple aborts; fills logs w/ spew. 
+ LOG.info("Abort called but server already aborted; by-passing; abort invoked because {}", + reason, cause); + return; + } String msg = "***** ABORTING region server " + this + ": " + reason + " *****"; if (cause != null) { LOG.error(HBaseMarkers.FATAL, msg, cause); @@ -3185,71 +3195,61 @@ public class HRegionServer extends HasThread implements * If a close was in progress, this new request will be ignored, and an exception thrown. *
* - * @param encodedName Region to close * @param abort True if we are aborting + * @param destination Where the Region is being moved too... maybe null if unknown. * @return True if closed a region. * @throws NotServingRegionException if the region is not online */ - protected boolean closeRegion(String encodedName, final boolean abort, final ServerName sn) - throws NotServingRegionException { - //Check for permissions to close. - HRegion actualRegion = this.getRegion(encodedName); - // Can be null if we're calling close on a region that's not online - if ((actualRegion != null) && (actualRegion.getCoprocessorHost() != null)) { - try { - actualRegion.getCoprocessorHost().preClose(false); - } catch (IOException exp) { - LOG.warn("Unable to close region: the coprocessor launched an error ", exp); - return false; - } - } - - // previous can come back 'null' if not in map. - final Boolean previous = this.regionsInTransitionInRS.putIfAbsent(Bytes.toBytes(encodedName), - Boolean.FALSE); - + protected boolean closeRegion(String encodedRegionName, final boolean abort, + @Nullable final ServerName destination) throws NotServingRegionException { + // TODO: Check for perms to close. + // Ideally we'd shove a bunch of the below logic into CloseRegionHandler only we have to do + // some handling inline w/ the request such as throwing NotServervingRegionException if we + // are not hosting so client gets the message directly; can't do this from inside an + // executor service. + Region region = getRegion(encodedRegionName); + if (region == null) { + // Changed behavior. We'd proceed though Region not online. + // The master deletes the znode when it receives this exception. + throw new NotServingRegionException(encodedRegionName); + } + byte [] encodedRegionNameAsBytes = Bytes.toBytes(encodedRegionName); + // This getting from this.regionsInTransitionInRS and checking the return is done in a few + // places, mostly over in Handlers; this.regionsInTransitionInRS is how we ensure we don't + // clash opens/closes or duplicate closes when one is ongoing. FYI. + final Boolean previous = this.regionsInTransitionInRS. + putIfAbsent(encodedRegionNameAsBytes, Boolean.FALSE); if (Boolean.TRUE.equals(previous)) { - LOG.info("Received CLOSE for the region:" + encodedName + " , which we are already " + - "trying to OPEN. Cancelling OPENING."); - if (!regionsInTransitionInRS.replace(Bytes.toBytes(encodedName), previous, Boolean.FALSE)) { + LOG.info("Received CLOSE for {} which is OPENING. Cancelling OPENING.", encodedRegionName); + if (!this.regionsInTransitionInRS.replace(encodedRegionNameAsBytes, previous, + Boolean.FALSE)) { // The replace failed. That should be an exceptional case, but theoretically it can happen. - // We're going to try to do a standard close then. - LOG.warn("The opening for region " + encodedName + " was done before we could cancel it." + - " Doing a standard close now"); - return closeRegion(encodedName, abort, sn); + // We're going to try to rerun the close. + LOG.warn("The opening of {} was done before we could cancel it; running a new close now", + encodedRegionName); + return closeRegion(encodedRegionName, abort, destination); } // Let's get the region from the online region list again - actualRegion = this.getRegion(encodedName); - if (actualRegion == null) { // If already online, we still need to close it. 
- LOG.info("The opening previously in progress has been cancelled by a CLOSE request."); + region = this.getRegion(encodedRegionName); + if (region == null) { + LOG.info("The opening of {} has been cancelled by a CLOSE request", encodedRegionName); // The master deletes the znode when it receives this exception. - throw new NotServingRegionException("The region " + encodedName + - " was opening but not yet served. Opening is cancelled."); + throw new NotServingRegionException(encodedRegionName + + " was opening but not yet served; cancelled by close"); } } else if (previous == null) { - LOG.info("Received CLOSE for {}", encodedName); + LOG.info("Received CLOSE for {}", encodedRegionName); } else if (Boolean.FALSE.equals(previous)) { - LOG.info("Received CLOSE for the region: " + encodedName + - ", which we are already trying to CLOSE, but not completed yet"); + LOG.info("Received CLOSE for {} which we are already trying to CLOSE", encodedRegionName); return true; } + // Above we've registered our CLOSE with this.regionsInTransitionInRS. Now run close handler. + // The close handler will undo the entry in this.regionsInTransitionInRS when dones + // (awkward but close handler runs in a different thread). + this.executorService.submit(region.getRegionInfo().isMetaRegion()? + new CloseMetaHandler(this, region.getRegionInfo(), abort, destination): + new CloseRegionHandler(this, region.getRegionInfo(), abort, destination)); - if (actualRegion == null) { - LOG.debug("Received CLOSE for a region which is not online, and we're not opening."); - this.regionsInTransitionInRS.remove(Bytes.toBytes(encodedName)); - // The master deletes the znode when it receives this exception. - throw new NotServingRegionException("The region " + encodedName + - " is not online, and is not opening."); - } - - CloseRegionHandler crh; - final RegionInfo hri = actualRegion.getRegionInfo(); - if (hri.isMetaRegion()) { - crh = new CloseMetaHandler(this, this, hri, abort); - } else { - crh = new CloseRegionHandler(this, this, hri, abort, sn); - } - this.executorService.submit(crh); return true; } @@ -3833,6 +3833,7 @@ public class HRegionServer extends HasThread implements * * @param procId the id of the open/close region procedure * @return true if the procedure can be submitted. + * @see #finishRegionProcedure */ boolean submitRegionProcedure(long procId) { if (procId == -1) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java index 76e8f80271..8cdd404760 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -34,7 +34,6 @@ import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** @@ -74,10 +73,8 @@ public class AssignRegionHandler extends EventHandler { } private void cleanUpAndReportFailure(IOException error) throws IOException { - LOG.warn("Failed to open region {}, will report to master", regionInfo.getRegionNameAsString(), - error); + LOG.warn("Failed to open {}, will report master", regionInfo.getRegionNameAsString(), error); HRegionServer rs = getServer(); - rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes(), Boolean.TRUE); if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.FAILED_OPEN, HConstants.NO_SEQNUM, openProcId, masterSystemTime, regionInfo))) { throw new IOException( @@ -93,7 +90,7 @@ public class AssignRegionHandler extends EventHandler { String regionName = regionInfo.getRegionNameAsString(); Region onlineRegion = rs.getRegion(encodedName); if (onlineRegion != null) { - LOG.warn("Received OPEN for the region:{}, which is already online", regionName); + LOG.warn("Received OPEN for {}, which is already online", regionName); // Just follow the old behavior, do we need to call reportRegionStateTransition? Maybe not? // For normal case, it could happen that the rpc call to schedule this handler is succeeded, // but before returning to master the connection is broken. And when master tries again, we @@ -105,16 +102,14 @@ public class AssignRegionHandler extends EventHandler { if (previous != null) { if (previous) { // The region is opening and this maybe a retry on the rpc call, it is safe to ignore it. - LOG.info("Receiving OPEN for the region:{}, which we are already trying to OPEN" + - " - ignoring this new request for this region.", regionName); + LOG.info("Receiving OPEN for {} which is OPENING; ignoring request", regionName); } else { // The region is closing. This is possible as we will update the region state to CLOSED when // calling reportRegionStateTransition, so the HMaster will think the region is offline, // before we actually close the region, as reportRegionStateTransition is part of the // closing process. long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); - LOG.info( - "Receiving OPEN for the region:{}, which we are trying to close, try again after {}ms", + LOG.info("Receiving OPEN for {} which we are trying to CLOSE, try again after {}ms", regionName, backoff); rs.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS); } @@ -122,33 +117,39 @@ public class AssignRegionHandler extends EventHandler { } LOG.info("Open {}", regionName); HRegion region; + // Start a try. Above we registered region with rs.getRegionsInTransitionInRS. Need to clear + // it on success or error on the end. try { - TableDescriptor htd = - tableDesc != null ? tableDesc : rs.getTableDescriptors().get(regionInfo.getTable()); - if (htd == null) { - throw new IOException("Missing table descriptor for " + regionName); + try { + TableDescriptor htd = + tableDesc != null ? 
tableDesc : rs.getTableDescriptors().get(regionInfo.getTable()); + if (htd == null) { + throw new IOException("Missing table descriptor for " + regionName); + } + // pass null for the last parameter, which used to be a CancelableProgressable, as now the + // opening can not be interrupted by a close request any more. + region = HRegion + .openHRegion(regionInfo, htd, rs.getWAL(regionInfo), rs.getConfiguration(), rs, null); + } catch (IOException e) { + cleanUpAndReportFailure(e); + return; } - // pass null for the last parameter, which used to be a CancelableProgressable, as now the - // opening can not be interrupted by a close request any more. - region = HRegion.openHRegion(regionInfo, htd, rs.getWAL(regionInfo), rs.getConfiguration(), - rs, null); - } catch (IOException e) { - cleanUpAndReportFailure(e); - return; - } - rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime)); - rs.addRegion(region); - LOG.info("Opened {}", regionName); - // Cache the open region procedure id after report region transition succeed. - rs.finishRegionProcedure(openProcId); - Boolean current = rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes()); - if (current == null) { - // Should NEVER happen, but let's be paranoid. - LOG.error("Bad state: we've just opened a region that was NOT in transition. Region={}", - regionName); - } else if (!current) { - // Should NEVER happen, but let's be paranoid. - LOG.error("Bad state: we've just opened a region that was closing. Region={}", regionName); + rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime)); + rs.addRegion(region); + LOG.info("Opened {}", regionName); + } finally { + // Remove from regionsInTransitionRS whether success or failure. + rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes(), Boolean.TRUE); + Boolean current = rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes()); + if (current == null) { + // Should NEVER happen, but let's be paranoid. + LOG.error("Bad state: we've just opened {} and it was NOT in transition", regionName); + } else if (!current) { + // Should NEVER happen, but let's be paranoid. 
+ LOG.error("Bad state: we've just opened {} and it was closing", regionName); + } + // Cache the open region procedure id + rs.finishRegionProcedure(openProcId); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseMetaHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseMetaHandler.java index 38097bafd6..25c64f9ed7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseMetaHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseMetaHandler.java @@ -18,7 +18,8 @@ */ package org.apache.hadoop.hbase.regionserver.handler; -import org.apache.hadoop.hbase.Server; +import edu.umd.cs.findbugs.annotations.Nullable; +import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.executor.EventType; import org.apache.hadoop.hbase.regionserver.RegionServerServices; @@ -31,10 +32,8 @@ import org.apache.yetus.audience.InterfaceAudience; public class CloseMetaHandler extends CloseRegionHandler { // Called when regionserver determines its to go down; not master orchestrated - public CloseMetaHandler(final Server server, - final RegionServerServices rsServices, - final RegionInfo regionInfo, - final boolean abort) { - super(server, rsServices, regionInfo, abort, EventType.M_RS_CLOSE_META, null); + public CloseMetaHandler(final RegionServerServices rsServices, + final RegionInfo regionInfo, final boolean abort, @Nullable ServerName destination) { + super(rsServices, regionInfo, abort, EventType.M_RS_CLOSE_META, destination); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java index d4ea004cb2..9d6222ac4e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java @@ -1,5 +1,4 @@ -/** - * +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -18,10 +17,9 @@ */ package org.apache.hadoop.hbase.regionserver.handler; +import edu.umd.cs.findbugs.annotations.Nullable; import java.io.IOException; - import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.executor.EventHandler; @@ -38,9 +36,13 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto /** * Handles closing of a region on a region server. *
- * Now for regular close region request, we will use {@link UnassignRegionHandler} instead. But when - * shutting down the region server, will also close regions and the related methods still use this - * class so we keep it here. + * In normal operation, we use {@link UnassignRegionHandler} closing Regions but when shutting down + * the region server and closing out Regions, we use this handler instead; it does not expect to + * be able to communicate the close back to the Master. + *
Expects that the close has been registered in the hosting RegionServer before + * submitting this Handler; i.e. rss.getRegionsInTransitionInRS().putIfAbsent( + * this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE); has been called first. + * In here when done, we do the deregister.
* @see UnassignRegionHandler */ @InterfaceAudience.Private @@ -52,85 +54,101 @@ public class CloseRegionHandler extends EventHandler { // have a running queue of user regions to close? private static final Logger LOG = LoggerFactory.getLogger(CloseRegionHandler.class); - private final RegionServerServices rsServices; + private final RegionServerServices rss; private final RegionInfo regionInfo; // If true, the hosting server is aborting. Region close process is different // when we are aborting. private final boolean abort; - private ServerName destination; + + /** + * The target the Region is going to after successful close; usually null as it depends on + * context whether this is passed in or not. + */ + private final ServerName destination; /** * This method used internally by the RegionServer to close out regions. - * @param server - * @param rsServices - * @param regionInfo + * * @param abort If the regionserver is aborting. - * @param destination */ - public CloseRegionHandler(final Server server, - final RegionServerServices rsServices, - final RegionInfo regionInfo, final boolean abort, - ServerName destination) { - this(server, rsServices, regionInfo, abort, - EventType.M_RS_CLOSE_REGION, destination); + public CloseRegionHandler(final RegionServerServices rsServices, final RegionInfo regionInfo, + final boolean abort, @Nullable ServerName destination) { + this(rsServices, regionInfo, abort, EventType.M_RS_CLOSE_REGION, destination); } - protected CloseRegionHandler(final Server server, - final RegionServerServices rsServices, RegionInfo regionInfo, - boolean abort, EventType eventType, ServerName destination) { - super(server, eventType); - this.server = server; - this.rsServices = rsServices; + protected CloseRegionHandler(RegionServerServices rsServices, RegionInfo regionInfo, + boolean abort, EventType eventType, @Nullable ServerName destination) { + super(rsServices, eventType); this.regionInfo = regionInfo; this.abort = abort; + this.rss = rsServices; this.destination = destination; } - public RegionInfo getRegionInfo() { - return regionInfo; - } - - @Override - public void process() { + @Override public void process() throws IOException { try { - String name = regionInfo.getEncodedName(); - LOG.trace("Processing close of {}", name); - String encodedRegionName = regionInfo.getEncodedName(); // Check that this region is being served here - HRegion region = (HRegion)rsServices.getRegion(encodedRegionName); + HRegion region = (HRegion)this.rss.getRegion(this.regionInfo.getEncodedName()); if (region == null) { - LOG.warn("Received CLOSE for region {} but currently not serving - ignoring", name); + LOG.warn("Received CLOSE for {} but not ONLINE; ignoring CLOSE request", + this.regionInfo.getRegionNameAsString()); // TODO: do better than a simple warning return; } - - // Close the region - try { - if (region.close(abort) == null) { - // This region got closed. Most likely due to a split. - // The split message will clean up the master state. - LOG.warn("Can't close region {}, was already closed during close()", name); - return; - } - } catch (IOException ioe) { - // An IOException here indicates that we couldn't successfully flush the - // memstore before closing. So, we need to abort the server and allow - // the master to split our logs in order to recover the data. 
- server.abort("Unrecoverable exception while closing region " + - regionInfo.getRegionNameAsString() + ", still finishing close", ioe); - throw new RuntimeException(ioe); + if (region.getCoprocessorHost() != null) { + // XXX: The behavior is a bit broken. At master side there is no FAILED_CLOSE state, so if + // there are exceptions thrown from the CP, we can not report the error to the master, and + // if here is, we just return without calling reportRegionStateTransition, the TRSP at + // master side will hang there for ever. So here if the CP throws an exception out, the only + // way is to abort the RS... + region.getCoprocessorHost().preClose(abort); } - - this.rsServices.removeRegion(region, destination); - rsServices.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED, - HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo)); - + if (region.close(abort) == null) { + // This should not happen. Left over from old days when RegionServer + // ran the splits? Now splits are run by the Master. + LOG.warn("Can't close {}, already closed; split? (Unexpected state - investigate)", + this.regionInfo.getRegionNameAsString()); + // Return null here. Makes more sense. We didnt do close on the Region. + return; + } + this.rss.removeRegion(region, destination); + boolean b = reportRegionStateTransition(); // Done! Region is closed on this RS - LOG.debug("Closed " + region.getRegionInfo().getRegionNameAsString()); + LOG.debug("Closed {} (notified master={})", this.regionInfo.getRegionNameAsString(), b); + // We don't care if we are able to inform the Master successfully; try but ignore response. } finally { - this.rsServices.getRegionsInTransitionInRS(). - remove(this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE); + // Presumption is that before the Close was run, the CLOSE operation had been registered + // with rss.getRegionsInTransitionInRS(). Here we do cleanup whether success or failure. + this.rss.getRegionsInTransitionInRS().remove(this.regionInfo.getEncodedNameAsBytes(), + Boolean.FALSE); } } + + /** + * @throws IOException Let subclasses throw IOE if they need to. + */ + boolean reportRegionStateTransition() throws IOException { + return this.rss.reportRegionStateTransition( + new RegionStateTransitionContext(TransitionCode.CLOSED, HConstants.NO_SEQNUM, + getClosePID(), -1, this.regionInfo)); + } + + /** + * @return PID of Close to report to Master; default is Procedure.NO_PROC_ID. + */ + long getClosePID() { + return Procedure.NO_PROC_ID; + } + + @Override protected void handleException(Throwable t) { + LOG.warn("Fatal error occurred while closing region {}, aborting...", + this.regionInfo.getRegionNameAsString(), t); + this.rss.abort("Failed to close region " + this.regionInfo.getRegionNameAsString() + + " and can not recover", t); + } + + RegionInfo getRegionInfo() { + return this.regionInfo; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java index 0fc8edc47a..096b9b80e2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -20,22 +20,18 @@ package org.apache.hadoop.hbase.regionserver.handler; import edu.umd.cs.findbugs.annotations.Nullable; import java.io.IOException; import java.util.concurrent.TimeUnit; -import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.executor.EventHandler; import org.apache.hadoop.hbase.executor.EventType; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.Region; -import org.apache.hadoop.hbase.regionserver.RegionServerServices.RegionStateTransitionContext; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; - /** * Handles closing of a region on a region server. *
@@ -44,11 +40,11 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto * with the zk less assignment for 1.x, otherwise it is not possible to do rolling upgrade. */ @InterfaceAudience.Private -public class UnassignRegionHandler extends EventHandler { +public final class UnassignRegionHandler extends EventHandler { private static final Logger LOG = LoggerFactory.getLogger(UnassignRegionHandler.class); - private final String encodedName; + private final String encodedRegionName; private final long closeProcId; // If true, the hosting server is aborting. Region close process is different @@ -56,18 +52,25 @@ public class UnassignRegionHandler extends EventHandler { // TODO: not used yet, we still use the old CloseRegionHandler when aborting private final boolean abort; - private final ServerName destination; - private final RetryCounter retryCounter; - public UnassignRegionHandler(HRegionServer server, String encodedName, long closeProcId, - boolean abort, @Nullable ServerName destination, EventType eventType) { + /** + * The target the Region is going to after successful close; usually null as it depends on + * context whether this is passed in or not. + */ + private final ServerName destination; + + /** + * @see #create(HRegionServer, String, long, boolean, ServerName) + */ + private UnassignRegionHandler(HRegionServer server, String encodedRegionName, long closeProcId, + boolean abort, @Nullable ServerName destination, EventType eventType) { super(server, eventType); - this.encodedName = encodedName; + this.encodedRegionName = encodedRegionName; this.closeProcId = closeProcId; this.abort = abort; - this.destination = destination; this.retryCounter = HandlerUtil.getRetryCounter(); + this.destination = destination; } private HRegionServer getServer() { @@ -76,75 +79,75 @@ public class UnassignRegionHandler extends EventHandler { @Override public void process() throws IOException { - HRegionServer rs = getServer(); - byte[] encodedNameBytes = Bytes.toBytes(encodedName); - Boolean previous = rs.getRegionsInTransitionInRS().putIfAbsent(encodedNameBytes, Boolean.FALSE); - if (previous != null) { - if (previous) { - // This could happen as we will update the region state to OPEN when calling - // reportRegionStateTransition, so the HMaster will think the region is online, before we - // actually open the region, as reportRegionStateTransition is part of the opening process. - long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); - LOG.warn("Received CLOSE for the region: {}, which we are already " + - "trying to OPEN. try again after {}ms", encodedName, backoff); - rs.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS); - } else { - LOG.info("Received CLOSE for the region: {}, which we are already trying to CLOSE," + - " but not completed yet", encodedName); + HRegionServer rss = getServer(); + try { + byte[] encodedRegionNameAsBytes = Bytes.toBytes(this.encodedRegionName); + Boolean previous = rss.getRegionsInTransitionInRS(). + putIfAbsent(encodedRegionNameAsBytes, Boolean.FALSE); + if (previous != null) { + if (previous) { + // This could happen as we will update the region state to OPEN when calling + // reportRegionStateTransition, so the HMaster will think the region is online, before + // we actually open the region, as reportRegionStateTransition is part of the opening + // process. 
+ long backoff = this.retryCounter.getBackoffTimeAndIncrementAttempts(); + LOG.warn("Received CLOSE for {} which is being OPENED; try again after {}ms", + encodedRegionNameAsBytes, backoff); + rss.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS); + } else { + LOG.info("Received CLOSE for {}, which is CLOSING but not complete yet", + encodedRegionNameAsBytes); + } + return; } - return; - } - HRegion region = rs.getRegion(encodedName); - if (region == null) { - LOG.debug( - "Received CLOSE for a region {} which is not online, and we're not opening/closing.", - encodedName); - rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE); - return; - } - String regionName = region.getRegionInfo().getEncodedName(); - LOG.info("Close {}", regionName); - if (region.getCoprocessorHost() != null) { - // XXX: The behavior is a bit broken. At master side there is no FAILED_CLOSE state, so if - // there are exception thrown from the CP, we can not report the error to master, and if here - // we just return without calling reportRegionStateTransition, the TRSP at master side will - // hang there for ever. So here if the CP throws an exception out, the only way is to abort - // the RS... - region.getCoprocessorHost().preClose(abort); - } - if (region.close(abort) == null) { - // XXX: Is this still possible? The old comment says about split, but now split is done at - // master side, so... - LOG.warn("Can't close region {}, was already closed during close()", regionName); - rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE); - return; - } - rs.removeRegion(region, destination); - if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED, - HConstants.NO_SEQNUM, closeProcId, -1, region.getRegionInfo()))) { - throw new IOException("Failed to report close to master: " + regionName); + HRegion region = rss.getRegion(this.encodedRegionName); + // Can't unassign a Region that is not online. + if (region == null) { + LOG.debug("Received CLOSE for {} which is not online, and we're not opening/closing.", + this.encodedRegionName); + // Remove what we added above. + rss.getRegionsInTransitionInRS().remove(encodedRegionNameAsBytes, Boolean.FALSE); + return; + } + CloseRegionHandler crh = new CloseRegionHandler(rss, region.getRegionInfo(), this.abort, + this.destination) { + @Override boolean reportRegionStateTransition() throws IOException { + boolean b = super.reportRegionStateTransition(); + if (!b) { + String name = getRegionInfo().getRegionNameAsString(); + throw new IOException("Failed report close of " + name + " to master"); + } + return b; + } + + @Override long getClosePID() { + return UnassignRegionHandler.this.closeProcId; + } + }; + crh.process(); + } finally { + // Cache the close region procedure id + rss.finishRegionProcedure(closeProcId); } - // Cache the close region procedure id after report region transition succeed. 
- rs.finishRegionProcedure(closeProcId); - rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE); - LOG.info("Closed {}", regionName); } @Override protected void handleException(Throwable t) { - LOG.warn("Fatal error occurred while closing region {}, aborting...", encodedName, t); - getServer().abort("Failed to close region " + encodedName + " and can not recover", t); + LOG.warn("Fatal error occurred while closing {}, aborting...", this.encodedRegionName, t); + getServer().abort("Failed to close " + this.encodedRegionName, t); } + /** + * @return Null if encodedName is not online + */ public static UnassignRegionHandler create(HRegionServer server, String encodedName, - long closeProcId, boolean abort, @Nullable ServerName destination) { + long closeProcId, boolean abort, @Nullable ServerName destination) { // Just try our best to determine whether it is for closing meta. It is not the end of the world // if we put the handler into a wrong executor. Region region = server.getRegion(encodedName); - EventType eventType = - region != null && region.getRegionInfo().isMetaRegion() ? EventType.M_RS_CLOSE_META - : EventType.M_RS_CLOSE_REGION; - return new UnassignRegionHandler(server, encodedName, closeProcId, abort, destination, - eventType); + EventType eventType = region != null && region.getRegionInfo().isMetaRegion()? + EventType.M_RS_CLOSE_META: EventType.M_RS_CLOSE_REGION; + return new UnassignRegionHandler(server, encodedName, closeProcId, abort, + destination, eventType); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java index cfa6f75fae..9efd738407 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/JVMClusterUtil.java @@ -308,18 +308,21 @@ public class JVMClusterUtil { if (t.isAlive()) { atLeastOneLiveServer = true; try { - LOG.warn("RegionServerThreads remaining, give one more chance before interrupting"); + LOG.warn("RegionServerThreads remaining, give one more chance before " + + "interrupting; joining " + t.getRegionServer().getServerName() + " for a second"); t.join(1000); } catch (InterruptedException e) { wasInterrupted = true; } } } - if (!atLeastOneLiveServer) break; + if (!atLeastOneLiveServer) { + break; + } for (RegionServerThread t : regionservers) { if (t.isAlive()) { LOG.warn("RegionServerThreads taking too long to stop, interrupting; thread dump " + - "if > 3 attempts: i=" + i); + "if > 3 attempts: i=" + i + "; alive=" + t.getRegionServer().getServerName()); if (i > 3) { Threads.printThreadInfo(System.out, "Thread dump " + t.getName()); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterAbortAndRSGotKilled.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterAbortAndRSGotKilled.java index 3df49290f6..0e10968522 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterAbortAndRSGotKilled.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterAbortAndRSGotKilled.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase.master; +import static org.junit.Assert.assertNotEquals; import java.io.IOException; import java.util.Optional; import java.util.concurrent.CountDownLatch; @@ -62,12 +63,13 @@ public class TestMasterAbortAndRSGotKilled { private static CountDownLatch countDownLatch = new CountDownLatch(1); private static byte[] CF = Bytes.toBytes("cf"); + private static int NODES = 2; @BeforeClass public static void setUp() throws Exception { UTIL.getConfiguration().setStrings(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, DelayCloseCP.class.getName()); - UTIL.startMiniCluster(3); + UTIL.startMiniCluster(NODES); UTIL.getAdmin().balancerSwitch(false, true); UTIL.createTable(TABLE_NAME, CF); UTIL.waitTableAvailable(TABLE_NAME); @@ -99,14 +101,16 @@ public class TestMasterAbortAndRSGotKilled { UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor() .submitProcedure(moveRegionProcedure); countDownLatch.await(); - UTIL.getMiniHBaseCluster().stopMaster(0); - UTIL.getMiniHBaseCluster().startMaster(); + JVMClusterUtil.MasterThread oldMaster = UTIL.getMiniHBaseCluster().stopMaster(0); + final JVMClusterUtil.MasterThread newMaster = UTIL.getMiniHBaseCluster().startMaster(); + assertNotEquals(oldMaster.getMaster().getServerName().toString(), + newMaster.getMaster().getServerName().toString()); // wait until master initialized - UTIL.waitFor(30000, () -> UTIL.getMiniHBaseCluster().getMaster() != null && - UTIL.getMiniHBaseCluster().getMaster().isInitialized()); - Assert.assertTrue("Should be 3 RS after master restart", - UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 3); - + UTIL.waitFor(30000, () -> newMaster.getMaster().isInitialized()); + Assert.assertTrue("Should be " + NODES + " RS after master restart", + UTIL.getMiniHBaseCluster().getLiveRegionServerThreads().size() == NODES); + // We don't check the merge completes? Should it? Waiting on it here doesn't work. + // Is it supposed to? I'd think so? St.Ack <= 03/13/2020 } public static class DelayCloseCP implements RegionCoprocessor, RegionObserver { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java index 77e4079d09..dbbcc1cdd6 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java @@ -249,9 +249,6 @@ public class TestRegionServerNoMaster { closeRegionNoZK(); checkRegionIsClosed(HTU, getRS(), hri); - // Let do the initial steps, without having a handler - getRS().getRegionsInTransitionInRS().put(hri.getEncodedNameAsBytes(), Boolean.TRUE); - // That's a close without ZK. AdminProtos.CloseRegionRequest crr = ProtobufUtil.buildCloseRegionRequest(getRS().getServerName(), regionName); @@ -261,10 +258,6 @@ public class TestRegionServerNoMaster { } catch (org.apache.hbase.thirdparty.com.google.protobuf.ServiceException expected) { } - // The state in RIT should have changed to close - Assert.assertEquals(Boolean.FALSE, getRS().getRegionsInTransitionInRS().get( - hri.getEncodedNameAsBytes())); - // Let's start the open handler TableDescriptor htd = getRS().tableDescriptors.get(hri.getTable()); -- 2.19.1
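
For readers following the change: the core contract this patch tightens is that a close is registered in HRegionServer#regionsInTransitionInRS (via putIfAbsent) before the close handler is submitted, and the handler then removes that entry whether the close succeeds or fails (for example because the server is aborting). Below is a minimal, self-contained sketch of that register/deregister shape; the class and method names are simplified stand-ins for illustration only, not HBase API.

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    public class RegionCloseContractSketch {
      // TRUE means an open is in progress, FALSE means a close is in progress,
      // mirroring HRegionServer#regionsInTransitionInRS.
      private final ConcurrentMap<String, Boolean> regionsInTransition = new ConcurrentHashMap<>();
      private final ExecutorService executor = Executors.newSingleThreadExecutor();

      /** "RegionServer" side: register the close, then hand off to the "handler". */
      void requestClose(String encodedRegionName, Runnable closeWork) {
        Boolean previous = regionsInTransition.putIfAbsent(encodedRegionName, Boolean.FALSE);
        if (previous != null) {
          // Already opening or closing; the real code retries, cancels the open, or ignores.
          return;
        }
        executor.execute(() -> {
          try {
            closeWork.run(); // May throw, e.g. because the server is aborting.
          } finally {
            // "Handler" side: deregister on success or failure so no dangling FALSE entry
            // is left behind (the condition the flakey teardown was tripping over).
            regionsInTransition.remove(encodedRegionName, Boolean.FALSE);
          }
        });
      }
    }

In the patch itself the registration happens in HRegionServer#closeRegion and UnassignRegionHandler#process, while the deregister lives in the finally blocks of CloseRegionHandler, UnassignRegionHandler, and AssignRegionHandler.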