From 8df0b1172348d754183d2eb294b8e21b9709bc2c Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Mon, 8 Oct 2018 16:54:33 -0700 Subject: [PATCH] HBASE-21259 [amv2] Revived deadservers; recreated serverstatenode Remove a bunch of places where we create ServerStateNode. We were creating a SSN even though the server was long dead and processed. The revived SSN was messing up the little dance we do unassigning procedures. In particular, in UnassignProcedure, the check for a dead server inside in isLogSplittingDone returns true -- we can proceed because server is dead -- fails if an SSN exists. We were creating SSN when we didn't need it as well as inadvertently. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Print serverstatenode when reporting expiration. Helps debugging. Make moveFromOnlineToDeadServers return if server online or not. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java Make do w/ serverName in place of serverNode in a few places. In waitServerReportEvent, create a ServerStateNode if none though we should not have to at this point; to figure out later: TODO. addRegionToServer no longer automatically calls create SSN so do explicit create processing load meta and the region is OPEN so we can associate OPEN regions with the SSN. Do not schedule an SCP if server is not online, not in fs, and not in dead servers. No point (and there may be cases where server is long gone but hbase:meta still refers to it though it has not carried regions in a long time; running an assign/unassign against such a server will fail because it is not there but SCP won't clean up the outstanding hung RPC because our region is not on the long-gone server). M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java Just cleanup. Make it so addRegionToServer and remove can deal if no SSN. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java Add isWALDirectoryNameWithWALS utility. --- .../hadoop/hbase/master/MasterFileSystem.java | 6 +- .../hadoop/hbase/master/MasterWalManager.java | 65 +++++++++++-- .../apache/hadoop/hbase/master/ServerManager.java | 53 +++++++++-- .../hadoop/hbase/master/SplitLogManager.java | 4 +- .../hbase/master/assignment/AssignmentManager.java | 62 +++++++------ .../hbase/master/assignment/RegionStates.java | 30 ++++-- .../assignment/RegionTransitionProcedure.java | 10 +- .../hbase/master/assignment/UnassignProcedure.java | 9 +- .../master/balancer/RegionLocationFinder.java | 6 +- .../hadoop/hbase/master/TestMasterWALManager.java | 103 +++++++++++++++++++++ 10 files changed, 285 insertions(+), 63 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWALManager.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java index 864be02900..a06a369cc9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java @@ -199,14 +199,16 @@ public class MasterFileSystem { } /** - * @return HBase root dir. + * @return HBase root directory. */ public Path getRootDir() { return this.rootdir; } /** - * @return HBase root log dir. + * @return HBase WAL root directory, usually the same as {@link #getRootDir()} but can be + * different if hfiles on one fs and WALs on another. The 'WALs' dir gets made underneath + * the root dir returned here; i.e. this is '/hbase', not '/hbase/WALs'. */ public Path getWALRootDir() { return this.walRootDir; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java index 2b1a81f357..049c561042 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java @@ -18,6 +18,7 @@ */ package org.apache.hadoop.hbase.master; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -77,6 +78,11 @@ public class MasterWalManager { // The Path to the old logs dir private final Path oldLogDir; + + /** + * This is the hbase rootdir. + * We'll put the WALs under this dir. + */ private final Path rootDir; // create the split log lock @@ -182,11 +188,18 @@ public class MasterWalManager { }).filter(s -> s != null).collect(Collectors.toSet()); } + /** + * @return Returns the WALs dir under rootDir + */ + Path getWALDirPath() { + return new Path(this.rootDir, HConstants.HREGION_LOGDIR_NAME); + } + /** * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME. */ public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException { - Path walDirPath = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME); + Path walDirPath = getWALDirPath(); FileStatus[] walDirForServerNames = FSUtils.listStatus(fs, walDirPath, filter); return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames; } @@ -205,7 +218,7 @@ public class MasterWalManager { WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT); Set serverNames = new HashSet<>(); - Path logsDirPath = new Path(this.rootDir, HConstants.HREGION_LOGDIR_NAME); + Path logsDirPath = getWALDirPath(); do { if (services.isStopped()) { @@ -286,10 +299,50 @@ public class MasterWalManager { splitLog(serverNames, META_FILTER); } + /** + * @return True if a WAL directory exists (will return true also if WALs found in + * servername'-splitting' too). + */ + boolean isWALDirectoryNameWithWALs(ServerName serverName) { + FileStatus [] fss = null; + try { + // 'startsWith' will also return dirs ending in AbstractFSWALProvider.SPLITTING_EXT + fss = getWALDirPaths(p -> p.getName().startsWith(serverName.toString())); + } catch (IOException ioe) { + LOG.warn("{}", serverName, ioe); + // Something wrong reading from fs. Returning 'true' to bring on more fs activity + return true; + } + if (fss != null) { + for (FileStatus fileStatus: fss) { + if (fileStatus.isDirectory()) { + // Not testing for existence; presuming exists if we got it out of getWALDirPaths + // listing. I used to test for presence of WAL and return false if empty but it can be + // empty if a clean shutdown. Even clean shutdowns need to be recovered so the meta + // and namespace assigns get triggered. + return true; + } + } + } + return false; + } + + /** + * Depends on current FS Layout! + * @return The Path to the WAL directory for serverName + */ + Path getWALDirectoryName(ServerName serverName) { + return new Path(this.rootDir, AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); + } + + /** + * Finds WAL dirs for serverNames and renames them with '-splitting' suffix. + * @return List of '-splitting' directories that pertain to serverNames + */ @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification= "We only release this lock when we set it. Updates to code that uses it should verify use " + "of the guard boolean.") - private List getLogDirs(final Set serverNames) throws IOException { + List createAndGetLogDirs(final Set serverNames) throws IOException { List logDirs = new ArrayList<>(); boolean needReleaseLock = false; if (!this.services.isInitialized()) { @@ -300,8 +353,8 @@ public class MasterWalManager { } try { for (ServerName serverName : serverNames) { - Path logDir = new Path(this.rootDir, - AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); + Path logDir = getWALDirectoryName(serverName); + // This adds the -splitting suffix to logDir. Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); // Rename the directory so a rogue RS doesn't create more WALs if (fs.exists(logDir)) { @@ -341,7 +394,7 @@ public class MasterWalManager { */ public void splitLog(final Set serverNames, PathFilter filter) throws IOException { long splitTime = 0, splitLogSize = 0; - List logDirs = getLogDirs(serverNames); + List logDirs = createAndGetLogDirs(serverNames); splitLogManager.handleDeadWorkers(serverNames); splitTime = EnvironmentEdgeManager.currentTime(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index ee61747532..7f9f441a66 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -34,23 +34,30 @@ import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Predicate; + +import jdk.nashorn.internal.ir.Assignment; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ClockOutOfSyncException; import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.NotServingRegionException; import org.apache.hadoop.hbase.RegionMetrics; +import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerMetrics; import org.apache.hadoop.hbase.ServerMetricsBuilder; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.YouAreDeadException; import org.apache.hadoop.hbase.client.ClusterConnection; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RetriesExhaustedException; import org.apache.hadoop.hbase.ipc.HBaseRpcController; import org.apache.hadoop.hbase.ipc.RpcControllerFactory; +import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.monitoring.MonitoredTask; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.yetus.audience.InterfaceAudience; @@ -247,8 +254,7 @@ public class ServerManager { } @VisibleForTesting - public void regionServerReport(ServerName sn, - ServerMetrics sl) throws YouAreDeadException { + public void regionServerReport(ServerName sn, ServerMetrics sl) throws YouAreDeadException { checkIsDead(sn, "REPORT"); if (null == this.onlineServers.replace(sn, sl)) { // Already have this host+port combo and its just different start code? @@ -528,6 +534,16 @@ public class ServerManager { return ZKUtil.listChildrenNoWatch(zkw, zkw.getZNodePaths().rsZNode); } + /** + * @return True if we should expire serverName + */ + boolean expire(ServerName serverName) { + return this.onlineServers.containsKey(serverName) || + this.deadservers.isDeadServer(serverName) || + this.master.getAssignmentManager().getRegionStates().getServerNode(serverName) != null || + this.master.getMasterWalManager().isWALDirectoryNameWithWALs(serverName); + } + /** * Expire the passed server. Add it to list of dead servers and queue a shutdown processing. * @return True if we queued a ServerCrashProcedure else false if we did not (could happen for @@ -543,11 +559,24 @@ public class ServerManager { } return false; } + // Check if we should bother running an expire! + if (!expire(serverName)) { + LOG.info("Skipping expire; {} is not online, not in deadservers, not in fs -- presuming " + + "long gone server instance!", serverName); + return false; + } + if (this.deadservers.isDeadServer(serverName)) { - LOG.warn("Expiration called on {} but crash processing already in progress", serverName); + LOG.warn("Expiration called on {} but crash processing in progress, serverStateNode={}", + serverName, + this.master.getAssignmentManager().getRegionStates().getServerNode(serverName)); return false; } - moveFromOnlineToDeadServers(serverName); + + if (!moveFromOnlineToDeadServers(serverName)) { + LOG.info("Expiration called on {} but NOT online", serverName); + // Continue. + } // If cluster is going down, yes, servers are going to be expiring; don't // process as a dead server @@ -571,20 +600,24 @@ public class ServerManager { return true; } + /** + * @return Returns true if was online. + */ @VisibleForTesting - public void moveFromOnlineToDeadServers(final ServerName sn) { + public boolean moveFromOnlineToDeadServers(final ServerName sn) { + boolean online = false; synchronized (onlineServers) { - if (!this.onlineServers.containsKey(sn)) { - LOG.trace("Expiration of {} but server not online", sn); - } // Remove the server from the known servers lists and update load info BUT // add to deadservers first; do this so it'll show in dead servers list if // not in online servers list. this.deadservers.add(sn); - this.onlineServers.remove(sn); - onlineServers.notifyAll(); + if (this.onlineServers.remove(sn) != null) { + online = true; + onlineServers.notifyAll(); + } } this.rsAdmins.remove(sn); + return online; } /* diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java index 4d977d3427..414fc98044 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java @@ -452,7 +452,7 @@ public class SplitLogManager { } deadWorkers.add(workerName); } - LOG.info("Dead splitlog worker {}", workerName); + LOG.debug("Dead splitlog worker {}", workerName); } void handleDeadWorkers(Set serverNames) { @@ -462,7 +462,7 @@ public class SplitLogManager { } deadWorkers.addAll(serverNames); } - LOG.info("dead splitlog workers " + serverNames); + LOG.debug("Dead splitlog workers {}", serverNames); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 04f888c971..49fe93a0ae 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -824,6 +824,10 @@ public class AssignmentManager implements ServerListener { return builder.build(); } + /** + * Called when the RegionServer wants to report a Procedure transition. + * Ends up calling {@link #reportTransition(RegionStateNode, ServerName, TransitionCode, long)} + */ private void updateRegionTransition(final ServerName serverName, final TransitionCode state, final RegionInfo regionInfo, final long seqId) throws PleaseHoldException, UnexpectedStateException { @@ -842,8 +846,7 @@ public class AssignmentManager implements ServerListener { serverName, regionNode, state)); } - final ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); - if (!reportTransition(regionNode, serverNode, state, seqId)) { + if (!reportTransition(regionNode, serverName, state, seqId)) { // Don't log WARN if shutting down cluster; during shutdown. Avoid the below messages: // 2018-08-13 10:45:10,551 WARN ...AssignmentManager: No matching procedure found for // rit=OPEN, location=ve0538.halxg.cloudera.com,16020,1533493000958, @@ -862,10 +865,9 @@ public class AssignmentManager implements ServerListener { } // FYI: regionNode is sometimes synchronized by the caller but not always. - private boolean reportTransition(final RegionStateNode regionNode, - final ServerStateNode serverNode, final TransitionCode state, final long seqId) + private boolean reportTransition(final RegionStateNode regionNode, ServerName serverName, + final TransitionCode state, final long seqId) throws UnexpectedStateException { - final ServerName serverName = serverNode.getServerName(); synchronized (regionNode) { final RegionTransitionProcedure proc = regionNode.getProcedure(); if (proc == null) return false; @@ -954,8 +956,8 @@ public class AssignmentManager implements ServerListener { collect(Collectors.toList())); } - final ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); - + // Make sure there is a ServerStateNode for this server that just checked in. + ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); synchronized (serverNode) { if (!serverNode.isInState(ServerState.ONLINE)) { LOG.warn("Got a report from a server result in state " + serverNode.getState()); @@ -968,7 +970,7 @@ public class AssignmentManager implements ServerListener { LOG.trace("no online region found on " + serverName); } else if (!isMetaLoaded()) { // if we are still on startup, discard the report unless is from someone holding meta - checkOnlineRegionsReportForMeta(serverNode, regionNames); + checkOnlineRegionsReportForMeta(serverName, regionNames); } else { // The Heartbeat updates us of what regions are only. check and verify the state. checkOnlineRegionsReport(serverNode, regionNames); @@ -978,8 +980,7 @@ public class AssignmentManager implements ServerListener { wakeServerReportEvent(serverNode); } - void checkOnlineRegionsReportForMeta(final ServerStateNode serverNode, - final Set regionNames) { + void checkOnlineRegionsReportForMeta(final ServerName serverName, final Set regionNames) { try { for (byte[] regionName: regionNames) { final RegionInfo hri = getMetaRegionFromName(regionName); @@ -993,18 +994,16 @@ public class AssignmentManager implements ServerListener { final RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(hri); LOG.info("META REPORTED: " + regionNode); - if (!reportTransition(regionNode, serverNode, TransitionCode.OPENED, 0)) { - LOG.warn("META REPORTED but no procedure found (complete?); set location=" + - serverNode.getServerName()); - regionNode.setRegionLocation(serverNode.getServerName()); + if (!reportTransition(regionNode, serverName, TransitionCode.OPENED, 0)) { + LOG.warn("META REPORTED but no procedure found (complete?); set location={}", serverName); + regionNode.setRegionLocation(serverName); } else if (LOG.isTraceEnabled()) { LOG.trace("META REPORTED: " + regionNode); } } } catch (UnexpectedStateException e) { - final ServerName serverName = serverNode.getServerName(); LOG.warn("KILLING " + serverName + ": " + e.getMessage()); - killRegionServer(serverNode); + killRegionServer(serverName); } } @@ -1026,7 +1025,8 @@ public class AssignmentManager implements ServerListener { " but state has otherwise."); } else if (regionNode.isInState(State.OPENING)) { try { - if (!reportTransition(regionNode, serverNode, TransitionCode.OPENED, 0)) { + if (!reportTransition(regionNode, serverNode.getServerName(), + TransitionCode.OPENED, 0)) { LOG.warn(regionNode.toString() + " reported OPEN on server=" + serverName + " but state has otherwise AND NO procedure is running"); } @@ -1048,17 +1048,18 @@ public class AssignmentManager implements ServerListener { } } catch (UnexpectedStateException e) { LOG.warn("Killing " + serverName + ": " + e.getMessage()); - killRegionServer(serverNode); + killRegionServer(serverName); throw (YouAreDeadException)new YouAreDeadException(e.getMessage()).initCause(e); } } protected boolean waitServerReportEvent(ServerName serverName, Procedure proc) { - final ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); - if (serverNode == null) { - LOG.warn("serverName=null; {}", proc); + ServerStateNode ssn = this.regionStates.getServerNode(serverName); + if (ssn == null) { + LOG.warn("Why is ServerStateNode for {} empty at this point? Creating...", serverName); + ssn = this.regionStates.getOrCreateServer(serverName); } - return serverNode.getReportEvent().suspendIfNotReady(proc); + return ssn.getReportEvent().suspendIfNotReady(proc); } protected void wakeServerReportEvent(final ServerStateNode serverNode) { @@ -1269,9 +1270,9 @@ public class AssignmentManager implements ServerListener { regionNode.setLastHost(lastHost); regionNode.setRegionLocation(regionLocation); regionNode.setOpenSeqNum(openSeqNum); - if (localState == State.OPEN) { assert regionLocation != null : "found null region location for " + regionNode; + regionStates.getOrCreateServer(regionNode.getRegionLocation()); regionStates.addRegionToServer(regionNode); } else if (localState == State.OFFLINE || regionInfo.isOffline()) { regionStates.addToOfflineRegions(regionNode); @@ -1281,9 +1282,18 @@ public class AssignmentManager implements ServerListener { // The region is CLOSED and the table is DISABLED/ DISABLING, there is nothing to // schedule; the region is inert. } else { - // These regions should have a procedure in replay + // This is region in CLOSING or OPENING state. + // These regions should have a procedure in replay. + // If they don't, then they will show as STUCK after a while because of the below + // registration and will need intervention by operator to fix. Add them to a server + // even if the server is not around so a SCP cleans them up. + if (regionLocation != null) { + regionStates.getOrCreateServer(regionLocation); + } regionStates.addRegionInTransition(regionNode, null); } + } else { + LOG.info("RIT {}", regionNode); } } } @@ -1821,8 +1831,8 @@ public class AssignmentManager implements ServerListener { wakeServerReportEvent(serverNode); } - private void killRegionServer(final ServerStateNode serverNode) { - master.getServerManager().expireServer(serverNode.getServerName()); + private void killRegionServer(final ServerName serverName) { + master.getServerManager().expireServer(serverName); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index 40e82f9534..8f6a2328a8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -436,7 +436,8 @@ public class RegionStates { @Override public String toString() { - return String.format("ServerStateNode(%s)", getServerName()); + return String.format("name=%s, state=%s, regionCount=%d", getServerName(), getState(), + getRegionCount()); } } @@ -1109,9 +1110,10 @@ public class RegionStates { * you could mess up online server accounting. TOOD: Review usage and convert * to {@link #getServerNode(ServerName)} where we can. */ - public ServerStateNode getOrCreateServer(final ServerName serverName) { + ServerStateNode getOrCreateServer(final ServerName serverName) { ServerStateNode node = serverMap.get(serverName); if (node == null) { + LOG.trace("CREATING! {}", serverName, new RuntimeException("WHERE AM I?")); node = new ServerStateNode(serverName); ServerStateNode oldNode = serverMap.putIfAbsent(serverName, node); node = oldNode != null ? oldNode : node; @@ -1123,7 +1125,7 @@ public class RegionStates { serverMap.remove(serverName); } - protected ServerStateNode getServerNode(final ServerName serverName) { + public ServerStateNode getServerNode(final ServerName serverName) { return serverMap.get(serverName); } @@ -1137,10 +1139,18 @@ public class RegionStates { return numServers == 0 ? 0.0: (double)totalLoad / (double)numServers; } - public ServerStateNode addRegionToServer(final RegionStateNode regionNode) { - ServerStateNode serverNode = getOrCreateServer(regionNode.getRegionLocation()); - serverNode.addRegion(regionNode); - return serverNode; + /** + * Add reference to region to serverstatenode. + * DOES NOT AUTO-CREATE ServerStateNode instance. + * @return Return serverstatenode or null if none. + */ + ServerStateNode addRegionToServer(final RegionStateNode regionNode) { + ServerStateNode ssn = getServerNode(regionNode.getRegionLocation()); + if (ssn == null) { + return ssn; + } + ssn.addRegion(regionNode); + return ssn; } public boolean isReplicaAvailableForRegion(final RegionInfo info) { @@ -1165,8 +1175,10 @@ public class RegionStates { public ServerStateNode removeRegionFromServer(final ServerName serverName, final RegionStateNode regionNode) { - ServerStateNode serverNode = getOrCreateServer(serverName); - serverNode.removeRegion(regionNode); + ServerStateNode serverNode = getServerNode(serverName); + if (serverNode != null) { + serverNode.removeRegion(regionNode); + } return serverNode; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java index fb6090de84..84cb707786 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java @@ -241,7 +241,7 @@ public abstract class RegionTransitionProcedure public synchronized void remoteCallFailed(final MasterProcedureEnv env, final ServerName serverName, final IOException exception) { final RegionStateNode regionNode = getRegionState(env); - LOG.warn("Remote call failed {}; {}", regionNode.toShortString(), this, exception); + LOG.warn("Remote call failed {} {}", this, regionNode.toShortString(), exception); if (remoteCallFailed(env, regionNode, exception)) { // NOTE: This call to wakeEvent puts this Procedure back on the scheduler. // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond @@ -262,7 +262,7 @@ public abstract class RegionTransitionProcedure */ protected boolean addToRemoteDispatcher(final MasterProcedureEnv env, final ServerName targetServer) { - LOG.info("Dispatch {}; {}", this, getRegionState(env).toShortString()); + LOG.info("Dispatch {}", this); // Put this procedure into suspended mode to wait on report of state change // from remote regionserver. Means Procedure associated ProcedureEvent is marked not 'ready'. @@ -424,9 +424,9 @@ public abstract class RegionTransitionProcedure // There is no rollback for assignment unless we cancel the operation by // dropping/disabling the table. - LOG.warn("Unhandled state {}; no rollback for assignment! Doing NOTHING!" + - " May need manual intervention. TODO: IS THIS WORKING? {}", - transitionState, this); + LOG.warn("Unhandled state " + transitionState + + "; there is no rollback for assignment unless we cancel the operation by " + + "dropping/disabling the table; doing nothing; {}", this); } protected abstract boolean isRollbackSupported(final RegionTransitionState state); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java index 8a391daa04..589b732d4f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java @@ -137,8 +137,10 @@ public class UnassignProcedure extends RegionTransitionProcedure { throws IOException { UnassignRegionStateData.Builder state = UnassignRegionStateData.newBuilder() .setTransitionState(getTransitionState()) - .setHostingServer(ProtobufUtil.toServerName(this.hostingServer)) .setRegionInfo(ProtobufUtil.toRegionInfo(getRegionInfo())); + if (this.hostingServer != null) { + state.setHostingServer(ProtobufUtil.toServerName(this.hostingServer)); + } if (this.destinationServer != null) { state.setDestinationServer(ProtobufUtil.toServerName(destinationServer)); } @@ -161,9 +163,10 @@ public class UnassignProcedure extends RegionTransitionProcedure { serializer.deserialize(UnassignRegionStateData.class); setTransitionState(state.getTransitionState()); setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo())); - this.hostingServer = ProtobufUtil.toServerName(state.getHostingServer()); // The 'force' flag is the override flag in unassign. setOverride(state.getForce()); + this.hostingServer = + state.hasHostingServer()? ProtobufUtil.toServerName(state.getHostingServer()): null; if (state.hasDestinationServer()) { this.destinationServer = ProtobufUtil.toServerName(state.getDestinationServer()); } @@ -259,6 +262,7 @@ public class UnassignProcedure extends RegionTransitionProcedure { // This exception comes from ServerCrashProcedure AFTER log splitting. Its a signaling // exception. SCP found this region as a RIT during its processing of the crash. Its call // into here says it is ok to let this procedure go complete. + LOG.info("Safe to let procedure move to next step; {}", this); return true; } if (exception instanceof NotServingRegionException) { @@ -333,6 +337,7 @@ public class UnassignProcedure extends RegionTransitionProcedure { // Return true; wake up the procedure so we can act on proceed. return true; } + LOG.info("Failed expiration and log splitting not done on {}", serverName); } // Return false so this procedure stays in suspended state. It will be woken up by the // ServerCrashProcedure that was scheduled when we called #expireServer above. SCP calls diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/RegionLocationFinder.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/RegionLocationFinder.java index fb7731fa75..f9448020d7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/RegionLocationFinder.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/RegionLocationFinder.java @@ -299,11 +299,14 @@ class RegionLocationFinder { } public void refreshAndWait(Collection hris) { - ArrayList> regionLocationFutures = new ArrayList<>(hris.size()); + ArrayList> regionLocationFutures = + new ArrayList<>(hris.size()); for (RegionInfo hregionInfo : hris) { regionLocationFutures.add(asyncGetBlockDistribution(hregionInfo)); } int index = 0; + LOG.info("Refreshing block distribution cache for {} regions (Can take a while on big cluster)", + hris.size()); for (RegionInfo hregionInfo : hris) { ListenableFuture future = regionLocationFutures .get(index); @@ -318,6 +321,7 @@ class RegionLocationFinder { } index++; } + LOG.info("Finished refreshing block distribution cache for {} regions", hris.size()); } // For test diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWALManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWALManager.java new file mode 100644 index 0000000000..97c1bf4f85 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWALManager.java @@ -0,0 +1,103 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master; + +import static junit.framework.TestCase.assertTrue; +import static org.junit.Assert.assertFalse; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.mockito.Mockito; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + + +@Category({MasterTests.class, SmallTests.class}) +public class TestMasterWALManager { + private static final Logger LOG = LoggerFactory.getLogger(TestMasterWALManager.class); + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestMasterWALManager.class); + + private static final HBaseTestingUtility HTU = new HBaseTestingUtility(); + + private MasterWalManager mwm; + private MasterServices masterServices; + + @Before + public void before() throws IOException { + MasterFileSystem mfs = Mockito.mock(MasterFileSystem.class); + Mockito.when(mfs.getWALFileSystem()).thenReturn(HTU.getTestFileSystem()); + Path walRootDir = HTU.createWALRootDir(); + Mockito.when(mfs.getWALRootDir()).thenReturn(walRootDir); + this.masterServices = Mockito.mock(MasterServices.class); + Mockito.when(this.masterServices.getConfiguration()).thenReturn(HTU.getConfiguration()); + Mockito.when(this.masterServices.getMasterFileSystem()).thenReturn(mfs); + Mockito.when(this.masterServices.getServerName()). + thenReturn(ServerName.parseServerName("master.example.org,0123,456")); + this.mwm = new MasterWalManager(this.masterServices); + } + + @Test + public void testIsWALDirectoryNameWithWALs() throws IOException { + ServerName sn = ServerName.parseServerName("x.example.org,1234,5678"); + assertFalse(this.mwm.isWALDirectoryNameWithWALs(sn)); + FileSystem walFS = this.masterServices.getMasterFileSystem().getWALFileSystem(); + Path dir = new Path(this.mwm.getWALDirPath(), sn.toString()); + assertTrue(walFS.mkdirs(dir)); + // Should be false because dir has nothing in it. + assertFalse(this.mwm.isWALDirectoryNameWithWALs(sn)); + Path wal = new Path(dir, "wal"); + FSDataOutputStream stream = walFS.create(wal); + stream.close(); + // Should be true because dir has something in it. + assertTrue(this.mwm.isWALDirectoryNameWithWALs(sn)); + // Make sure works when dir is SPLITTING + Set sns = new HashSet(); + sns.add(sn); + List paths = this.mwm.createAndGetLogDirs(sns); + assertTrue(this.mwm.isWALDirectoryNameWithWALs(sn)); + // Clear the wal from under the server dir and we should get false again. + boolean delete = false; + for (Path path: paths) { + Path p = new Path(path, wal.getName()); + if (walFS.exists(p)) { + assertTrue(walFS.delete(p, true)); + delete = true; + } + } + assertTrue(delete); + assertFalse(this.mwm.isWALDirectoryNameWithWALs(sn)); + } +} -- 2.16.3