From 9d6164684b2c6915655aa2364322cc83cc5d0f83 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 5 Sep 2018 15:26:16 -0700 Subject: [PATCH] HBASE-21155 Save on a few log strings and some churn in wal splitter by skipping out early if no logs in dir --- .../org/apache/hadoop/hbase/master/HMaster.java | 5 +- .../hadoop/hbase/master/MasterWalManager.java | 5 ++ .../hadoop/hbase/master/RegionServerTracker.java | 7 ++- .../apache/hadoop/hbase/master/ServerManager.java | 6 +-- .../hadoop/hbase/master/SplitLogManager.java | 54 ++++++++++++---------- 5 files changed, 46 insertions(+), 31 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 2a92104e7e..6e2c3b8b41 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -574,7 +574,7 @@ public class HMaster extends HRegionServer implements MasterServices { abort(error, t); } } - })); + }), getName() + ":becomeActiveMaster"); } // Fall in here even if we have been aborted. Need to run the shutdown services and // the super run call will do this for us. @@ -879,6 +879,9 @@ public class HMaster extends HRegionServer implements MasterServices { // Create Assignment Manager this.assignmentManager = new AssignmentManager(this); this.assignmentManager.start(); + // Start RegionServerTracker with listing of servers found with existing SCPs -- these should + // be registered in the deadServers set -- and with the list of servernames out on the + // filesystem that COULD BE 'alive' (we'll schedule SCPs for each and let SCP figure it out). 
this.regionServerTracker = new RegionServerTracker(zooKeeper, this, this.serverManager); this.regionServerTracker.start( procedureExecutor.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java index 2dc891872c..864e2af6c1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java @@ -143,6 +143,11 @@ public class MasterWalManager { return this.fsOk; } + /** + * @return Return listing of ServerNames found in the filesystem under the WAL directory + * that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these are already + * being split -- they cannot be 'alive'. + */ public Set getLiveServersFromWALDir() throws IOException { Path walDirPath = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME); FileStatus[] walDirForLiveServers = FSUtils.listStatus(fs, walDirPath, diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java index db46ef06e4..135fa82b3d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java @@ -111,13 +111,16 @@ public class RegionServerTracker extends ZKListener { * In this method, we will also construct the region server sets in {@link ServerManager}. If a * region server is dead between the crash of the previous master instance and the start of the * current master instance, we will schedule a SCP for it. 
This is done in - * {@link ServerManager#findOutDeadServersAndProcess(Set, Set)}, we call it here under the lock + * {@link ServerManager#findDeadServersAndProcess(Set, Set)}, we call it here under the lock * protection to prevent concurrency issues with server expiration operation. * @param deadServersFromPE the region servers which already have SCP associated. * @param liveServersFromWALDir the live region servers from wal directory. */ public void start(Set deadServersFromPE, Set liveServersFromWALDir) throws KeeperException, IOException { + LOG.info("Starting RegionServerTracker; {} have existing ServerCrashProcedures, {} " + + "possibly 'live' servers.", deadServersFromPE == null? 0: deadServersFromPE.size(), + liveServersFromWALDir == null? 0: liveServersFromWALDir.size()); watcher.registerListener(this); synchronized (this) { List servers = @@ -132,7 +135,7 @@ public class RegionServerTracker extends ZKListener { info.getVersionInfo().getVersion()) : ServerMetricsBuilder.of(serverName); serverManager.checkAndRecordNewServer(serverName, serverMetrics); } - serverManager.findOutDeadServersAndProcess(deadServersFromPE, liveServersFromWALDir); + serverManager.findDeadServersAndProcess(deadServersFromPE, liveServersFromWALDir); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 2b0cff1c15..ee61747532 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -313,10 +313,10 @@ public class ServerManager { *

* Must be called inside the initialization method of {@code RegionServerTracker} to avoid * concurrency issue. - * @param deadServersFromPE the region servers which already have SCP associated. + * @param deadServersFromPE the region servers which already have a SCP associated. * @param liveServersFromWALDir the live region servers from wal directory. */ - void findOutDeadServersAndProcess(Set deadServersFromPE, + void findDeadServersAndProcess(Set deadServersFromPE, Set liveServersFromWALDir) { deadServersFromPE.forEach(deadservers::add); liveServersFromWALDir.stream().filter(sn -> !onlineServers.containsKey(sn)) @@ -575,7 +575,7 @@ public class ServerManager { public void moveFromOnlineToDeadServers(final ServerName sn) { synchronized (onlineServers) { if (!this.onlineServers.containsKey(sn)) { - LOG.warn("Expiration of " + sn + " but server not online"); + LOG.trace("Expiration of {} but server not online", sn); } // Remove the server from the known servers lists and update load info BUT // add to deadservers first; do this so it'll show in dead servers list if diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java index 2e2f8bf47d..1e4e2ce17e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java @@ -174,7 +174,7 @@ public class SplitLogManager { } FileStatus[] logfiles = FSUtils.listStatus(fs, logDir, filter); if (logfiles == null || logfiles.length == 0) { - LOG.info(logDir + " is empty dir, no logs to split"); + LOG.info(logDir + " dir is empty, no logs to split."); } else { Collections.addAll(fileStatus, logfiles); } @@ -235,29 +235,33 @@ public class SplitLogManager { PathFilter filter) throws IOException { MonitoredTask status = TaskMonitor.get().createStatus("Doing distributed log split in " + logDirs + " for 
serverName=" + serverNames); - FileStatus[] logfiles = getFileList(logDirs, filter); - status.setStatus("Checking directory contents..."); - SplitLogCounters.tot_mgr_log_split_batch_start.increment(); - LOG.info("Started splitting " + logfiles.length + " logs in " + logDirs + - " for " + serverNames); - long t = EnvironmentEdgeManager.currentTime(); long totalSize = 0; - TaskBatch batch = new TaskBatch(); - for (FileStatus lf : logfiles) { - // TODO If the log file is still being written to - which is most likely - // the case for the last log file - then its length will show up here - // as zero. The size of such a file can only be retrieved after - // recover-lease is done. totalSize will be under in most cases and the - // metrics that it drives will also be under-reported. - totalSize += lf.getLen(); - String pathToLog = FSUtils.removeWALRootPath(lf.getPath(), conf); - if (!enqueueSplitTask(pathToLog, batch)) { - throw new IOException("duplicate log split scheduled for " + lf.getPath()); + TaskBatch batch = null; + long startTime = 0; + FileStatus[] logfiles = getFileList(logDirs, filter); + if (logfiles.length != 0) { + status.setStatus("Checking directory contents..."); + SplitLogCounters.tot_mgr_log_split_batch_start.increment(); + LOG.info("Started splitting " + logfiles.length + " logs in " + logDirs + + " for " + serverNames); + startTime = EnvironmentEdgeManager.currentTime(); + batch = new TaskBatch(); + for (FileStatus lf : logfiles) { + // TODO If the log file is still being written to - which is most likely + // the case for the last log file - then its length will show up here + // as zero. The size of such a file can only be retrieved after + // recover-lease is done. totalSize will be under in most cases and the + // metrics that it drives will also be under-reported. 
+ totalSize += lf.getLen(); + String pathToLog = FSUtils.removeWALRootPath(lf.getPath(), conf); + if (!enqueueSplitTask(pathToLog, batch)) { + throw new IOException("duplicate log split scheduled for " + lf.getPath()); + } } + waitForSplittingCompletion(batch, status); } - waitForSplittingCompletion(batch, status); - if (batch.done != batch.installed) { + if (batch != null && batch.done != batch.installed) { batch.isDead = true; SplitLogCounters.tot_mgr_log_split_batch_err.increment(); LOG.warn("error while splitting logs in " + logDirs + " installed = " + batch.installed @@ -285,10 +289,10 @@ public class SplitLogManager { } SplitLogCounters.tot_mgr_log_split_batch_success.increment(); } - String msg = - "finished splitting (more than or equal to) " + totalSize + " bytes in " + batch.installed - + " log files in " + logDirs + " in " - + (EnvironmentEdgeManager.currentTime() - t) + "ms"; + String msg = "Finished splitting (more than or equal to) " + totalSize + + " bytes in " + ((batch == null)? 0: batch.installed) + + " log files in " + logDirs + " in " + + ((startTime == 0)? startTime: (EnvironmentEdgeManager.currentTime() - startTime)) + "ms"; status.markComplete(msg); LOG.info(msg); return totalSize; @@ -448,7 +452,7 @@ public class SplitLogManager { } deadWorkers.add(workerName); } - LOG.info("dead splitlog worker " + workerName); + LOG.info("Dead splitlog worker {}", workerName); } void handleDeadWorkers(Set serverNames) { -- 2.16.3