From 4cc7950cc1996dbde8e770a26a867ed98bf0cd57 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 26 Sep 2018 21:22:46 -0700 Subject: [PATCH] HBASE-21242 [amv2] Miscellaneous minor log and assign procedure create improvements For RIT Duration, do better than print ms/seconds. Remove redundant UI column dedicated to duration when we log it in the status field too. Make bypass log at INFO level. Make it so on complete of subprocedure, we note count of outstanding siblings so we have a clue how much further the parent has to go before it is done (Helpful when hundreds of servers doing SCP). Have the SCP run the AP preflight check before creating an AP; saves creation of thousands of APs during fixup. Don't log tablename three times when reporting remote call failed. If lock is held already, note who has it. Also log after we get lock or if we have to wait rather than log on entrance though we may later have to wait (or we may have just picked up the lock). --- .../apache/hadoop/hbase/master/RegionState.java | 6 ++--- .../apache/hadoop/hbase/procedure2/Procedure.java | 3 +++ .../hadoop/hbase/procedure2/ProcedureExecutor.java | 30 ++++++++++++++-------- .../tmpl/master/AssignmentManagerStatusTmpl.jamon | 3 +-- .../master/procedure/MasterProcedureScheduler.java | 5 +++- .../master/procedure/ServerCrashProcedure.java | 2 +- 6 files changed, 31 insertions(+), 18 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java index 745e1ea134..ad9d22f055 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java @@ -381,9 +381,9 @@ public class RegionState { public String toDescriptiveString() { long relTime = System.currentTimeMillis() - stamp; return hri.getRegionNameAsString() - + " state=" + state - + ", ts=" + new Date(stamp) + " (" + (relTime/1000) + "s ago)" - + ", server=" + serverName; + + " state=" + state + ", ts=" + new Date(stamp) + " (" + + java.time.Duration.ofMillis(relTime).toString() + + " ago), server=" + serverName; } /** diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java index 08c7ce3bab..9ae1b4c747 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java @@ -876,6 +876,9 @@ public abstract class Procedure implements Comparable 0; } + /** + * @return Count of children outstanding (Badly named). + */ protected synchronized int getChildrenLatch() { return childrenLatch; } diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index 12520d6a67..dd51bc30a9 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -1022,25 +1022,25 @@ public class ProcedureExecutor { return false; } - LOG.debug("Begin bypass {} with lockWait={}, force={}", procedure, lockWait, force); + LOG.info("Begin bypass {} with lockWait={}, force={}", procedure, lockWait, force); IdLock.Entry lockEntry = procExecutionLock.tryLockEntry(procedure.getProcId(), lockWait); if (lockEntry == null && !force) { - LOG.debug("Waited {} ms, but {} is still running, skipping bypass with force={}", + LOG.info("Waited {} ms, but {} is still running, skipping bypass with force={}", lockWait, procedure, force); return false; } else if (lockEntry == null) { - LOG.debug("Waited {} ms, but {} is still running, begin bypass with force={}", + LOG.info("Waited {} ms, but {} is still running, begin bypass with force={}", lockWait, procedure, force); } try { // check whether the procedure is already finished if (procedure.isFinished()) { - LOG.debug("{} is already finished, skipping bypass", procedure); + LOG.info("{} is already finished, skipping bypass", procedure); return false; } if (procedure.hasChildren()) { - LOG.debug("{} has children, skipping bypass", procedure); + LOG.info("{} has children, skipping bypass", procedure); return false; } @@ -1048,7 +1048,7 @@ public class ProcedureExecutor { if (procedure.hasParent() && procedure.getState() != ProcedureState.RUNNABLE && procedure.getState() != ProcedureState.WAITING && procedure.getState() != ProcedureState.WAITING_TIMEOUT) { - LOG.debug("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " + LOG.info("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " + "(with no parent), {}", procedure); return false; @@ -1059,7 +1059,7 @@ public class ProcedureExecutor { // finished yet Procedure current = procedure; while (current != null) { - LOG.debug("Bypassing {}", current); + LOG.info("Bypassing {}", current); current.bypass(); store.update(procedure); long parentID = current.getParentProcId(); @@ -1079,9 +1079,9 @@ public class ProcedureExecutor { if (lockEntry != null) { // add the procedure to run queue, scheduler.addFront(procedure); - LOG.debug("Bypassing {} and its ancestors successfully, adding to queue", procedure); + LOG.info("Bypassing {} and its ancestors successfully, adding to queue", procedure); } else { - LOG.debug("Bypassing {} and its ancestors successfully, but since it is already running, " + LOG.info("Bypassing {} and its ancestors successfully, but since it is already running, " + "skipping add to queue", procedure); } return true; @@ -1502,9 +1502,17 @@ public class ProcedureExecutor { procStack.release(proc); if (proc.isSuccess()) { - // update metrics on finishing the procedure + // update metrics on finishing the procedure. proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), true); - LOG.info("Finished " + proc + " in " + StringUtils.humanTimeDiff(proc.elapsedTime())); + // Print out count of outstanding siblings if this procedure has a parent. + Procedure parent = null; + if (proc.hasParent()) { + parent = procedures.get(proc.getParentProcId()); + } + LOG.info("Finished {} in {}{}", + proc, + StringUtils.humanTimeDiff(proc.elapsedTime()), + parent != null? (", unfinishedSiblingCount=" + parent.getChildrenLatch()): ""); // Finalize the procedure state if (proc.getProcId() == rootProcId) { procedureFinished(proc); diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon index bb3686b298..51dd43808c 100644 --- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon +++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon @@ -73,7 +73,7 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage);
- + <%if ritStat.isRegionTwiceOverThreshold(rs.getRegion()) %> @@ -96,7 +96,6 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage); - <%java recordItr++; %> diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java index 0d3853130e..49982f72b2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java @@ -687,7 +687,6 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler { boolean hasLock = true; final LockAndQueue[] regionLocks = new LockAndQueue[regionInfo.length]; for (int i = 0; i < regionInfo.length; ++i) { - LOG.info("{} checking lock on {}", procedure, regionInfo[i].getEncodedName()); assert regionInfo[i] != null; assert regionInfo[i].getTable() != null; assert regionInfo[i].getTable().equals(table): regionInfo[i] + " " + procedure; @@ -695,12 +694,16 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler { regionLocks[i] = locking.getRegionLock(regionInfo[i].getEncodedName()); if (!regionLocks[i].tryExclusiveLock(procedure)) { + LOG.info("Waiting xlock for {} held by pid={}", procedure, + regionLocks[i].getExclusiveLockProcIdOwner()); waitProcedure(regionLocks[i], procedure); hasLock = false; while (i-- > 0) { regionLocks[i].releaseExclusiveLock(procedure); } break; + } else { + LOG.info("xlock for {}", procedure); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 1fcc6eb6b9..cae5a2ddd4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MasterWalManager; +import org.apache.hadoop.hbase.master.assignment.AssignProcedure; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; @@ -169,7 +170,6 @@ public class ServerCrashProcedure .trace("Assigning regions " + RegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this + "; cycles=" + getCycles()); } - assignRegions(env, regionsOnCrashedServer); } setNextState(ServerCrashState.SERVER_CRASH_FINISH); break; -- 2.16.3
RegionStateRIT time (ms) Retries
StateRetries
<% rs.getRegion().getEncodedName() %> <% RegionInfoDisplay.getDescriptiveNameFromRegionStateForDisplay(rs, assignmentManager.getConfiguration()) %><% (currentTime - rs.getStamp()) %> <% retryStatus %>