From 82e9b80353407db4cf253ada4fbd90c4ad5dfdf2 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 26 Sep 2018 21:22:46 -0700 Subject: [PATCH] HBASE-21242 [amv2] Miscellaneous minor log and assign procedure create improvements For RIT Duration, do better than print ms/seconds. Remove redundant UI column dedicated to duration when we log it in the status field too. Make bypass log at INFO level. Make it so on complete of subprocedure, we note count of outstanding siblings so we have a clue how much further the parent has to go before it is done (Helpful when hundreds of servers doing SCP). Have the SCP run the AP preflight check before creating an AP; saves creation of thousands of APs during fixup. Don't log tablename three times when reporting remote call failed. If lock is held already, note who has it. Also log after we get lock or if we have to wait rather than log on entrance though we may later have to wait (or we may have just picked up the lock). --- .../apache/hadoop/hbase/master/RegionState.java | 6 ++-- .../apache/hadoop/hbase/procedure2/Procedure.java | 3 ++ .../hadoop/hbase/procedure2/ProcedureExecutor.java | 30 ++++++++++------ .../tmpl/master/AssignmentManagerStatusTmpl.jamon | 3 +- .../hbase/master/assignment/AssignProcedure.java | 40 ++++++++++++++++++---- .../assignment/RegionTransitionProcedure.java | 3 +- .../hbase/master/assignment/UnassignProcedure.java | 2 +- .../master/procedure/MasterProcedureScheduler.java | 6 +++- .../master/procedure/ServerCrashProcedure.java | 8 +++++ 9 files changed, 74 insertions(+), 27 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java index 7289ce855a..a1e2ca66c4 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java @@ -371,9 +371,9 @@ public class RegionState { public String toDescriptiveString() { long relTime = System.currentTimeMillis() - stamp; return hri.getRegionNameAsString() - + " state=" + state - + ", ts=" + new Date(stamp) + " (" + (relTime/1000) + "s ago)" - + ", server=" + serverName; + + " state=" + state + ", ts=" + new Date(stamp) + " (" + + java.time.Duration.ofMillis(relTime).toString() + + " ago), server=" + serverName; } /** diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java index a832c783fe..6131f375f5 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java @@ -850,6 +850,9 @@ public abstract class Procedure implements Comparable 0; } + /** + * @return Count of children outstanding (Badly named). + */ protected synchronized int getChildrenLatch() { return childrenLatch; } diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index 76ce46e222..14b783d243 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -1014,25 +1014,25 @@ public class ProcedureExecutor { return false; } - LOG.debug("Begin bypass {} with lockWait={}, force={}", procedure, lockWait, force); + LOG.info("Begin bypass {} with lockWait={}, force={}", procedure, lockWait, force); IdLock.Entry lockEntry = procExecutionLock.tryLockEntry(procedure.getProcId(), lockWait); if (lockEntry == null && !force) { - LOG.debug("Waited {} ms, but {} is still running, skipping bypass with force={}", + LOG.info("Waited {} ms, but {} is still running, skipping bypass with force={}", lockWait, procedure, force); return false; } else if (lockEntry == null) { - LOG.debug("Waited {} ms, but {} is still running, begin bypass with force={}", + LOG.info("Waited {} ms, but {} is still running, begin bypass with force={}", lockWait, procedure, force); } try { // check whether the procedure is already finished if (procedure.isFinished()) { - LOG.debug("{} is already finished, skipping bypass", procedure); + LOG.info("{} is already finished, skipping bypass", procedure); return false; } if (procedure.hasChildren()) { - LOG.debug("{} has children, skipping bypass", procedure); + LOG.info("{} has children, skipping bypass", procedure); return false; } @@ -1040,7 +1040,7 @@ public class ProcedureExecutor { if (procedure.hasParent() && procedure.getState() != ProcedureState.RUNNABLE && procedure.getState() != ProcedureState.WAITING && procedure.getState() != ProcedureState.WAITING_TIMEOUT) { - LOG.debug("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " + LOG.info("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " + "(with no parent), {}", procedure); return false; @@ -1051,7 +1051,7 @@ public class ProcedureExecutor { // finished yet Procedure current = procedure; while (current != null) { - LOG.debug("Bypassing {}", current); + LOG.info("Bypassing {}", current); current.bypass(); store.update(procedure); long parentID = current.getParentProcId(); @@ -1071,9 +1071,9 @@ public class ProcedureExecutor { if (lockEntry != null) { // add the procedure to run queue, scheduler.addFront(procedure); - LOG.debug("Bypassing {} and its ancestors successfully, adding to queue", procedure); + LOG.info("Bypassing {} and its ancestors successfully, adding to queue", procedure); } else { - LOG.debug("Bypassing {} and its ancestors successfully, but since it is already running, " + LOG.info("Bypassing {} and its ancestors successfully, but since it is already running, " + "skipping add to queue", procedure); } return true; @@ -1483,9 +1483,17 @@ public class ProcedureExecutor { procStack.release(proc); if (proc.isSuccess()) { - // update metrics on finishing the procedure + // update metrics on finishing the procedure. proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), true); - LOG.info("Finished " + proc + " in " + StringUtils.humanTimeDiff(proc.elapsedTime())); + // Print out count of outstanding siblings if this procedure has a parent. + Procedure parent = null; + if (proc.hasParent()) { + parent = procedures.get(proc.getParentProcId()); + } + LOG.info("Finished {} in {}{}", + proc, + StringUtils.humanTimeDiff(proc.elapsedTime()), + parent != null? (", siblings=" + parent.getChildrenLatch()): ""); // Finalize the procedure state if (proc.getProcId() == rootProcId) { procedureFinished(proc); diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon index bb3686b298..51dd43808c 100644 --- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon +++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon @@ -73,7 +73,7 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage);
- + <%if ritStat.isRegionTwiceOverThreshold(rs.getRegion()) %> @@ -96,7 +96,6 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage); - <%java recordItr++; %> diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index 55aee4ad07..8b4ed8eae3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -26,7 +26,9 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RetriesExhaustedException; +import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; +import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.master.TableStateManager; import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode; @@ -156,19 +158,43 @@ public class AssignProcedure extends RegionTransitionProcedure { } } - @Override - protected boolean startTransition(final MasterProcedureEnv env, final RegionStateNode regionNode) + /** + * Used by ServerCrashProcedure too skip creating Assigns if not needed. + * @return Skip out on the assign; returns 'true'/assign if exception. + */ + public static boolean assign(MasterServices masterServices, RegionInfo ri) { + try { + return assign(masterServices, + masterServices.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(ri)); + } catch (IOException e) { + LOG.warn("Letting assign proceed", e); + } + return true; + } + + protected static boolean assign(MasterServices masterServices, final RegionStateNode regionNode) throws IOException { // If the region is already open we can't do much... - if (regionNode.isInState(State.OPEN) && isServerOnline(env, regionNode)) { - LOG.info("Assigned, not reassigning; " + this + "; " + regionNode.toShortString()); + if (regionNode.isInState(State.OPEN) && + masterServices.getServerManager().isServerOnline(regionNode.getRegionLocation())) { + LOG.info("Assigned, not reassigning {}", regionNode.toShortString()); return false; } // Don't assign if table is in disabling or disabled state. - TableStateManager tsm = env.getMasterServices().getTableStateManager(); + TableStateManager tsm = masterServices.getTableStateManager(); TableName tn = regionNode.getRegionInfo().getTable(); - if (tsm.getTableState(tn).isDisabledOrDisabling()) { - LOG.info("Table " + tn + " state=" + tsm.getTableState(tn) + ", skipping " + this); + TableState ts = tsm.getTableState(tn); + if (ts.isDisabledOrDisabling()) { + LOG.info("{} so SKIPPING assign of {}", ts, regionNode.getRegionInfo().getEncodedName()); + return false; + } + return true; + } + + @Override + protected boolean startTransition(final MasterProcedureEnv env, final RegionStateNode regionNode) + throws IOException { + if (!assign(env.getMasterServices(), regionNode)) { return false; } // If the region is SPLIT, we can't assign it. But state might be CLOSED, rather than diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java index 3b0735e46e..2c0a026c18 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java @@ -217,8 +217,7 @@ public abstract class RegionTransitionProcedure public synchronized void remoteCallFailed(final MasterProcedureEnv env, final ServerName serverName, final IOException exception) { final RegionStateNode regionNode = getRegionState(env); - LOG.warn("Remote call failed {}; {}; {}; exception={}", serverName, - this, regionNode.toShortString(), exception.getClass().getSimpleName(), exception); + LOG.warn("Remote call failed {}; {}", regionNode.toShortString(), this, exception); if (remoteCallFailed(env, regionNode, exception)) { // NOTE: This call to wakeEvent puts this Procedure back on the scheduler. // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java index 4f58a0f305..a8762aeaaa 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java @@ -310,7 +310,7 @@ public class UnassignProcedure extends RegionTransitionProcedure { exception.getClass().getSimpleName()); if (!env.getMasterServices().getServerManager().expireServer(serverName)) { // Failed to queue an expire. Lots of possible reasons including it may be already expired. - // In ServerCrashProcedure and RecoverMetaProcedure, there is a handleRIT stage where we + // In ServerCrashProcedure, there is a handleRIT stage where we // will iterator over all the RIT procedures for the related regions of a crashed RS and // fail them with ServerCrashException. You can see the isSafeToProceed method above for // more details. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java index 12b4267bb3..5a0e9af64a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java @@ -686,7 +686,6 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler { boolean hasLock = true; final LockAndQueue[] regionLocks = new LockAndQueue[regionInfo.length]; for (int i = 0; i < regionInfo.length; ++i) { - LOG.info("{} checking lock on {}", procedure, regionInfo[i].getEncodedName()); assert regionInfo[i] != null; assert regionInfo[i].getTable() != null; assert regionInfo[i].getTable().equals(table): regionInfo[i] + " " + procedure; @@ -694,12 +693,17 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler { regionLocks[i] = locking.getRegionLock(regionInfo[i].getEncodedName()); if (!regionLocks[i].tryExclusiveLock(procedure)) { + LOG.info("{} waiting on {} xlock held by pid={}", + procedure, regionInfo[i].getEncodedName(), + regionLocks[i].getExclusiveLockProcIdOwner()); waitProcedure(regionLocks[i], procedure); hasLock = false; while (i-- > 0) { regionLocks[i].releaseExclusiveLock(procedure); } break; + } else { + LOG.info("{} took {} xlock", procedure, regionInfo[i].getEncodedName()); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 775c8c2bea..6ec0079875 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MasterWalManager; +import org.apache.hadoop.hbase.master.assignment.AssignProcedure; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; @@ -176,6 +177,13 @@ public class ServerCrashProcedure // it does the check by calling am#isLogSplittingDone. List toAssign = handleRIT(env, regionsOnCrashedServer); AssignmentManager am = env.getAssignmentManager(); + // Do not create assigns for Regions on disabling or disabled Tables. + // We do this inside in the AssignProcedure. + int size = toAssign.size(); + if (toAssign.removeIf(r -> !AssignProcedure.assign(env.getMasterServices(), r))) { + LOG.debug("Dropped {} assigns because against disabling/disabled tables", + size - toAssign.size()); + } // CreateAssignProcedure will try to use the old location for the region deploy. addChildProcedure(am.createAssignProcedures(toAssign)); setNextState(ServerCrashState.SERVER_CRASH_HANDLE_RIT2); -- 2.16.3
RegionStateRIT time (ms) Retries
StateRetries
<% rs.getRegion().getEncodedName() %> <% RegionInfoDisplay.getDescriptiveNameFromRegionStateForDisplay(rs, assignmentManager.getConfiguration()) %><% (currentTime - rs.getStamp()) %> <% retryStatus %>