From b8adc6b4ad4b0ee793bf5207f8f0f877fdb1d4a2 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 26 Sep 2018 21:22:46 -0700 Subject: [PATCH] HBASE-21242 [amv2] Miscellaneous minor log and assign procedure create improvements For RIT Duration, do better than print ms/seconds. Remove redundant UI column dedicated to duration when we log it in the status field too. Make bypass log at INFO level. Make it so on complete of subprocedure, we note count of outstanding siblings so we have a clue how much further the parent has to go before it is done (Helpful when hundreds of servers doing SCP). Have the SCP run the AP preflight check before creating an AP; saves creation of thousands of APs during fixup. Don't log tablename three times when reporting remote call failed. If lock is held already, note who has it. Also log after we get lock or if we have to wait rather than log on entrance though we may later have to wait (or we may have just picked up the lock). --- .../apache/hadoop/hbase/master/RegionState.java | 6 ++-- .../apache/hadoop/hbase/procedure2/Procedure.java | 3 ++ .../hadoop/hbase/procedure2/ProcedureExecutor.java | 32 ++++++++++++++-------- .../tmpl/master/AssignmentManagerStatusTmpl.jamon | 3 +- .../master/procedure/ServerCrashProcedure.java | 2 +- .../hadoop/hbase/regionserver/TestHRegionInfo.java | 7 ++++- 6 files changed, 34 insertions(+), 19 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java index 745e1ea134..ad9d22f055 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java @@ -381,9 +381,9 @@ public class RegionState { public String toDescriptiveString() { long relTime = System.currentTimeMillis() - stamp; return hri.getRegionNameAsString() - + " state=" + state - + ", ts=" + new Date(stamp) + " (" + (relTime/1000) + "s ago)" - + ", server=" + serverName; + + " state=" + state + ", ts=" + new Date(stamp) + " (" + + java.time.Duration.ofMillis(relTime).toString() + + " ago), server=" + serverName; } /** diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java index 089caa06ff..5428105358 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java @@ -884,6 +884,9 @@ public abstract class Procedure implements Comparable 0; } + /** + * @return Count of children outstanding (Badly named). + */ protected synchronized int getChildrenLatch() { return childrenLatch; } diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index db58ac779c..f7b3ed0fe5 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -1067,25 +1067,25 @@ public class ProcedureExecutor { Preconditions.checkArgument(lockWait > 0, "lockWait should be positive"); final Procedure procedure = getProcedure(pid); if (procedure == null) { - LOG.debug("Procedure pid={} does not exist, skipping bypass", pid); + LOG.info("Procedure pid={} does not exist, skipping bypass", pid); return false; } - LOG.debug("Begin bypass {} with lockWait={}, override={}, recursive={}", + LOG.info("Begin bypass {} with lockWait={}, override={}, recursive={}", procedure, lockWait, override, recursive); IdLock.Entry lockEntry = procExecutionLock.tryLockEntry(procedure.getProcId(), lockWait); if (lockEntry == null && !override) { - LOG.debug("Waited {} ms, but {} is still running, skipping bypass with force={}", + LOG.info("Waited {} ms, but {} is still running, skipping bypass with force={}", lockWait, procedure, override); return false; } else if (lockEntry == null) { - LOG.debug("Waited {} ms, but {} is still running, begin bypass with force={}", + LOG.info("Waited {} ms, but {} is still running, begin bypass with force={}", lockWait, procedure, override); } try { // check whether the procedure is already finished if (procedure.isFinished()) { - LOG.debug("{} is already finished, skipping bypass", procedure); + LOG.info("{} is already finished, skipping bypass", procedure); return false; } @@ -1106,7 +1106,7 @@ public class ProcedureExecutor { } }); } else { - LOG.debug("{} has children, skipping bypass", procedure); + LOG.info("{} has children, skipping bypass", procedure); return false; } } @@ -1115,7 +1115,7 @@ public class ProcedureExecutor { if (procedure.hasParent() && procedure.getState() != ProcedureState.RUNNABLE && procedure.getState() != ProcedureState.WAITING && procedure.getState() != ProcedureState.WAITING_TIMEOUT) { - LOG.debug("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " + LOG.info("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " + "(with no parent), {}", procedure); // Question: how is the bypass done here? @@ -1127,7 +1127,7 @@ public class ProcedureExecutor { // finished yet Procedure current = procedure; while (current != null) { - LOG.debug("Bypassing {}", current); + LOG.info("Bypassing {}", current); current.bypass(getEnvironment()); store.update(procedure); long parentID = current.getParentProcId(); @@ -1147,9 +1147,9 @@ public class ProcedureExecutor { if (lockEntry != null) { // add the procedure to run queue, scheduler.addFront(procedure); - LOG.debug("Bypassing {} and its ancestors successfully, adding to queue", procedure); + LOG.info("Bypassing {} and its ancestors successfully, adding to queue", procedure); } else { - LOG.debug("Bypassing {} and its ancestors successfully, but since it is already running, " + LOG.info("Bypassing {} and its ancestors successfully, but since it is already running, " + "skipping add to queue", procedure); } return true; @@ -1570,9 +1570,17 @@ public class ProcedureExecutor { procStack.release(proc); if (proc.isSuccess()) { - // update metrics on finishing the procedure + // update metrics on finishing the procedure. proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), true); - LOG.info("Finished " + proc + " in " + StringUtils.humanTimeDiff(proc.elapsedTime())); + // Print out count of outstanding siblings if this procedure has a parent. + Procedure parent = null; + if (proc.hasParent()) { + parent = procedures.get(proc.getParentProcId()); + } + LOG.info("Finished {} in {}{}", + proc, + StringUtils.humanTimeDiff(proc.elapsedTime()), + parent != null? (", unfinishedSiblingCount=" + parent.getChildrenLatch()): ""); // Finalize the procedure state if (proc.getProcId() == rootProcId) { procedureFinished(proc); diff --git a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon index b94ba4338e..9c335dc8d9 100644 --- a/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon +++ b/hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon @@ -73,7 +73,7 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage);
- + <%if ritStat.isRegionTwiceOverThreshold(rs.getRegion()) %> @@ -96,7 +96,6 @@ int numOfPages = (int) Math.ceil(numOfRITs * 1.0 / ritsPerPage); - <%java recordItr++; %> diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 1fcc6eb6b9..cae5a2ddd4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MasterWalManager; +import org.apache.hadoop.hbase.master.assignment.AssignProcedure; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; @@ -169,7 +170,6 @@ public class ServerCrashProcedure .trace("Assigning regions " + RegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this + "; cycles=" + getCycles()); } - assignRegions(env, regionsOnCrashedServer); } setNextState(ServerCrashState.SERVER_CRASH_FINISH); break; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionInfo.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionInfo.java index 50b675dfb0..48b0ff3560 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionInfo.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionInfo.java @@ -339,7 +339,12 @@ public class TestHRegionInfo { origDesc.indexOf(Bytes.toStringBinary(startKey)) + Bytes.toStringBinary(startKey).length()); assert(firstPart.equals(firstPartOrig)); - assert(secondPart.equals(secondPartOrig)); + // The elapsed time may be different in the two Strings since they were calculated at different + // times... so, don't include that portion when we compare. It starts with a '('. + int indexOfElapsedTime = secondPart.indexOf("("); + assertTrue(indexOfElapsedTime > 0); + assert(secondPart.substring(0, indexOfElapsedTime). + equals(secondPartOrig.substring(0, indexOfElapsedTime))); } private void checkEquality(HRegionInfo h, Configuration conf) throws IOException { -- 2.16.3
RegionStateRIT time (ms) Retries
StateRetries
<% rs.getRegion().getEncodedName() %> <% RegionInfoDisplay.getDescriptiveNameFromRegionStateForDisplay(rs, assignmentManager.getConfiguration()) %><% (currentTime - rs.getStamp()) %> <% retryStatus %>