From 3c87893c1c247fff1d9d34bf4840d9686dc869ba Mon Sep 17 00:00:00 2001 From: zhangduo Date: Mon, 22 Oct 2018 21:25:17 +0800 Subject: [PATCH] HBASE-21352 Polish the rollback implementation in ProcedureExecutor --- .../hadoop/hbase/procedure2/Procedure.java | 4 + .../hbase/procedure2/ProcedureExecutor.java | 192 ++++++++++-------- .../hbase/procedure2/RootProcedureState.java | 56 +++-- .../TransitRegionStateProcedure.java | 12 +- 4 files changed, 164 insertions(+), 100 deletions(-) diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java index d4d945ddbe..878588857a 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java @@ -899,6 +899,10 @@ public abstract class Procedure implements Comparable 1) { stackIndexes = Arrays.copyOf(stackIndexes, stackIndexes.length - 1); diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index 438b495365..d1df1e21f2 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -1481,95 +1481,59 @@ public class ProcedureExecutor { LOG.debug("{} is already finished, skipping execution", proc); return; } - final Long rootProcId = getRootProcedureId(proc); + Long rootProcId = getRootProcedureId(proc); if (rootProcId == null) { - // The 'proc' was ready to run but the root procedure was rolledback - LOG.warn("Rollback because parent is done/rolledback proc=" + proc); - executeRollback(proc); + LOG.warn("Can not find root procedure for {}", proc); return; } RootProcedureState procStack = rollbackStack.get(rootProcId); if (procStack == null) { - LOG.warn("RootProcedureState is null for " + proc.getProcId()); + LOG.warn("RootProcedureState is null for {}", proc); + return; + } + // Try to acquire the execution + if (!procStack.acquire(proc)) { + tryRollback(proc, rootProcId, procStack); return; } - do { - // Try to acquire the execution - if (!procStack.acquire(proc)) { - if (procStack.setRollback()) { - // we have the 'rollback-lock' we can start rollingback - switch (executeRollback(rootProcId, procStack)) { - case LOCK_ACQUIRED: - break; - case LOCK_YIELD_WAIT: - procStack.unsetRollback(); - scheduler.yield(proc); - break; - case LOCK_EVENT_WAIT: - LOG.info("LOCK_EVENT_WAIT rollback..." + proc); - procStack.unsetRollback(); - break; - default: - throw new UnsupportedOperationException(); - } - } else { - // if we can't rollback means that some child is still running. - // the rollback will be executed after all the children are done. - // If the procedure was never executed, remove and mark it as rolledback. - if (!proc.wasExecuted()) { - switch (executeRollback(proc)) { - case LOCK_ACQUIRED: - break; - case LOCK_YIELD_WAIT: - scheduler.yield(proc); - break; - case LOCK_EVENT_WAIT: - LOG.info("LOCK_EVENT_WAIT can't rollback child running?..." + proc); - break; - default: - throw new UnsupportedOperationException(); - } - } - } - break; - } - - // Execute the procedure - assert proc.getState() == ProcedureState.RUNNABLE : proc; - // Note that lock is NOT about concurrency but rather about ensuring - // ownership of a procedure of an entity such as a region or table - LockState lockState = acquireLock(proc); - switch (lockState) { - case LOCK_ACQUIRED: - execProcedure(procStack, proc); - break; - case LOCK_YIELD_WAIT: - LOG.info(lockState + " " + proc); - scheduler.yield(proc); - break; - case LOCK_EVENT_WAIT: - // Someone will wake us up when the lock is available - LOG.debug(lockState + " " + proc); - break; - default: - throw new UnsupportedOperationException(); - } - procStack.release(proc); - if (proc.isSuccess()) { - // update metrics on finishing the procedure - proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), true); - LOG.info("Finished " + proc + " in " + StringUtils.humanTimeDiff(proc.elapsedTime())); - // Finalize the procedure state - if (proc.getProcId() == rootProcId) { - procedureFinished(proc); - } else { - execCompletionCleanup(proc); - } + // Execute the procedure + assert proc.getState() == ProcedureState.RUNNABLE : proc; + // Note that lock is NOT about concurrency but rather about ensuring + // ownership of a procedure of an entity such as a region or table + LockState lockState = acquireLock(proc); + switch (lockState) { + case LOCK_ACQUIRED: + execProcedure(procStack, proc); + break; + case LOCK_YIELD_WAIT: + LOG.info(lockState + " " + proc); + scheduler.yield(proc); + break; + case LOCK_EVENT_WAIT: + // Someone will wake us up when the lock is available + LOG.debug(lockState + " " + proc); break; + default: + throw new UnsupportedOperationException(); + } + procStack.release(proc); + + if (proc.isSuccess()) { + // update metrics on finishing the procedure + proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), true); + LOG.info("Finished " + proc + " in " + StringUtils.humanTimeDiff(proc.elapsedTime())); + // Finalize the procedure state + if (proc.getProcId() == rootProcId) { + procedureFinished(proc); + } else { + execCompletionCleanup(proc); } - } while (procStack.isFailed()); + } else if (procStack.isFailed()) { + // TODO: change to use proc.isFailed + tryRollback(proc, rootProcId, procStack); + } } private LockState acquireLock(Procedure proc) { @@ -1591,6 +1555,58 @@ public class ProcedureExecutor { } } + private void tryRollback(Procedure proc, long rootProcId, + RootProcedureState procStack) { + if (!proc.wasExecuted()) { + // If the procedure was never executed, just rollback it, and one step is enough. + // XXX: this is a bit strange, as we do not have any stack index yet, which means in a normal + // rollback we should not be rolled back... But maybe no harm, and we truly need to delete the + // procedure, we is part of the rollbackOneStep. + switch (rollbackOneStep(proc)) { + case LOCK_ACQUIRED: + break; + case LOCK_YIELD_WAIT: + scheduler.yield(proc); + break; + case LOCK_EVENT_WAIT: + LOG.info("LOCK_EVENT_WAIT can't rollback child running?..." + proc); + break; + default: + throw new IllegalArgumentException("unrecognized set rollbackOneStep return value"); + } + return; + } + switch (procStack.setRollback()) { + case SUCCESS: + switch (executeRollback(rootProcId, procStack)) { + case LOCK_ACQUIRED: + break; + case LOCK_YIELD_WAIT: + procStack.unsetRollback(); + scheduler.yield(proc); + break; + case LOCK_EVENT_WAIT: + LOG.info("LOCK_EVENT_WAIT rollback..." + proc); + procStack.unsetRollback(); + break; + default: + throw new IllegalArgumentException("unrecognized set executeRollback return value"); + } + return; + case ALREADY_START_ROLLINGBACK: + // another procedure will take care of the rollback, just do nothing + return; + case CAN_NOT_ROLLBACK_YET: + // This usually because there are still running sub procedures, so here we do a yield to + // retry later. + // TODO: add backoff, otherwise we may consume too many CPUs. + scheduler.yield(proc); + return; + default: + throw new IllegalArgumentException("unrecognized set rollback return value"); + } + } + /** * Execute the rollback of the full procedure stack. Once the procedure is rolledback, the * root-procedure will be visible as finished to user, and the result will be the fatal exception. @@ -1632,7 +1648,7 @@ public class ProcedureExecutor { return lockState; } - lockState = executeRollback(proc); + lockState = rollbackOneStep(proc); releaseLock(proc, false); boolean abortRollback = lockState != LockState.LOCK_ACQUIRED; abortRollback |= !isRunning() || !store.isRunning(); @@ -1657,14 +1673,15 @@ public class ProcedureExecutor { } // Finalize the procedure state - LOG.info("Rolled back " + rootProc + - " exec-time=" + StringUtils.humanTimeDiff(rootProc.elapsedTime())); + LOG.info("Rolled back " + rootProc + " exec-time=" + + StringUtils.humanTimeDiff(rootProc.elapsedTime())); procedureFinished(rootProc); return LockState.LOCK_ACQUIRED; } private void cleanupAfterRollbackOneStep(Procedure proc) { if (proc.removeStackIndex()) { + // we have finished rolling back this procedure if (!proc.isSuccess()) { proc.setState(ProcedureState.ROLLEDBACK); } @@ -1673,10 +1690,16 @@ public class ProcedureExecutor { proc.updateMetricsOnFinish(getEnvironment(), proc.elapsedTime(), false); if (proc.hasParent()) { + // Delete the procedure, so that after restarting, we will not be rolled back again, as we + // delete the stack id so we will not be in the procedure stack any more. + // In the normal execution we will not delete the sub procedure until the root procedure is + // finished, this is because we may need to rollback later(which need to pop the procedure + // stack and go over all the sub procedures), but here we are rolling back the + // procedures, so it is OK to delete the sub procedures before finishing the root procedure. store.delete(proc.getProcId()); procedures.remove(proc.getProcId()); } else { - final long[] childProcIds = rollbackStack.get(proc.getProcId()).getSubprocedureIds(); + long[] childProcIds = rollbackStack.get(proc.getProcId()).getSubprocedureIds(); if (childProcIds != null) { store.delete(proc, childProcIds); } else { @@ -1684,6 +1707,9 @@ public class ProcedureExecutor { } } } else { + // The procedure has not been fully rolled back yet, which means we have still appeared at + // other stack indexes. But we still need to update the state, the most important thing is to + // persist the stack indexes, so that we will not rollback this step again. store.update(proc); } } @@ -1693,7 +1719,7 @@ public class ProcedureExecutor { * It updates the store with the new state (stack index) * or will remove completly the procedure in case it is a child. */ - private LockState executeRollback(Procedure proc) { + private LockState rollbackOneStep(Procedure proc) { try { proc.doRollback(getEnvironment()); } catch (IOException e) { diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RootProcedureState.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RootProcedureState.java index 2fc00301e9..20dff24033 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RootProcedureState.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RootProcedureState.java @@ -15,7 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.hadoop.hbase.procedure2; import java.util.ArrayList; @@ -23,23 +22,25 @@ import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.yetus.audience.InterfaceAudience; -import org.apache.yetus.audience.InterfaceStability; import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; /** * Internal state of the ProcedureExecutor that describes the state of a "Root Procedure". - * A "Root Procedure" is a Procedure without parent, each subprocedure will be - * added to the "Root Procedure" stack (or rollback-stack). - * + *

+ * A "Root Procedure" is a Procedure without parent, each subprocedure will be added to the "Root + * Procedure" stack (or rollback-stack). + *

* RootProcedureState is used and managed only by the ProcedureExecutor. + * + *

  *    Long rootProcId = getRootProcedureId(proc);
  *    rollbackStack.get(rootProcId).acquire(proc)
  *    rollbackStack.get(rootProcId).release(proc)
  *    ...
+ * 
*/ @InterfaceAudience.Private -@InterfaceStability.Evolving class RootProcedureState { private enum State { @@ -68,15 +69,33 @@ class RootProcedureState { return state == State.ROLLINGBACK; } + // return whether all the procedures are successful or failed. + private boolean areAllSubProceduresNotRunning() { + if (subprocs == null) { + return true; + } + return subprocs.stream().allMatch(p -> p.isSuccess() || p.isFailed()); + } + + public enum SetRollbackResult { + SUCCESS, CAN_NOT_ROLLBACK_YET, ALREADY_START_ROLLINGBACK + } + /** * Called by the ProcedureExecutor to mark rollback execution */ - protected synchronized boolean setRollback() { - if (running == 0 && state == State.FAILED) { - state = State.ROLLINGBACK; - return true; + protected synchronized SetRollbackResult setRollback() { + if (running > 0) { + return SetRollbackResult.CAN_NOT_ROLLBACK_YET; } - return false; + if (state != State.FAILED) { + return SetRollbackResult.ALREADY_START_ROLLINGBACK; + } + if (!areAllSubProceduresNotRunning()) { + return SetRollbackResult.CAN_NOT_ROLLBACK_YET; + } + state = State.ROLLINGBACK; + return SetRollbackResult.SUCCESS; } /** @@ -113,12 +132,17 @@ class RootProcedureState { * Called by the ProcedureExecutor to mark the procedure step as running. */ protected synchronized boolean acquire(Procedure proc) { - if (state != State.RUNNING) { - return false; + if (state == State.RUNNING) { + running++; + return true; } - - running++; - return true; + if (proc.isRunnable() && proc.wasExecuted()) { + // always let the runnable procedures to execute and finish, when rolling back we will skip + // these procedures and only focus on the failed ones. + running++; + return true; + } + return false; } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index df1503ba5d..75307a5168 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -456,7 +456,17 @@ public class TransitRegionStateProcedure @Override protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state) throws IOException, InterruptedException { - // no rollback + if (!wasExecuted()) { + // if we haven't executed yet then it is fine to rollback. + RegionStateNode regionNode = getRegionStateNode(env); + regionNode.lock(); + try { + regionNode.unsetProcedure(this); + } finally { + regionNode.unlock(); + } + return; + } throw new UnsupportedOperationException(); } -- 2.17.1