From 92d1179fdd1c3ecd0befe7d99f1f41c8f7192ee1 Mon Sep 17 00:00:00 2001
From: Michael Stack
Date: Thu, 20 Sep 2018 16:53:58 -0700
Subject: [PATCH] HBASE-21268 Backport to branch-2.0 "HBASE-21213 [hbck2]
 bypass leaves behind state in RegionStates when assign/unassign"

Below is the commit message from HBASE-21213. This backport includes a good bit
of HBASE-21156, needed because this patch modifies the API that HBASE-21156
adds for hbck2.

HBASE-21213 [hbck2] bypass leaves behind state in RegionStates when
assign/unassign

Adds override to assigns and unassigns. Changes bypass 'force' to align,
calling the param 'override' instead. Adds recursive to 'bypass', a means of
calling bypass on a parent and its subprocedures (usually bypass works on leaf
nodes, rippling the bypass up to the parent -- recursive has us work in the
opposite direction): EXPERIMENTAL.

bypass on an assign/unassign leaves the region in RIT and the RegionStateNode
loaded with the bypassed procedure. The first implementation had
assign/unassign clean up the leftover state. The second implementation, on
feedback, keeps the state in place as a fence against other Procedures
assuming the region entity, and instead adds an 'override' function that hbck2
can set on assigns/unassigns to override the fencing.

Note that the below also converts ProcedureExceptions that come out of the Pv2
system into DoNotRetryIOEs. It is a little awkward because DNRIOE is in the
client module, not in the procedure module. Previously, we'd just keep
retrying the bypass, etc.

M hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java
 Have bypass take an environment like all other methods do, so subclasses have
 access to it. Fix javadoc issues.

M hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java
 Javadoc issues. Pass the environment when we invoke bypass.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
 Rename waitUntilNamespace... etc. to align with how these method types are
 named elsewhere, i.e. waitFor rather than waitUntil.

M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java
 Clean up the message we emit when we find an existing procedure working
 against this entity. Add support for a force function which allows
 Assigns/Unassigns to force ownership of the Region entity.

A hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionBypass.java
 Test bypass and force.

M hbase-shell/src/main/ruby/shell/commands/list_procedures.rb
 Minor cleanup of the json output... emit ISO 8601 timestamps.
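For context, a minimal, illustrative sketch (not part of this patch) of how an
operator tool such as hbck2 might drive the new Hbck API added below. The
encoded region name is the hypothetical example value from the Hbck javadoc,
the cast to ClusterConnection assumes the default connection implementation,
and error handling is elided:

  import java.util.Arrays;
  import java.util.List;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.hbase.HBaseConfiguration;
  import org.apache.hadoop.hbase.client.ClusterConnection;
  import org.apache.hadoop.hbase.client.ConnectionFactory;
  import org.apache.hadoop.hbase.client.Hbck;

  public class HbckOverrideSketch {
    public static void main(String[] args) throws Exception {
      Configuration conf = HBaseConfiguration.create();
      // Hbck is obtained from ClusterConnection#getHbck() (see the Hbck javadoc below).
      try (ClusterConnection conn =
               (ClusterConnection) ConnectionFactory.createConnection(conf);
           Hbck hbck = conn.getHbck()) {
        // Schedule assigns even while the Master is initializing; override=true
        // steps past the fence left by a previously bypassed assign/unassign.
        List<Long> pids =
            hbck.assigns(Arrays.asList("de00010733901a05f5a2a3a382e27dd4"), true);
        // If a scheduled procedure gets stuck, bypass it: wait up to 30s for its
        // lock, override it even if it is running, and recurse into children.
        List<Boolean> bypassed = hbck.bypassProcedure(pids, 30000, true, true);
        System.out.println("pids=" + pids + " bypassed=" + bypassed);
      }
    }
  }

As with the RPCs themselves, this is an operator-only path: it skirts Master
state checks and Coprocessor hooks, so it should be used for repair, not for
normal administration.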
--- .../org/apache/hadoop/hbase/client/HBaseHbck.java | 63 ++++++- .../java/org/apache/hadoop/hbase/client/Hbck.java | 65 +++++++- .../hbase/shaded/protobuf/RequestConverter.java | 24 +++ .../hadoop/hbase/util/RetryCounterFactory.java | 4 + .../apache/hadoop/hbase/procedure2/Procedure.java | 29 ++-- .../hadoop/hbase/procedure2/ProcedureExecutor.java | 66 ++++++-- .../hadoop/hbase/procedure2/ProcedureUtil.java | 2 +- .../hbase/procedure2/RemoteProcedureException.java | 6 +- .../hbase/procedure2/TestProcedureBypass.java | 23 ++- .../src/main/protobuf/Master.proto | 71 ++++++++ .../src/main/protobuf/MasterProcedure.proto | 2 + .../hbase/rsgroup/RSGroupInfoManagerImpl.java | 3 +- .../balancer/TestRSGroupBasedLoadBalancer.java | 3 + .../org/apache/hadoop/hbase/master/HMaster.java | 86 +++++++++- .../hadoop/hbase/master/MasterRpcServices.java | 124 +++++++++++++- .../hbase/master/assignment/AssignProcedure.java | 14 +- .../hbase/master/assignment/AssignmentManager.java | 32 +++- .../assignment/RegionTransitionProcedure.java | 79 +++++++-- .../hbase/master/assignment/UnassignProcedure.java | 14 +- .../hbase/master/balancer/BaseLoadBalancer.java | 2 + .../master/procedure/MasterProcedureUtil.java | 16 ++ .../hbase/master/procedure/ProcedureDescriber.java | 3 +- .../master/procedure/ProcedurePrepareLatch.java | 2 +- .../hbase/master/procedure/ProcedureSyncWait.java | 5 +- .../apache/hadoop/hbase/TestMetaTableAccessor.java | 16 ++ .../org/apache/hadoop/hbase/client/TestHbck.java | 128 ++++++++++++--- .../hbase/master/assignment/TestRegionBypass.java | 181 +++++++++++++++++++++ .../main/ruby/shell/commands/list_procedures.rb | 7 +- .../org/apache/hadoop/hbase/client/TestShell.java | 1 - hbase-shell/src/test/ruby/shell/shell_test.rb | 6 +- 30 files changed, 963 insertions(+), 114 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionBypass.java diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java index 03a6f69c22..abe55469d2 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java @@ -18,13 +18,19 @@ package org.apache.hadoop.hbase.client; import java.io.IOException; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.stream.Collectors; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ipc.RpcControllerFactory; +import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException; import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.GetTableStateResponse; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.HbckService.BlockingInterface; @@ -87,9 +93,62 @@ public class HBaseHbck implements Hbck { RequestConverter.buildSetTableStateInMetaRequest(state)); return TableState.convert(state.getTableName(), response.getTableState()); } catch (ServiceException se) { - LOG.debug("ServiceException while updating table state in meta. 
table={}, state={}", - state.getTableName(), state.getState()); + LOG.debug("table={}, state={}", state.getTableName(), state.getState(), se); + throw new IOException(se); + } + } + + @Override + public List assigns(List encodedRegionNames, boolean override) + throws IOException { + try { + MasterProtos.AssignsResponse response = + this.hbck.assigns(rpcControllerFactory.newController(), + RequestConverter.toAssignRegionsRequest(encodedRegionNames, override)); + return response.getPidList(); + } catch (ServiceException se) { + LOG.debug(toCommaDelimitedString(encodedRegionNames), se); + throw new IOException(se); + } + } + + @Override + public List unassigns(List encodedRegionNames, boolean override) + throws IOException { + try { + MasterProtos.UnassignsResponse response = + this.hbck.unassigns(rpcControllerFactory.newController(), + RequestConverter.toUnassignRegionsRequest(encodedRegionNames, override)); + return response.getPidList(); + } catch (ServiceException se) { + LOG.debug(toCommaDelimitedString(encodedRegionNames), se); throw new IOException(se); } } + + private static String toCommaDelimitedString(List list) { + return list.stream().collect(Collectors.joining(", ")); + } + + @Override + public List bypassProcedure(List pids, long waitTime, boolean override, + boolean recursive) + throws IOException { + MasterProtos.BypassProcedureResponse response = ProtobufUtil.call( + new Callable() { + @Override + public MasterProtos.BypassProcedureResponse call() throws Exception { + try { + return hbck.bypassProcedure(rpcControllerFactory.newController(), + MasterProtos.BypassProcedureRequest.newBuilder().addAllProcId(pids). + setWaitTime(waitTime).setOverride(override).setRecursive(recursive).build()); + } catch (Throwable t) { + LOG.error(pids.stream().map(i -> i.toString()). + collect(Collectors.joining(", ")), t); + throw t; + } + } + }); + return response.getBypassedList(); + } } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java index a216cdbc6b..d5fdfce1ce 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java @@ -19,6 +19,8 @@ package org.apache.hadoop.hbase.client; import java.io.Closeable; import java.io.IOException; +import java.util.List; + import org.apache.hadoop.hbase.Abortable; import org.apache.hadoop.hbase.HBaseInterfaceAudience; import org.apache.yetus.audience.InterfaceAudience; @@ -26,12 +28,14 @@ import org.apache.yetus.audience.InterfaceAudience; /** * Hbck APIs for HBase. Obtain an instance from {@link ClusterConnection#getHbck()} and call * {@link #close()} when done. - *

Hbck client APIs will be mostly used by hbck tool which in turn can be used by operators to - * fix HBase and bringging it to consistent state.

+ *

WARNING: the below methods can damage the cluster. It may leave the cluster in an + * indeterminate state, e.g. region not assigned, or some hdfs files left behind. After running + * any of the below, operators may have to do some clean up on hdfs or schedule some assign + * procedures to get regions back online. DO AT YOUR OWN RISK. For experienced users only. * * @see ConnectionFactory * @see ClusterConnection - * @since 2.2.0 + * @since 2.0.2, 2.1.1 */ @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.HBCK) public interface Hbck extends Abortable, Closeable { @@ -47,4 +51,59 @@ public interface Hbck extends Abortable, Closeable { * @return previous state of the table in Meta */ TableState setTableStateInMeta(TableState state) throws IOException; + + /** + * Like {@link Admin#assign(byte[])} but 'raw' in that it can do more than one Region at a time + * -- good if many Regions to online -- and it will schedule the assigns even in the case where + * Master is initializing (as long as the ProcedureExecutor is up). Does NOT call Coprocessor + * hooks. + * @param override You need to add the override for case where a region has previously been + * bypassed. When a Procedure has been bypassed, a Procedure will have completed + * but no other Procedure will be able to make progress on the target entity + * (intentionally). This override flag will override this fencing mechanism. + * @param encodedRegionNames Region encoded names; e.g. 1588230740 is the hard-coded encoding + * for hbase:meta region and de00010733901a05f5a2a3a382e27dd4 is an + * example of what a random user-space encoded Region name looks like. + */ + List assigns(List encodedRegionNames, boolean override) throws IOException; + + default List assigns(List encodedRegionNames) throws IOException { + return assigns(encodedRegionNames, false); + } + + /** + * Like {@link Admin#unassign(byte[], boolean)} but 'raw' in that it can do more than one Region + * at a time -- good if many Regions to offline -- and it will schedule the assigns even in the + * case where Master is initializing (as long as the ProcedureExecutor is up). Does NOT call + * Coprocessor hooks. + * @param override You need to add the override for case where a region has previously been + * bypassed. When a Procedure has been bypassed, a Procedure will have completed + * but no other Procedure will be able to make progress on the target entity + * (intentionally). This override flag will override this fencing mechanism. + * @param encodedRegionNames Region encoded names; e.g. 1588230740 is the hard-coded encoding + * for hbase:meta region and de00010733901a05f5a2a3a382e27dd4 is an + * example of what a random user-space encoded Region name looks like. + */ + List unassigns(List encodedRegionNames, boolean override) throws IOException; + + default List unassigns(List encodedRegionNames) throws IOException { + return unassigns(encodedRegionNames, false); + } + + /** + * Bypass specified procedure and move it to completion. Procedure is marked completed but + * no actual work is done from the current state/step onwards. Parents of the procedure are + * also marked for bypass. + * + * @param pids of procedures to complete. + * @param waitTime wait time in ms for acquiring lock for a procedure + * @param override if override set to true, we will bypass the procedure even if it is executing. + * This is for procedures which can't break out during execution (bugs?). 
+ * @param recursive If set, if a parent procedure, we will find and bypass children and then + * the parent procedure (Dangerous but useful in case where child procedure has been 'lost'). + * Does not always work. Experimental. + * @return true if procedure is marked for bypass successfully, false otherwise + */ + List bypassProcedure(List pids, long waitTime, boolean override, boolean recursive) + throws IOException; } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java index 19b9fc8e2f..020ca73812 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.apache.hadoop.hbase.CellScannable; import org.apache.hadoop.hbase.ClusterMetrics.Option; @@ -1879,4 +1880,27 @@ public final class RequestConverter { } return pbServers; } + + // HBCK2 + public static MasterProtos.AssignsRequest toAssignRegionsRequest( + List encodedRegionNames, boolean override) { + MasterProtos.AssignsRequest.Builder b = MasterProtos.AssignsRequest.newBuilder(); + return b.addAllRegion(toEncodedRegionNameRegionSpecifiers(encodedRegionNames)). + setOverride(override).build(); + } + + public static MasterProtos.UnassignsRequest toUnassignRegionsRequest( + List encodedRegionNames, boolean override) { + MasterProtos.UnassignsRequest.Builder b = + MasterProtos.UnassignsRequest.newBuilder(); + return b.addAllRegion(toEncodedRegionNameRegionSpecifiers(encodedRegionNames)). + setOverride(override).build(); + } + + private static List toEncodedRegionNameRegionSpecifiers( + List encodedRegionNames) { + return encodedRegionNames.stream(). + map(r -> buildRegionSpecifier(RegionSpecifierType.ENCODED_REGION_NAME, Bytes.toBytes(r))). + collect(Collectors.toList()); + } } diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java index dcf6626ae4..c15cfb2cc7 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java @@ -28,6 +28,10 @@ import org.apache.yetus.audience.InterfaceAudience; public class RetryCounterFactory { private final RetryConfig retryConfig; + public RetryCounterFactory(int sleepIntervalMillis) { + this(Integer.MAX_VALUE, sleepIntervalMillis); + } + public RetryCounterFactory(int maxAttempts, int sleepIntervalMillis) { this(maxAttempts, sleepIntervalMillis, -1); } diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java index 08c7ce3bab..2b66961d7d 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java @@ -145,12 +145,16 @@ public abstract class Procedure implements Comparable

Bypassing a procedure is not like aborting. Aborting a procedure will trigger * a rollback. And since the {@link #abort(Object)} method is overrideable * Some procedures may have chosen to ignore the aborting. */ @@ -175,12 +179,15 @@ public abstract class Procedure implements Comparable implements Comparable { private Configuration conf; /** - * Created in the {@link #start(int, boolean)} method. Destroyed in {@link #join()} (FIX! Doing + * Created in the {@link #init(int, boolean)} method. Destroyed in {@link #join()} (FIX! Doing * resource handling rather than observing in a #join is unexpected). * Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery * (Should be ok). @@ -314,7 +314,7 @@ public class ProcedureExecutor { private ThreadGroup threadGroup; /** - * Created in the {@link #start(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing + * Created in the {@link #init(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing * resource handling rather than observing in a #join is unexpected). * Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery * (Should be ok). @@ -322,7 +322,7 @@ public class ProcedureExecutor { private CopyOnWriteArrayList workerThreads; /** - * Created in the {@link #start(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing + * Created in the {@link #init(int, boolean)} method. Terminated in {@link #join()} (FIX! Doing * resource handling rather than observing in a #join is unexpected). * Overridden when we do the ProcedureTestingUtility.testRecoveryAndDoubleExecution trickery * (Should be ok). @@ -966,7 +966,7 @@ public class ProcedureExecutor { * Bypass a procedure. If the procedure is set to bypass, all the logic in * execute/rollback will be ignored and it will return success, whatever. * It is used to recover buggy stuck procedures, releasing the lock resources - * and letting other procedures to run. Bypassing one procedure (and its ancestors will + * and letting other procedures run. Bypassing one procedure (and its ancestors will * be bypassed automatically) may leave the cluster in a middle state, e.g. region * not assigned, or some hdfs files left behind. After getting rid of those stuck procedures, * the operators may have to do some clean up on hdfs or schedule some assign procedures @@ -985,7 +985,7 @@ public class ProcedureExecutor { *

* If the procedure is in WAITING state, will set it to RUNNABLE add it to run queue. * TODO: What about WAITING_TIMEOUT? - * @param id the procedure id + * @param pids the procedure ids * @param lockWait time to wait lock * @param force if force set to true, we will bypass the procedure even if it is executing. * This is for procedures which can't break out during executing(due to bug, mostly) @@ -993,26 +993,38 @@ public class ProcedureExecutor { * there. We need to restart the master after bypassing, and letting the problematic * procedure to execute wth bypass=true, so in that condition, the procedure can be * successfully bypassed. + * @param recursive We will do an expensive search for children of each pid. EXPENSIVE! * @return true if bypass success * @throws IOException IOException */ - public boolean bypassProcedure(long id, long lockWait, boolean force) throws IOException { - Procedure procedure = getProcedure(id); + public List bypassProcedure(List pids, long lockWait, boolean force, + boolean recursive) + throws IOException { + List result = new ArrayList(pids.size()); + for(long pid: pids) { + result.add(bypassProcedure(pid, lockWait, force, recursive)); + } + return result; + } + + boolean bypassProcedure(long pid, long lockWait, boolean override, boolean recursive) + throws IOException { + final Procedure procedure = getProcedure(pid); if (procedure == null) { - LOG.debug("Procedure with id={} does not exist, skipping bypass", id); + LOG.debug("Procedure pid={} does not exist, skipping bypass", pid); return false; } - LOG.debug("Begin bypass {} with lockWait={}, force={}", procedure, lockWait, force); - + LOG.debug("Begin bypass {} with lockWait={}, override={}, recursive={}", + procedure, lockWait, override, recursive); IdLock.Entry lockEntry = procExecutionLock.tryLockEntry(procedure.getProcId(), lockWait); - if (lockEntry == null && !force) { + if (lockEntry == null && !override) { LOG.debug("Waited {} ms, but {} is still running, skipping bypass with force={}", - lockWait, procedure, force); + lockWait, procedure, override); return false; } else if (lockEntry == null) { LOG.debug("Waited {} ms, but {} is still running, begin bypass with force={}", - lockWait, procedure, force); + lockWait, procedure, override); } try { // check whether the procedure is already finished @@ -1022,8 +1034,29 @@ public class ProcedureExecutor { } if (procedure.hasChildren()) { - LOG.debug("{} has children, skipping bypass", procedure); - return false; + if (recursive) { + // EXPENSIVE. Checks each live procedure of which there could be many!!! + // Is there another way to get children of a procedure? + LOG.info("Recursive bypass on children of pid={}", procedure.getProcId()); + this.procedures.forEachValue(1 /*Single-threaded*/, + // Transformer + v -> { + return v.getParentProcId() == procedure.getProcId()? v: null; + }, + // Consumer + v -> { + boolean result = false; + IOException ioe = null; + try { + result = bypassProcedure(v.getProcId(), lockWait, override, recursive); + } catch (IOException e) { + LOG.warn("Recursive bypass of pid={}", v.getProcId(), e); + } + }); + } else { + LOG.debug("{} has children, skipping bypass", procedure); + return false; + } } // If the procedure has no parent or no child, we are safe to bypass it in whatever state @@ -1033,6 +1066,7 @@ public class ProcedureExecutor { LOG.debug("Bypassing procedures in RUNNABLE, WAITING and WAITING_TIMEOUT states " + "(with no parent), {}", procedure); + // Question: how is the bypass done here? 
return false; } @@ -1042,7 +1076,7 @@ public class ProcedureExecutor { Procedure current = procedure; while (current != null) { LOG.debug("Bypassing {}", current); - current.bypass(); + current.bypass(getEnvironment()); store.update(procedure); long parentID = current.getParentProcId(); current = getProcedure(parentID); diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureUtil.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureUtil.java index 27802efecd..b7362200d0 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureUtil.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureUtil.java @@ -272,7 +272,7 @@ public final class ProcedureUtil { } if (proto.getBypass()) { - proc.bypass(); + proc.bypass(null); } ProcedureStateSerializer serializer = null; diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureException.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureException.java index 6f4795d9dc..612ccf27cc 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureException.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureException.java @@ -21,7 +21,6 @@ import java.io.IOException; import org.apache.hadoop.ipc.RemoteException; import org.apache.yetus.audience.InterfaceAudience; -import org.apache.yetus.audience.InterfaceStability; import org.apache.hadoop.hbase.shaded.protobuf.generated.ErrorHandlingProtos.ForeignExceptionMessage; import org.apache.hadoop.hbase.util.ForeignExceptionUtil; @@ -37,7 +36,6 @@ import org.apache.hadoop.hbase.util.ForeignExceptionUtil; * their stacks traces and messages overridden to reflect the original 'remote' exception. */ @InterfaceAudience.Private -@InterfaceStability.Evolving @SuppressWarnings("serial") public class RemoteProcedureException extends ProcedureException { @@ -74,6 +72,10 @@ public class RemoteProcedureException extends ProcedureException { return new Exception(cause); } + // NOTE: Does not throw DoNotRetryIOE because it does not + // have access (DNRIOE is in the client module). Use + // MasterProcedureUtil.unwrapRemoteIOException if need to + // throw DNRIOE. 
public IOException unwrapRemoteIOException() { final Exception cause = unwrapRemoteException(); if (cause instanceof IOException) { diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureBypass.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureBypass.java index d58d57e76e..739d1612c3 100644 --- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureBypass.java +++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureBypass.java @@ -87,7 +87,7 @@ public class TestProcedureBypass { long id = procExecutor.submitProcedure(proc); Thread.sleep(500); //bypass the procedure - assertTrue(procExecutor.bypassProcedure(id, 30000, false)); + assertTrue(procExecutor.bypassProcedure(id, 30000, false, false)); htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass()); LOG.info("{} finished", proc); } @@ -98,7 +98,7 @@ public class TestProcedureBypass { long id = procExecutor.submitProcedure(proc); Thread.sleep(500); //bypass the procedure - assertTrue(procExecutor.bypassProcedure(id, 1000, true)); + assertTrue(procExecutor.bypassProcedure(id, 1000, true, false)); //Since the procedure is stuck there, we need to restart the executor to recovery. ProcedureTestingUtility.restart(procExecutor); htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass()); @@ -114,12 +114,24 @@ public class TestProcedureBypass { .size() > 0); SuspendProcedure suspendProcedure = (SuspendProcedure)procExecutor.getProcedures().stream() .filter(p -> p.getParentProcId() == rootId).collect(Collectors.toList()).get(0); - assertTrue(procExecutor.bypassProcedure(suspendProcedure.getProcId(), 1000, false)); + assertTrue(procExecutor.bypassProcedure(suspendProcedure.getProcId(), 1000, false, false)); htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass()); LOG.info("{} finished", proc); } - + @Test + public void testBypassingProcedureWithParentRecursive() throws Exception { + final RootProcedure proc = new RootProcedure(); + long rootId = procExecutor.submitProcedure(proc); + htu.waitFor(5000, () -> procExecutor.getProcedures().stream() + .filter(p -> p.getParentProcId() == rootId).collect(Collectors.toList()) + .size() > 0); + SuspendProcedure suspendProcedure = (SuspendProcedure)procExecutor.getProcedures().stream() + .filter(p -> p.getParentProcId() == rootId).collect(Collectors.toList()).get(0); + assertTrue(procExecutor.bypassProcedure(rootId, 1000, false, true)); + htu.waitFor(5000, () -> proc.isSuccess() && proc.isBypass()); + LOG.info("{} finished", proc); + } @AfterClass public static void tearDown() throws Exception { @@ -179,7 +191,4 @@ public class TestProcedureBypass { } } } - - - } diff --git a/hbase-protocol-shaded/src/main/protobuf/Master.proto b/hbase-protocol-shaded/src/main/protobuf/Master.proto index ca8d915705..6b37006350 100644 --- a/hbase-protocol-shaded/src/main/protobuf/Master.proto +++ b/hbase-protocol-shaded/src/main/protobuf/Master.proto @@ -99,6 +99,7 @@ message MergeTableRegionsResponse { message AssignRegionRequest { required RegionSpecifier region = 1; + optional bool override = 2 [default = false]; } message AssignRegionResponse { @@ -993,8 +994,78 @@ service MasterService { } +// HBCK Service definitions. + + +/** Like Admin's AssignRegionRequest except it can + * take one or more Regions at a time. + */ +// NOTE: In hbck.proto, there is a define for +// AssignRegionRequest -- singular 'Region'. This +// is plural to convey it can carry more than one +// Region at a time. 
+message AssignsRequest { + repeated RegionSpecifier region = 1; + optional bool override = 2 [default = false]; +} + +/** Like Admin's AssignRegionResponse except it can + * return one or more pids as result -- one per assign. + */ +message AssignsResponse { + repeated uint64 pid = 1; +} + +/** Like Admin's UnassignRegionRequest except it can + * take one or more Regions at a time. + */ +message UnassignsRequest { + repeated RegionSpecifier region = 1; + optional bool override = 2 [default = false]; +} + +/** Like Admin's UnassignRegionResponse except it can + * return one or more pids as result -- one per unassign. + */ +message UnassignsResponse { + repeated uint64 pid = 1; +} + +message BypassProcedureRequest { + repeated uint64 proc_id = 1; + optional uint64 waitTime = 2; // wait time in ms to acquire lock on a procedure + optional bool override = 3 [default = false]; // if true, procedure is marked for bypass even if its executing + optional bool recursive = 4; +} + +message BypassProcedureResponse { + repeated bool bypassed = 1; +} + service HbckService { /** Update state of the table in meta only*/ rpc SetTableStateInMeta(SetTableStateInMetaRequest) returns(GetTableStateResponse); + + /** + * Assign regions. + * Like Admin's assign but works even if the + * Master is initializing. Also allows bulk'ing up + * assigns rather than one region at a time. + */ + rpc Assigns(AssignsRequest) + returns(AssignsResponse); + + /** + * Unassign regions + * Like Admin's unssign but works even if the + * Master is initializing. Also allows bulk'ing up + * assigns rather than one region at a time. + */ + rpc Unassigns(UnassignsRequest) + returns(UnassignsResponse); + + /** Bypass a procedure to completion, procedure is completed but no actual work is done*/ + rpc BypassProcedure(BypassProcedureRequest) + returns(BypassProcedureResponse); } diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto index 3826548deb..37ed1948f7 100644 --- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto @@ -328,6 +328,7 @@ message AssignRegionStateData { optional ServerName target_server = 4; // Current attempt index used for expotential backoff when stuck optional int32 attempt = 5; + optional bool override = 6 [default = false]; } message UnassignRegionStateData { @@ -339,6 +340,7 @@ message UnassignRegionStateData { // This is the server currently hosting the Region, the // server we will send the unassign rpc too. optional ServerName hosting_server = 5; + // We hijacked an old param named 'force' and use it as 'override'. 
optional bool force = 4 [default = false]; optional bool remove_after_unassigning = 6 [default = false]; // Current attempt index used for expotential backoff when stuck diff --git a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java index 06d684c523..e32542b350 100644 --- a/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java +++ b/hbase-rsgroup/src/main/java/org/apache/hadoop/hbase/rsgroup/RSGroupInfoManagerImpl.java @@ -65,6 +65,7 @@ import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.ServerListener; import org.apache.hadoop.hbase.master.TableStateManager; import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil; import org.apache.hadoop.hbase.net.Address; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.protobuf.ProtobufMagic; @@ -869,7 +870,7 @@ final class RSGroupInfoManagerImpl implements RSGroupInfoManager { Procedure result = masterServices.getMasterProcedureExecutor().getResult(procId); if (result != null && result.isFailed()) { throw new IOException("Failed to create group table. " + - result.getException().unwrapRemoteIOException()); + MasterProcedureUtil.unwrapRemoteIOException(result)); } } } diff --git a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRSGroupBasedLoadBalancer.java b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRSGroupBasedLoadBalancer.java index 0ef0a0118c..67923602f7 100644 --- a/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRSGroupBasedLoadBalancer.java +++ b/hbase-rsgroup/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRSGroupBasedLoadBalancer.java @@ -49,6 +49,7 @@ import org.apache.hadoop.hbase.master.LoadBalancer; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.RegionPlan; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; +import org.apache.hadoop.hbase.master.assignment.RegionStates; import org.apache.hadoop.hbase.net.Address; import org.apache.hadoop.hbase.rsgroup.RSGroupBasedLoadBalancer; import org.apache.hadoop.hbase.rsgroup.RSGroupInfo; @@ -569,6 +570,8 @@ public class TestRSGroupBasedLoadBalancer { Mockito.when(services.getTableDescriptors()).thenReturn(tds); AssignmentManager am = Mockito.mock(AssignmentManager.class); Mockito.when(services.getAssignmentManager()).thenReturn(am); + RegionStates rss = Mockito.mock(RegionStates.class); + Mockito.when(am.getRegionStates()).thenReturn(rss); return services; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index d5167c88d0..a3bb01546d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -179,6 +179,8 @@ import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.IdLock; import org.apache.hadoop.hbase.util.ModifyRegionUtils; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.VersionInfo; import 
org.apache.hadoop.hbase.zookeeper.LoadBalancerTracker; @@ -924,7 +926,20 @@ public class HMaster extends HRegionServer implements MasterServices { return; } - //Initialize after meta as it scans meta + status.setStatus("Starting assignment manager"); + // FIRST HBASE:META READ!!!! + // The below cannot make progress w/o hbase:meta being online. + // This is the FIRST attempt at going to hbase:meta. Meta on-lining is going on in background + // as procedures run -- in particular SCPs for crashed servers... One should put up hbase:meta + // if it is down. It may take a while to come online. So, wait here until meta if for sure + // available. Thats what waitUntilMetaOnline does. + if (!waitForMetaOnline()) { + return; + } + this.assignmentManager.joinCluster(); + // The below depends on hbase:meta being online. + this.tableStateManager.start(); + // Initialize after meta is up as below scans meta if (favoredNodesManager != null) { SnapshotOfRegionAssignmentFromMeta snapshotOfRegionAssignment = new SnapshotOfRegionAssignmentFromMeta(getConnection()); @@ -950,6 +965,13 @@ public class HMaster extends HRegionServer implements MasterServices { this.catalogJanitorChore = new CatalogJanitor(this); getChoreService().scheduleChore(catalogJanitorChore); + // NAMESPACE READ!!!! + // Here we expect hbase:namespace to be online. See inside initClusterSchemaService. + // TODO: Fix this. Namespace is a pain being a sort-of system table. Fold it in to hbase:meta. + // isNamespace does like isMeta and waits until namespace is onlined before allowing progress. + if (!waitForNamespaceOnline()) { + return; + } status.setStatus("Starting cluster schema service"); initClusterSchemaService(); @@ -1025,6 +1047,68 @@ public class HMaster extends HRegionServer implements MasterServices { } } + /** + * Check hbase:meta is up and ready for reading. For use during Master startup only. + * @return True if meta is UP and online and startup can progress. Otherwise, meta is not online + * and we will hold here until operator intervention. + */ + @VisibleForTesting + public boolean waitForMetaOnline() throws InterruptedException { + return isRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO); + } + + /** + * @return True if region is online and scannable else false if an error or shutdown (Otherwise + * we just block in here holding up all forward-progess). + */ + private boolean isRegionOnline(RegionInfo ri) throws InterruptedException { + RetryCounter rc = null; + while (!isStopped()) { + RegionState rs = this.assignmentManager.getRegionStates().getRegionState(ri); + if (rs.isOpened()) { + if (this.getServerManager().isServerOnline(rs.getServerName())) { + return true; + } + } + // Region is not OPEN. + Optional> optProc = this.procedureExecutor.getProcedures(). + stream().filter(p -> p instanceof ServerCrashProcedure).findAny(); + // TODO: Add a page to refguide on how to do repair. Have this log message point to it. + // Page will talk about loss of edits, how to schedule at least the meta WAL recovery, and + // then how to assign including how to break region lock if one held. + LOG.warn("{} is NOT online; state={}; ServerCrashProcedures={}. Master startup cannot " + + "progress, in holding-pattern until region onlined.", + ri.getRegionNameAsString(), rs, optProc.isPresent()); + // Check once-a-minute. + if (rc == null) { + rc = new RetryCounterFactory(1000).create(); + } + Threads.sleep(rc.getBackoffTimeAndIncrementAttempts()); + } + return false; + } + + /** + * Check hbase:namespace table is assigned. 
If not, startup will hang looking for the ns table + * (TODO: Fix this! NS should not hold-up startup). + * @return True if namespace table is up/online. + */ + @VisibleForTesting + public boolean waitForNamespaceOnline() throws InterruptedException { + List ris = this.assignmentManager.getRegionStates(). + getRegionsOfTable(TableName.NAMESPACE_TABLE_NAME); + if (ris.isEmpty()) { + // If empty, means we've not assigned the namespace table yet... Just return true so startup + // continues and the namespace table gets created. + return true; + } + // Else there are namespace regions up in meta. Ensure they are assigned before we go on. + for (RegionInfo ri: ris) { + isRegionOnline(ri); + } + return true; + } + /** * Adds the {@code MasterQuotasObserver} to the list of configured Master observers to * automatically remove quotas for a table when that table is deleted. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java index a0b6d8d0a4..48a41ee641 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.function.BiFunction; import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ClusterMetricsBuilder; @@ -567,7 +568,7 @@ public class MasterRpcServices extends RSRpcServices BalanceRequest request) throws ServiceException { try { return BalanceResponse.newBuilder().setBalancerRan(master.balance( - request.hasForce() ? request.getForce() : false)).build(); + request.hasForce()? request.getForce(): false)).build(); } catch (IOException ex) { throw new ServiceException(ex); } @@ -1153,7 +1154,8 @@ public class MasterRpcServices extends RSRpcServices if (executor.isFinished(procId)) { builder.setState(GetProcedureResultResponse.State.FINISHED); if (result.isFailed()) { - IOException exception = result.getException().unwrapRemoteIOException(); + IOException exception = + MasterProcedureUtil.unwrapRemoteIOException(result); builder.setException(ForeignExceptionUtil.toProtoForeignException(exception)); } byte[] resultData = result.getResult(); @@ -2266,4 +2268,122 @@ public class MasterRpcServices extends RSRpcServices throw new ServiceException(e); } } + + /** + * Get RegionInfo from Master using content of RegionSpecifier as key. + * @return RegionInfo found by decoding rs or null if none found + */ + private RegionInfo getRegionInfo(HBaseProtos.RegionSpecifier rs) throws UnknownRegionException { + RegionInfo ri = null; + switch(rs.getType()) { + case REGION_NAME: + final byte[] regionName = rs.getValue().toByteArray(); + ri = this.master.getAssignmentManager().getRegionInfo(regionName); + break; + case ENCODED_REGION_NAME: + String encodedRegionName = Bytes.toString(rs.getValue().toByteArray()); + RegionState regionState = this.master.getAssignmentManager().getRegionStates(). + getRegionState(encodedRegionName); + ri = regionState == null? null: regionState.getRegion(); + break; + default: + break; + } + return ri; + } + + /** + * Submit the Procedure that gets created by f + * @return pid of the submitted Procedure. 
+ */ + private long submitProcedure(HBaseProtos.RegionSpecifier rs, boolean override, + BiFunction f) + throws UnknownRegionException { + RegionInfo ri = getRegionInfo(rs); + long pid = Procedure.NO_PROC_ID; + if (ri == null) { + LOG.warn("No RegionInfo found to match {}", rs); + } else { + pid = this.master.getMasterProcedureExecutor().submitProcedure(f.apply(ri, override)); + } + return pid; + } + + /** + * A 'raw' version of assign that does bulk and skirts Master state checks (assigns can be made + * during Master startup). For use by Hbck2. + */ + @Override + public MasterProtos.AssignsResponse assigns(RpcController controller, + MasterProtos.AssignsRequest request) + throws ServiceException { + LOG.info(master.getClientIdAuditPrefix() + " assigns"); + if (this.master.getMasterProcedureExecutor() == null) { + throw new ServiceException("Master's ProcedureExecutor not initialized; retry later"); + } + MasterProtos.AssignsResponse.Builder responseBuilder = + MasterProtos.AssignsResponse.newBuilder(); + try { + boolean override = request.getOverride(); + for (HBaseProtos.RegionSpecifier rs: request.getRegionList()) { + long pid = submitProcedure(rs, override, + (r, b) -> this.master.getAssignmentManager().createAssignProcedure(r, b)); + responseBuilder.addPid(pid); + } + return responseBuilder.build(); + } catch (IOException ioe) { + throw new ServiceException(ioe); + } + } + + /** + * A 'raw' version of unassign that does bulk and skirts Master state checks (unassigns can be + * made during Master startup). For use by Hbck2. + */ + @Override + public MasterProtos.UnassignsResponse unassigns(RpcController controller, + MasterProtos.UnassignsRequest request) + throws ServiceException { + LOG.info(master.getClientIdAuditPrefix() + " unassigns"); + if (this.master.getMasterProcedureExecutor() == null) { + throw new ServiceException("Master's ProcedureExecutor not initialized; retry later"); + } + MasterProtos.UnassignsResponse.Builder responseBuilder = + MasterProtos.UnassignsResponse.newBuilder(); + try { + boolean override = request.getOverride(); + for (HBaseProtos.RegionSpecifier rs: request.getRegionList()) { + long pid = submitProcedure(rs, override, + (r, b) -> this.master.getAssignmentManager().createUnassignProcedure(r, b)); + responseBuilder.addPid(pid); + } + return responseBuilder.build(); + } catch (IOException ioe) { + throw new ServiceException(ioe); + } + } + + /** + * Bypass specified procedure to completion. Procedure is marked completed but no actual work + * is done from the current state/ step onwards. Parents of the procedure are also marked for + * bypass. + * + * NOTE: this is a dangerous operation and may be used to unstuck buggy procedures. This may + * leave system in inconherent state. This may need to be followed by some cleanup steps/ + * actions by operator. 
+ * + * @return BypassProcedureToCompletionResponse indicating success or failure + */ + @Override + public MasterProtos.BypassProcedureResponse bypassProcedure(RpcController controller, + MasterProtos.BypassProcedureRequest request) throws ServiceException { + try { + List ret = + master.getMasterProcedureExecutor().bypassProcedure(request.getProcIdList(), + request.getWaitTime(), request.getOverride(), request.getRecursive()); + return MasterProtos.BypassProcedureResponse.newBuilder().addAllBypassed(ret).build(); + } catch (IOException e) { + throw new ServiceException(e); + } + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index 55aee4ad07..ac4a67f323 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -99,12 +99,16 @@ public class AssignProcedure extends RegionTransitionProcedure { } public AssignProcedure(final RegionInfo regionInfo) { - super(regionInfo); - this.targetServer = null; + this(regionInfo, null); } public AssignProcedure(final RegionInfo regionInfo, final ServerName destinationServer) { - super(regionInfo); + this(regionInfo, destinationServer, false); + } + + public AssignProcedure(final RegionInfo regionInfo, final ServerName destinationServer, + boolean override) { + super(regionInfo, override); this.targetServer = destinationServer; } @@ -138,6 +142,9 @@ public class AssignProcedure extends RegionTransitionProcedure { if (getAttempt() > 0) { state.setAttempt(getAttempt()); } + if (isOverride()) { + state.setOverride(isOverride()); + } serializer.serialize(state.build()); } @@ -148,6 +155,7 @@ public class AssignProcedure extends RegionTransitionProcedure { setTransitionState(state.getTransitionState()); setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo())); forceNewPlan = state.getForceNewPlan(); + setOverride(state.getOverride()); if (state.hasTargetServer()) { this.targetServer = ProtobufUtil.toServerName(state.getTargetServer()); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 5d3de65b9a..8af98cd212 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -660,23 +660,39 @@ public class AssignmentManager implements ServerListener { * Called by things like DisableTableProcedure to get a list of UnassignProcedure * to unassign the regions of the table. 
*/ - public UnassignProcedure[] createUnassignProcedures(final TableName tableName) { - return createUnassignProcedures(regionStates.getTableRegionStateNodes(tableName)); + public AssignProcedure createAssignProcedure(final RegionInfo regionInfo) { + return createAssignProcedure(regionInfo, null, false); } - public AssignProcedure createAssignProcedure(final RegionInfo regionInfo) { - AssignProcedure proc = new AssignProcedure(regionInfo); - proc.setOwner(getProcedureEnvironment().getRequestUser().getShortName()); - return proc; + public AssignProcedure createAssignProcedure(final RegionInfo regionInfo, boolean override) { + return createAssignProcedure(regionInfo, null, override); + } + + public AssignProcedure createAssignProcedure(final RegionInfo regionInfo, + ServerName targetServer) { + return createAssignProcedure(regionInfo, targetServer, false); } public AssignProcedure createAssignProcedure(final RegionInfo regionInfo, - final ServerName targetServer) { - AssignProcedure proc = new AssignProcedure(regionInfo, targetServer); + final ServerName targetServer, boolean override) { + AssignProcedure proc = new AssignProcedure(regionInfo, targetServer, override); proc.setOwner(getProcedureEnvironment().getRequestUser().getShortName()); return proc; } + public UnassignProcedure createUnassignProcedure(final RegionInfo regionInfo) { + return createUnassignProcedure(regionInfo, null, false); + } + + public UnassignProcedure createUnassignProcedure(final RegionInfo regionInfo, + boolean override) { + return createUnassignProcedure(regionInfo, null, override); + } + + public UnassignProcedure[] createUnassignProcedures(final TableName tableName) { + return createUnassignProcedures(regionStates.getTableRegionStateNodes(tableName)); + } + UnassignProcedure createUnassignProcedure(final RegionInfo regionInfo, final ServerName destinationServer, final boolean force) { return createUnassignProcedure(regionInfo, destinationServer, force, false); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java index 8f696989d7..b226f3028f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java @@ -36,6 +36,7 @@ import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.procedure2.ProcedureUtil; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure; +import org.apache.hadoop.hbase.procedure2.RemoteProcedureException; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -112,6 +113,12 @@ public abstract class RegionTransitionProcedure */ private RegionInfo regionInfo; + /** + * this data member must also be persisted. + * @see #regionInfo + */ + private boolean override; + /** * Like {@link #regionInfo}, the expectation is that subclasses persist the value of this * data member. It is used doing backoff when Procedure gets stuck. 
@@ -121,8 +128,9 @@ public abstract class RegionTransitionProcedure // Required by the Procedure framework to create the procedure on replay public RegionTransitionProcedure() {} - public RegionTransitionProcedure(final RegionInfo regionInfo) { + public RegionTransitionProcedure(final RegionInfo regionInfo, boolean override) { this.regionInfo = regionInfo; + this.override = override; } @VisibleForTesting @@ -135,7 +143,7 @@ public abstract class RegionTransitionProcedure * {@link #deserializeStateData(ProcedureStateSerializer)} method. Expectation is that * subclasses will persist `regioninfo` in their * {@link #serializeStateData(ProcedureStateSerializer)} method and then restore `regionInfo` on - * deserialization by calling. + * deserialization by calling this. */ protected void setRegionInfo(final RegionInfo regionInfo) { this.regionInfo = regionInfo; @@ -143,9 +151,21 @@ public abstract class RegionTransitionProcedure /** * This setter is for subclasses to call in their - * {@link #deserializeStateData(ProcedureStateSerializer)} method. - * @see #setRegionInfo(RegionInfo) + * {@link #deserializeStateData(ProcedureStateSerializer)} method. Expectation is that + * subclasses will persist `override` in their + * {@link #serializeStateData(ProcedureStateSerializer)} method and then restore `override` on + * deserialization by calling this. */ + protected void setOverride(boolean override) { + this.override = override; + } + + + /** + * This setter is for subclasses to call in their + * {@link #deserializeStateData(ProcedureStateSerializer)} method. + * @see #setRegionInfo(RegionInfo) + */ protected void setAttempt(int attempt) { this.attempt = attempt; } @@ -171,6 +191,11 @@ public abstract class RegionTransitionProcedure sb.append(getTableName()); sb.append(", region="); sb.append(getRegionInfo() == null? null: getRegionInfo().getEncodedName()); + if (isOverride()) { + // Only log if set. + sb.append(", override="); + sb.append(isOverride()); + } } public RegionStateNode getRegionState(final MasterProcedureEnv env) { @@ -315,12 +340,19 @@ public abstract class RegionTransitionProcedure final AssignmentManager am = env.getAssignmentManager(); final RegionStateNode regionNode = getRegionState(env); if (!am.addRegionInTransition(regionNode, this)) { - String msg = String.format( - "There is already another procedure running on this region this=%s owner=%s", - this, regionNode.getProcedure()); - LOG.warn(msg + " " + this + "; " + regionNode.toShortString()); - setAbortFailure(getClass().getSimpleName(), msg); - return null; + if (this.isOverride()) { + LOG.info("{} owned by pid={}, OVERRIDDEN by 'this' (pid={}, override=true).", + regionNode.getRegionInfo().getEncodedName(), + regionNode.getProcedure().getProcId(), getProcId()); + regionNode.unsetProcedure(regionNode.getProcedure()); + } else { + String msg = String.format("%s owned by pid=%d, CANNOT run 'this' (pid=%d).", + regionNode.getRegionInfo().getEncodedName(), + regionNode.getProcedure().getProcId(), getProcId()); + LOG.warn(msg); + setAbortFailure(getClass().getSimpleName(), msg); + return null; + } } try { boolean retry; @@ -432,8 +464,12 @@ public abstract class RegionTransitionProcedure // TODO: Revisit this and move it to the executor if (env.getProcedureScheduler().waitRegion(this, getRegionInfo())) { try { - LOG.debug(LockState.LOCK_EVENT_WAIT + " pid=" + getProcId() + " " + - env.getProcedureScheduler().dumpLocks()); + // Enable TRACE on this class to see lock dump. 
Can be really large when cluster is big + // or big tables being enabled/disabled. + if (LOG.isTraceEnabled()) { + LOG.trace("{} pid={} {}", LockState.LOCK_EVENT_WAIT, getProcId(), + env.getProcedureScheduler().dumpLocks()); + } } catch (IOException e) { // ignore, just for logging } @@ -464,4 +500,23 @@ public abstract class RegionTransitionProcedure * @return ServerName the Assign or Unassign is going against. */ public abstract ServerName getServer(final MasterProcedureEnv env); + + @Override + protected void bypass(MasterProcedureEnv env) { + // This override is just so I can write a note on how bypass is done in + // RTP. For RTP procedures -- i.e. assign/unassign -- if bypass is called, + // we intentionally do NOT cleanup our state. We leave a reference to the + // bypassed Procedure in the RegionStateNode. Doing this makes it so the + // RSN is in an odd state. The bypassed Procedure is finished but no one + // else can make progress on this RSN entity (see the #execute above where + // we check the RSN to see if an already registered procedure and if so, + // we exit without proceeding). This is done to intentionally block + // subsequent Procedures from running. Only a Procedure with the 'override' flag + // set can overwrite the RSN and make progress. + super.bypass(env); + } + + boolean isOverride() { + return this.override; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java index 4f58a0f305..b001885ee9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java @@ -84,10 +84,6 @@ public class UnassignProcedure extends RegionTransitionProcedure { */ protected volatile ServerName destinationServer; - // TODO: should this be in a reassign procedure? - // ...and keep unassign for 'disable' case? - private boolean force; - /** * Whether deleting the region from in-memory states after unassigning the region. */ @@ -109,12 +105,11 @@ public class UnassignProcedure extends RegionTransitionProcedure { } public UnassignProcedure(final RegionInfo regionInfo, final ServerName hostingServer, - final ServerName destinationServer, final boolean force, + final ServerName destinationServer, final boolean override, final boolean removeAfterUnassigning) { - super(regionInfo); + super(regionInfo, override); this.hostingServer = hostingServer; this.destinationServer = destinationServer; - this.force = force; this.removeAfterUnassigning = removeAfterUnassigning; // we don't need REGION_TRANSITION_QUEUE, we jump directly to sending the request @@ -147,7 +142,7 @@ public class UnassignProcedure extends RegionTransitionProcedure { if (this.destinationServer != null) { state.setDestinationServer(ProtobufUtil.toServerName(destinationServer)); } - if (force) { + if (isOverride()) { state.setForce(true); } if (removeAfterUnassigning) { @@ -167,7 +162,8 @@ public class UnassignProcedure extends RegionTransitionProcedure { setTransitionState(state.getTransitionState()); setRegionInfo(ProtobufUtil.toRegionInfo(state.getRegionInfo())); this.hostingServer = ProtobufUtil.toServerName(state.getHostingServer()); - force = state.getForce(); + // The 'force' flag is the override flag in unassign. 
+ setOverride(state.getForce()); if (state.hasDestinationServer()) { this.destinationServer = ProtobufUtil.toServerName(state.getDestinationServer()); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java index 4c6ba99e76..20e443a774 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java @@ -50,6 +50,8 @@ import org.apache.hadoop.hbase.master.LoadBalancer; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.RackManager; import org.apache.hadoop.hbase.master.RegionPlan; +import org.apache.hadoop.hbase.master.assignment.AssignmentManager; +import org.apache.hadoop.hbase.master.assignment.RegionStates; import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer.Cluster.Action.Type; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureUtil.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureUtil.java index 58263d3044..1e488b6185 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureUtil.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureUtil.java @@ -19,9 +19,12 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; import java.util.regex.Pattern; + +import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.procedure2.ProcedureException; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.util.NonceKey; @@ -176,4 +179,17 @@ public final class MasterProcedureUtil { public static int getServerPriority(ServerProcedureInterface proc) { return proc.hasMetaTableRegion() ? 100 : 1; } + + /** + * This is a version of unwrapRemoteIOException that can do DoNotRetryIOE. + * We need to throw DNRIOE to clients if a failed Procedure else they will + * keep trying. The default proc.getException().unwrapRemoteException + * doesn't have access to DNRIOE from the procedure2 module. + */ + public static IOException unwrapRemoteIOException(Procedure proc) { + Exception e = proc.getException().unwrapRemoteException(); + // Do not retry ProcedureExceptions! + return (e instanceof ProcedureException)? 
new DoNotRetryIOException(e): + proc.getException().unwrapRemoteIOException(); + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureDescriber.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureDescriber.java index 41b30ad5a2..f9fdad8aba 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureDescriber.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureDescriber.java @@ -69,7 +69,8 @@ public class ProcedureDescriber { description.put("LAST_UPDATE", new Date(proc.getLastUpdate())); if (proc.isFailed()) { - description.put("ERRORS", proc.getException().unwrapRemoteIOException().getMessage()); + description.put("ERRORS", + MasterProcedureUtil.unwrapRemoteIOException(proc).getMessage()); } description.put("PARAMETERS", parametersToObject(proc)); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedurePrepareLatch.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedurePrepareLatch.java index 283cde0a72..0cc4dbd5da 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedurePrepareLatch.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedurePrepareLatch.java @@ -99,7 +99,7 @@ public abstract class ProcedurePrepareLatch { @Override protected void countDown(final Procedure proc) { if (proc.hasException()) { - exception = proc.getException().unwrapRemoteIOException(); + exception = MasterProcedureUtil.unwrapRemoteIOException(proc); } latch.countDown(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureSyncWait.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureSyncWait.java index f72905bb52..c8ff9f8045 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureSyncWait.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureSyncWait.java @@ -160,9 +160,10 @@ public final class ProcedureSyncWait { throw new IOException("The Master is Aborting"); } + // If the procedure fails, we should always have an exception captured. Throw it. + // Needs to be an IOE to get out of here. if (proc.hasException()) { - // If the procedure fails, we should always have an exception captured. Throw it. 
- throw proc.getException().unwrapRemoteIOException(); + throw MasterProcedureUtil.unwrapRemoteIOException(proc); } else { return proc.getResult(); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java index 619b0795dd..f1bd5592bb 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java @@ -46,6 +46,7 @@ import org.apache.hadoop.hbase.ipc.CallRunner; import org.apache.hadoop.hbase.ipc.DelegatingRpcScheduler; import org.apache.hadoop.hbase.ipc.PriorityFunction; import org.apache.hadoop.hbase.ipc.RpcScheduler; +import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.RSRpcServices; import org.apache.hadoop.hbase.regionserver.SimpleRpcSchedulerFactory; @@ -104,6 +105,21 @@ public class TestMetaTableAccessor { UTIL.shutdownMiniCluster(); } + @Test + public void testIsMetaWhenAllHealthy() throws InterruptedException { + HMaster m = UTIL.getMiniHBaseCluster().getMaster(); + assertTrue(m.waitForMetaOnline()); + } + + @Test + public void testIsMetaWhenMetaGoesOffline() throws InterruptedException { + HMaster m = UTIL.getMiniHBaseCluster().getMaster(); + int index = UTIL.getMiniHBaseCluster().getServerWithMeta(); + HRegionServer rsWithMeta = UTIL.getMiniHBaseCluster().getRegionServer(index); + rsWithMeta.abort("TESTING"); + assertTrue(m.waitForMetaOnline()); + } + /** * Does {@link MetaTableAccessor#getRegion(Connection, byte[])} and a write * against hbase:meta while its hosted server is restarted to prove our retrying diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java index 86652d84e4..7680845cce 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java @@ -19,17 +19,28 @@ package org.apache.hadoop.hbase.client; import static junit.framework.TestCase.assertTrue; +import static org.junit.Assert.assertEquals; import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; -import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; import org.apache.hadoop.hbase.testclassification.ClientTests; import org.apache.hadoop.hbase.testclassification.LargeTests; -import org.junit.After; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Threads; import org.junit.AfterClass; -import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Rule; @@ -52,23 +63,19 @@ public class TestHbck { private static final Logger LOG = LoggerFactory.getLogger(TestHbck.class); private final static 
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); - private Admin admin; - private Hbck hbck; @Rule public TestName name = new TestName(); - private static final TableName tableName = TableName.valueOf(TestHbck.class.getSimpleName()); + private static final TableName TABLE_NAME = TableName.valueOf(TestHbck.class.getSimpleName()); + + private static ProcedureExecutor procExec; @BeforeClass public static void setUpBeforeClass() throws Exception { - TEST_UTIL.getConfiguration().setInt("hbase.regionserver.msginterval", 100); - TEST_UTIL.getConfiguration().setInt("hbase.client.pause", 250); - TEST_UTIL.getConfiguration().setInt("hbase.client.retries.number", 6); - TEST_UTIL.getConfiguration().setBoolean("hbase.master.enabletable.roundrobin", true); TEST_UTIL.startMiniCluster(3); - - TEST_UTIL.createTable(tableName, "family1"); + TEST_UTIL.createMultiRegionTable(TABLE_NAME, Bytes.toBytes("family1"), 5); + procExec = TEST_UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); } @AfterClass @@ -76,29 +83,102 @@ public class TestHbck { TEST_UTIL.shutdownMiniCluster(); } - @Before - public void setUp() throws Exception { - this.admin = TEST_UTIL.getAdmin(); - this.hbck = TEST_UTIL.getHbck(); - } + public static class SuspendProcedure extends + ProcedureTestingUtility.NoopProcedure implements TableProcedureInterface { + public SuspendProcedure() { + super(); + } - @After - public void tearDown() throws Exception { - for (HTableDescriptor htd : this.admin.listTables()) { - TEST_UTIL.deleteTable(htd.getTableName()); + @Override + protected Procedure[] execute(final MasterProcedureEnv env) + throws ProcedureSuspendedException { + // Always suspend the procedure + throw new ProcedureSuspendedException(); } - this.hbck.close(); + + @Override + public TableName getTableName() { + return TABLE_NAME; + } + + @Override + public TableOperationType getTableOperationType() { + return TableOperationType.READ; + } + } + + @Test + public void testBypassProcedure() throws Exception { + // SuspendProcedure + final SuspendProcedure proc = new SuspendProcedure(); + long procId = procExec.submitProcedure(proc); + Thread.sleep(500); + + // bypass the procedure + List pids = Arrays.asList(procId); + List results = + TEST_UTIL.getHbck().bypassProcedure(pids, 30000, false, false); + assertTrue("Failed to bypass procedure!", results.get(0)); + TEST_UTIL.waitFor(5000, () -> proc.isSuccess() && proc.isBypass()); + LOG.info("{} finished", proc); } @Test public void testSetTableStateInMeta() throws IOException { + Hbck hbck = TEST_UTIL.getHbck(); // set table state to DISABLED - hbck.setTableStateInMeta(new TableState(tableName, TableState.State.DISABLED)); + hbck.setTableStateInMeta(new TableState(TABLE_NAME, TableState.State.DISABLED)); // Method {@link Hbck#setTableStateInMeta()} returns previous state, which in this case // will be DISABLED TableState prevState = - hbck.setTableStateInMeta(new TableState(tableName, TableState.State.ENABLED)); + hbck.setTableStateInMeta(new TableState(TABLE_NAME, TableState.State.ENABLED)); assertTrue("Incorrect previous state! expected=DISABLED, found=" + prevState.getState(), prevState.isDisabled()); } + + @Test + public void testAssigns() throws IOException { + Hbck hbck = TEST_UTIL.getHbck(); + try (Admin admin = TEST_UTIL.getConnection().getAdmin()) { + List regions = admin.getRegions(TABLE_NAME); + for (RegionInfo ri: regions) { + RegionState rs = TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().
+ getRegionStates().getRegionState(ri.getEncodedName()); + LOG.info("RS: {}", rs.toString()); + } + List pids = hbck.unassigns(regions.stream().map(r -> r.getEncodedName()). + collect(Collectors.toList())); + waitOnPids(pids); + for (RegionInfo ri: regions) { + RegionState rs = TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager(). + getRegionStates().getRegionState(ri.getEncodedName()); + LOG.info("RS: {}", rs.toString()); + assertTrue(rs.toString(), rs.isClosed()); + } + pids = hbck.assigns(regions.stream().map(r -> r.getEncodedName()). + collect(Collectors.toList())); + waitOnPids(pids); + for (RegionInfo ri: regions) { + RegionState rs = TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager(). + getRegionStates().getRegionState(ri.getEncodedName()); + LOG.info("RS: {}", rs.toString()); + assertTrue(rs.toString(), rs.isOpened()); + } + // What happens if crappy region list passed? + pids = hbck.assigns(Arrays.stream(new String [] {"a", "some rubbish name"}). + collect(Collectors.toList())); + for (long pid: pids) { + assertEquals(org.apache.hadoop.hbase.procedure2.Procedure.NO_PROC_ID, pid); + } + } + } + + private void waitOnPids(List pids) { + for (Long pid: pids) { + while (!TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(). + isFinished(pid)) { + Threads.sleep(100); + } + } + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionBypass.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionBypass.java new file mode 100644 index 0000000000..4bc9f272da --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRegionBypass.java @@ -0,0 +1,181 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.master.assignment; + +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.CountDownLatch; + +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.util.Bytes; + +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Tests bypass on a region assign/unassign + */ +@Category({LargeTests.class}) public class TestRegionBypass { + private final static Logger LOG = LoggerFactory.getLogger(TestRegionBypass.class); + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestRegionBypass.class); + + @Rule + public TestName name = new TestName(); + + private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); + private TableName tableName; + + @BeforeClass + public static void startCluster() throws Exception { + TEST_UTIL.startMiniCluster(2); + } + + @AfterClass + public static void stopCluster() throws Exception { + TEST_UTIL.shutdownMiniCluster(); + } + + @Before + public void before() throws IOException { + this.tableName = TableName.valueOf(this.name.getMethodName()); + // Create a table. Has one region at least. + TEST_UTIL.createTable(this.tableName, Bytes.toBytes("cf")); + + } + + @Test + public void testBypass() throws IOException { + Admin admin = TEST_UTIL.getAdmin(); + List regions = admin.getRegions(this.tableName); + for (RegionInfo ri: regions) { + admin.unassign(ri.getRegionName(), false); + } + List pids = new ArrayList<>(regions.size()); + for (RegionInfo ri: regions) { + Procedure p = new StallingAssignProcedure(ri); + pids.add(TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(). + submitProcedure(p)); + } + for (Long pid: pids) { + while (!TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor().isStarted(pid)) { + Thread.currentThread().yield(); + } + } + // Call bypass on all. We should be stuck in the dispatch at this stage. + List<Procedure<MasterProcedureEnv>> ps = + TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor().getProcedures(); + for (Procedure p: ps) { + if (p instanceof StallingAssignProcedure) { + List bs = TEST_UTIL.getHbck(). + bypassProcedure(Arrays.asList(p.getProcId()), 0, false, false); + for (Boolean b: bs) { + LOG.info("BYPASSED {} {}", p.getProcId(), b); + } + } + } + // Count down the latch so it is not left hanging. + for (Procedure p: ps) { + if (p instanceof StallingAssignProcedure) { + ((StallingAssignProcedure)p).latch.countDown(); + } + } + // Try to assign WITHOUT the override flag. Should fail!
+ for (RegionInfo ri: regions) { + try { + admin.assign(ri.getRegionName()); + } catch (Throwable dnrioe) { + // Expected + LOG.info("Expected {}", dnrioe); + } + } + while (!TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(). + getActiveProcIds().isEmpty()) { + Thread.currentThread().yield(); + } + // Now assign with the override flag. + for (RegionInfo ri: regions) { + TEST_UTIL.getHbck().assigns(Arrays.asList(ri.getEncodedName()), true); + } + while (!TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(). + getActiveProcIds().isEmpty()) { + Thread.currentThread().yield(); + } + for (RegionInfo ri: regions) { + assertTrue(ri.toString(), TEST_UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(). + getRegionStates().isRegionOnline(ri)); + } + } + + /** + * An AssignProcedure that Stalls just before the finish. + */ + public static class StallingAssignProcedure extends AssignProcedure { + public final CountDownLatch latch = new CountDownLatch(2); + + public StallingAssignProcedure() { + super(); + } + + public StallingAssignProcedure(RegionInfo regionInfo) { + super(regionInfo); + } + + @Override + void setTransitionState(MasterProcedureProtos.RegionTransitionState state) { + if (state == MasterProcedureProtos.RegionTransitionState.REGION_TRANSITION_DISPATCH) { + try { + LOG.info("LATCH2 {}", this.latch.getCount()); + this.latch.await(); + LOG.info("LATCH3 {}", this.latch.getCount()); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } else if (state == MasterProcedureProtos.RegionTransitionState.REGION_TRANSITION_QUEUE) { + // Set latch. + LOG.info("LATCH1 {}", this.latch.getCount()); + this.latch.countDown(); + } + super.setTransitionState(state); + } + } +} diff --git a/hbase-shell/src/main/ruby/shell/commands/list_procedures.rb b/hbase-shell/src/main/ruby/shell/commands/list_procedures.rb index 68842a0410..cd6e7516f9 100644 --- a/hbase-shell/src/main/ruby/shell/commands/list_procedures.rb +++ b/hbase-shell/src/main/ruby/shell/commands/list_procedures.rb @@ -31,12 +31,11 @@ EOF end def command - formatter.header(%w[Id Name State Submitted_Time Last_Update Parameters]) - + formatter.header(%w[PID Name State Submitted Last_Update Parameters]) list = JSON.parse(admin.list_procedures) list.each do |proc| - submitted_time = Time.at(Integer(proc['submittedTime']) / 1000).to_s - last_update = Time.at(Integer(proc['lastUpdate']) / 1000).to_s + submitted_time = Time.at(Integer(proc['submittedTime'])/1000).to_s + last_update = Time.at(Integer(proc['lastUpdate'])/1000).to_s formatter.row([proc['procId'], proc['className'], proc['state'], submitted_time, last_update, proc['stateMessage']]) end diff --git a/hbase-shell/src/test/java/org/apache/hadoop/hbase/client/TestShell.java b/hbase-shell/src/test/java/org/apache/hadoop/hbase/client/TestShell.java index ab36edc278..9bb8eacce8 100644 --- a/hbase-shell/src/test/java/org/apache/hadoop/hbase/client/TestShell.java +++ b/hbase-shell/src/test/java/org/apache/hadoop/hbase/client/TestShell.java @@ -39,5 +39,4 @@ public class TestShell extends AbstractTestShell { // Start all ruby tests jruby.runScriptlet(PathType.ABSOLUTE, "src/test/ruby/tests_runner.rb"); } - } diff --git a/hbase-shell/src/test/ruby/shell/shell_test.rb b/hbase-shell/src/test/ruby/shell/shell_test.rb index c1e9017edb..abf92357c4 100644 --- a/hbase-shell/src/test/ruby/shell/shell_test.rb +++ b/hbase-shell/src/test/ruby/shell/shell_test.rb @@ -85,9 +85,9 @@ class ShellTest < Test::Unit::TestCase define_test "Shell::Shell interactive 
mode should not throw" do # incorrect number of arguments - @shell.command('create', 'foo') - @shell.command('create', 'foo', 'family_1') + @shell.command('create', 'nothrow_table') + @shell.command('create', 'nothrow_table', 'family_1') # create a table that exists - @shell.command('create', 'foo', 'family_1') + @shell.command('create', 'nothrow_table', 'family_1') end end -- 2.16.3
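For readers new to the API added by this patch, here is a minimal, illustrative sketch (not part of the patch) of how a repair tool could drive the override-capable calls. Obtaining the Hbck handle and the region/pid values are assumptions here (the tests above get the handle via HBaseTestingUtility#getHbck()); the method signatures and arguments mirror the usage in TestHbck and TestRegionBypass:

    import java.util.Arrays;
    import java.util.List;
    import org.apache.hadoop.hbase.client.Hbck;

    public class HbckOverrideSketch {
      // Illustrative only. How 'hbck' is acquired and the inputs are assumptions.
      static void repair(Hbck hbck, String encodedRegionName, long stuckPid) throws Exception {
        // Schedule an assign with override=true so a RegionStateNode still fenced by a
        // bypassed procedure can be taken over (see the bypass note in
        // RegionTransitionProcedure above).
        List<Long> pids = hbck.assigns(Arrays.asList(encodedRegionName), true);
        // Procedure.NO_PROC_ID (-1) in the result means the assign was not scheduled.
        System.out.println("assign pids=" + pids);
        // Bypass a stuck procedure: 30 second lock wait, no override, non-recursive,
        // the same arguments testBypassProcedure uses above.
        List<Boolean> bypassed = hbck.bypassProcedure(Arrays.asList(stuckPid), 30000, false, false);
        System.out.println("bypassed=" + bypassed);
      }
    }

Without override=true, a later assign against a bypassed region is expected to fail with a DoNotRetryIOException, which is what testBypass above exercises.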