From 9fad05547d7a7613977e3c7501d63de191410167 Mon Sep 17 00:00:00 2001 From: jingyuntian Date: Fri, 26 Oct 2018 15:04:28 +0800 Subject: [PATCH] HBASE-21322 Add a scheduleServerCrashProcedure() API to HbckService --- .../org/apache/hadoop/hbase/client/HBaseHbck.java | 13 +++++ .../java/org/apache/hadoop/hbase/client/Hbck.java | 2 + .../hbase/shaded/protobuf/RequestConverter.java | 7 +++ .../src/main/protobuf/Master.proto | 12 ++++ .../hadoop/hbase/master/MasterRpcServices.java | 66 ++++++++++++++++++++++ .../hbase/master/assignment/AssignmentManager.java | 11 ++-- .../org/apache/hadoop/hbase/client/TestHbck.java | 18 ++++++ 7 files changed, 124 insertions(+), 5 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java index 2d08825..76ead51 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java @@ -154,4 +154,17 @@ public class HBaseHbck implements Hbck { }); return response.getBypassedList(); } + + @Override + public List<Long> scheduleServerCrashProcedure(List<String> serverNames) throws IOException { + try { + MasterProtos.ScheduleServerCrashProcedureResponse response = + this.hbck.scheduleServerCrashProcedure(rpcControllerFactory.newController(), + RequestConverter.toScheduleServerCrashProcedureRequest(serverNames)); + return response.getPidList(); + } catch (ServiceException se) { + LOG.debug(toCommaDelimitedString(serverNames), se); + throw new IOException(se); + } + } } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java index 5c97d97..a309ddb 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java @@ -101,4 +101,6 @@ public interface Hbck extends 
Abortable, Closeable { */ List<Boolean> bypassProcedure(List<Long> pids, long waitTime, boolean override, boolean recursive) throws IOException; + + List<Long> scheduleServerCrashProcedure(List<String> serverNames) throws IOException; } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java index 1253974..4bd5483 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java @@ -1906,6 +1906,13 @@ public final class RequestConverter { setOverride(override).build(); } + public static MasterProtos.ScheduleServerCrashProcedureRequest + toScheduleServerCrashProcedureRequest(List<String> serverNames) { + MasterProtos.ScheduleServerCrashProcedureRequest.Builder b = + MasterProtos.ScheduleServerCrashProcedureRequest.newBuilder(); + return b.addAllServerName(serverNames).build(); + } + private static List<RegionSpecifier> toEncodedRegionNameRegionSpecifiers( List<String> encodedRegionNames) { return encodedRegionNames.stream(). 
diff --git a/hbase-protocol-shaded/src/main/protobuf/Master.proto b/hbase-protocol-shaded/src/main/protobuf/Master.proto index 8318da3..d9ea124 100644 --- a/hbase-protocol-shaded/src/main/protobuf/Master.proto +++ b/hbase-protocol-shaded/src/main/protobuf/Master.proto @@ -1046,6 +1046,14 @@ message BypassProcedureResponse { repeated bool bypassed = 1; } +message ScheduleServerCrashProcedureRequest { + repeated string serverName = 1; +} + +message ScheduleServerCrashProcedureResponse { + repeated uint64 pid = 1; +} + service HbckService { /** Update state of the table in meta only*/ rpc SetTableStateInMeta(SetTableStateInMetaRequest) @@ -1072,4 +1080,8 @@ service HbckService { /** Bypass a procedure to completion, procedure is completed but no actual work is done*/ rpc BypassProcedure(BypassProcedureRequest) returns(BypassProcedureResponse); + + /** Schedule a ServerCrashProcedure to help recover a crash server */ + rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest) + returns(ScheduleServerCrashProcedureResponse); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java index a6bc086..5cc7d40 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java @@ -18,6 +18,8 @@ */ package org.apache.hadoop.hbase.master; +import static org.apache.hadoop.hbase.master.MasterWalManager.META_FILTER; + import java.io.IOException; import java.net.BindException; import java.net.InetAddress; @@ -31,6 +33,7 @@ import java.util.Map.Entry; import java.util.Set; import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.ClusterMetricsBuilder; import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HConstants; @@ 
-64,8 +67,10 @@ import org.apache.hadoop.hbase.ipc.RpcServerInterface; import org.apache.hadoop.hbase.ipc.ServerRpcController; import org.apache.hadoop.hbase.master.assignment.RegionStates; import org.apache.hadoop.hbase.master.locking.LockProcedure; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil; import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil.NonceProcedureRunnable; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.mob.MobUtils; import org.apache.hadoop.hbase.procedure.MasterProcedureManager; import org.apache.hadoop.hbase.procedure2.LockType; @@ -96,11 +101,14 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.ForeignExceptionUtil; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.yetus.audience.InterfaceAudience; import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hbase.thirdparty.com.google.common.base.Splitter; +import org.apache.hbase.thirdparty.com.google.common.collect.Lists; import org.apache.hbase.thirdparty.com.google.protobuf.RpcController; import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException; import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations; @@ -2447,4 +2455,62 @@ public class MasterRpcServices extends RSRpcServices throw new ServiceException(e); } } + + @Override + public MasterProtos.ScheduleServerCrashProcedureResponse scheduleServerCrashProcedure( + RpcController controller, MasterProtos.ScheduleServerCrashProcedureRequest request) + throws ServiceException { + List<String> serverNames = request.getServerNameList(); + List<Long> pids = new ArrayList<>(); + try { + for (String serverName : serverNames) { + ServerName server = 
parseServerName(serverName); + if (shouldSubmitSCP(server)) { + ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor(); + pids.add(procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), + server, true, containMetaWals(server)))); + } else { + pids.add(-1L); + } + } + return MasterProtos.ScheduleServerCrashProcedureResponse.newBuilder().addAllPid(pids).build(); + } catch (IOException e) { + throw new ServiceException(e); + } + } + + private boolean containMetaWals(ServerName serverName) throws IOException { + Path logDir = new Path(master.getWALRootDir(), + AbstractFSWALProvider.getWALDirectoryName(serverName.toString())); + Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT); + Path checkDir = master.getFileSystem().exists(splitDir) ? splitDir : logDir; + return master.getFileSystem().listStatus(checkDir, META_FILTER).length > 0; + } + + private ServerName parseServerName(String serverName) throws IOException { + List<String> parts = Lists.newArrayList(Splitter.on(",").split(serverName)); + if (parts.size() != 3) { + throw new IOException( + serverName + ": Bad format of ServerName, which should be [hostName,port,startCode]"); + } + return ServerName.valueOf(parts.get(0), Integer.parseInt(parts.get(1)), + Long.parseLong(parts.get(2))); + } + + private boolean shouldSubmitSCP(ServerName serverName) throws IOException { + // check if there is already a SCP of this server running + List<Procedure<MasterProcedureEnv>> procedures = + master.getMasterProcedureExecutor().getProcedures(); + for (Procedure<MasterProcedureEnv> procedure : procedures) { + if (procedure instanceof ServerCrashProcedure) { + if (serverName.compareTo(((ServerCrashProcedure) procedure).getServerName()) == 0 + && !procedure.isFinished()) { + LOG.info("there is already a SCP of this server {} running, pid {}", serverName, + procedure.getProcId()); + return false; + } + } + } + return true; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index e6134e8..e9309e1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1361,13 +1361,14 @@ public class AssignmentManager implements ServerListener { return 0; } - public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) { + public long submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) { boolean carryingMeta = isCarryingMeta(serverName); ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor(); - procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), serverName, - shouldSplitWal, carryingMeta)); - LOG.debug("Added=" + serverName + - " to dead servers, submitted shutdown handler to be executed meta=" + carryingMeta); + long pid = procExec.submitProcedure(new ServerCrashProcedure(procExec.getEnvironment(), + serverName, shouldSplitWal, carryingMeta)); + LOG.debug("Added=" + serverName + + " to dead servers, submitted shutdown handler to be executed meta=" + carryingMeta); + return pid; } public void offlineRegion(final RegionInfo regionInfo) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java index 7680845..f642c5f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java @@ -28,6 +28,7 @@ import java.util.stream.Collectors; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.master.RegionState; import 
org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; @@ -36,6 +37,7 @@ import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; +import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.testclassification.ClientTests; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.Bytes; @@ -173,6 +175,22 @@ public class TestHbck { } } + @Test + public void testScheduleSCP() throws Exception { + HRegionServer testRs = TEST_UTIL.getRSForFirstRegionInTable(TABLE_NAME); + TEST_UTIL.loadTable(TEST_UTIL.getConnection().getTable(TABLE_NAME), Bytes.toBytes("family1"), + true); + ServerName serverName = testRs.getServerName(); + Hbck hbck = TEST_UTIL.getHbck(); + List<Long> pids = hbck.scheduleServerCrashProcedure(Arrays.asList(serverName.toString())); + assertTrue(pids.get(0) > 0); + LOG.info("pid is {}", pids.get(0)); + + pids = hbck.scheduleServerCrashProcedure(Arrays.asList(serverName.toString())); + assertTrue(pids.get(0) == -1); + LOG.info("pid is {}", pids.get(0)); + } + private void waitOnPids(List<Long> pids) { for (Long pid: pids) { while (!TEST_UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor(). -- 2.7.4