diff --git a/hbase-protocol-shaded/src/main/protobuf/Admin.proto b/hbase-protocol-shaded/src/main/protobuf/Admin.proto
index c622d589c6..131342bdd5 100644
--- a/hbase-protocol-shaded/src/main/protobuf/Admin.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/Admin.proto
@@ -274,6 +274,7 @@ message ExecuteProceduresRequest {
   repeated OpenRegionRequest open_region = 1;
   repeated CloseRegionRequest close_region = 2;
   repeated RemoteProcedureRequest proc = 3;
+  optional uint64 master_rpc_deadline = 4;
 }
 
 message ExecuteProceduresResponse {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
index 2d022b7bad..0c59f0b601 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java
@@ -25,6 +25,7 @@ import java.io.IOException;
 import org.apache.hadoop.hbase.HBaseIOException;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.YouAreDeadException;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
@@ -392,9 +393,13 @@ public class TransitRegionStateProcedure
       // from '<' to '<='. So here we still need to check whether the serverName
       // matches, to determine whether this is a retry when the openSeqNum is not changed.
       if (!regionNode.getRegionLocation().equals(serverName)) {
-        LOG.warn("Received report {} transition from {} for {}, pid={} but the region is not on it," +
-          " should be a retry, ignore", TransitionCode.OPENED, serverName, regionNode, getProcId());
-        return;
+        LOG.warn("Received report {} transition from {} for {}, pid={} but the region is not on it,"
+            + " killing RS", TransitionCode.OPENED, serverName, regionNode, getProcId());
+        // We may be killing an innocent RS due to some network race condition (to fix that, we'd
+        // need HBASE-21864). However, that is relatively harmless compared to HBASE-21862.
+        // Play it safe and assume we could have a double-assignment situation.
+        // Note that we don't do it in regular RS report, because races there are much more frequent.
+        throw new YouAreDeadException("Potentially double-assigning " + regionNode);
       }
       regionNode.setOpenSeqNum(openSeqNum);
       env.getAssignmentManager().regionOpened(regionNode);
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
index 638f9d3461..9a5256e0ce 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RSProcedureDispatcher.java
@@ -52,7 +52,9 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ExecuteProc
 import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ExecuteProceduresResponse;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionRequest;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.OpenRegionResponse;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.RemoteProcedureRequest;
+import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
+import static org.apache.hadoop.hbase.HConstants.HBASE_RPC_TIMEOUT_KEY;
 
 /**
  * A remote procecdure dispatcher for regionservers.
@@ -67,6 +69,12 @@ public class RSProcedureDispatcher
       "hbase.regionserver.rpc.startup.waittime";
   private static final int DEFAULT_RS_RPC_STARTUP_WAIT_TIME = 60000;
 
+  // A negative value means don't set the deadline
+  public static final String MASTER_RPC_DEADLINE_SAFETY_MS =
+    "hbase.master.rpc.deadline.safety.ms";
+  private static final int DEFAULT_MASTER_RPC_DEADLINE_SAFETY_MS = 0;
+
+
   private static final int RS_VERSION_WITH_EXEC_PROCS = 0x0200000; // 2.0
 
   protected final MasterServices master;
@@ -308,7 +316,13 @@ public class RSProcedureDispatcher
         LOG.trace("Building request with operations count=" + remoteProcedures.size());
       }
       splitAndResolveOperation(getServerName(), remoteProcedures, this);
-
+      int safetyMs = procedureEnv.getMasterConfiguration().getInt(
+        MASTER_RPC_DEADLINE_SAFETY_MS, DEFAULT_MASTER_RPC_DEADLINE_SAFETY_MS);
+      if (safetyMs >= 0) {
+        long timeoutMs = safetyMs + procedureEnv.getMasterConfiguration().getLong(
+          HBASE_RPC_TIMEOUT_KEY, DEFAULT_HBASE_RPC_TIMEOUT);
+        request.setMasterRpcDeadline(EnvironmentEdgeManager.currentTime() + timeoutMs);
+      }
       try {
         sendRequest(getServerName(), request.build());
       } catch (IOException e) {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 34a6c13924..96bad97182 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -2325,6 +2325,11 @@ public class HRegionServer extends HasThread implements
         return true;
       } catch (ServiceException se) {
         IOException ioe = ProtobufUtil.getRemoteException(se);
+        if (ioe instanceof YouAreDeadException) {
+          abort("Incorrect transition", ioe);
+          return false;
+        }
+
         boolean pause = ioe instanceof ServerNotRunningYetException ||
             ioe instanceof PleaseHoldException;
         if (pause) {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java
index 592f99c431..bf9a4d9e57 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java
@@ -3776,6 +3776,19 @@ public class RSRpcServices implements HBaseRPCErrorHandler,
       checkOpen();
       throwOnWrongStartCode(request);
       regionServer.getRegionServerCoprocessorHost().preExecuteProcedures();
+      long rpcDeadline = request.hasMasterRpcDeadline() ? request.getMasterRpcDeadline() : 0;
+      long now = EnvironmentEdgeManager.currentTime();
+      if (rpcDeadline > 0 && rpcDeadline < now) {
+        // Master probably already gave up on this request. Executing procedures in such cases
+        // can cause bugs in master due to some assumptions about failures. Fail the call.
+        String msg = "Dropping the executeProcedures request after the timeout; deadline is "
+            + rpcDeadline + "; current time is " + now;
+        LOG.warn(msg);
+        throw new ServiceException(msg);
+      }
+      // In theory it's still possible to reach the deadline before responding,
+      // but it should be fast and we can't un-submit part of the procedures right now.
+
       if (request.getOpenRegionCount() > 0) {
         // Avoid reading from the TableDescritor every time(usually it will read from the file
         // system)