From eb54c81235f97f593d7df672ceea72eea3d59025 Mon Sep 17 00:00:00 2001 From: zhangduo Date: Sat, 23 Mar 2019 21:19:48 +0800 Subject: [PATCH] HBASE-22074 Should use procedure store to persist the state in reportRegionStateTransition --- .../hbase/shaded/protobuf/ProtobufUtil.java | 32 +-- .../shaded/protobuf/RequestConverter.java | 28 +-- .../src/main/protobuf/Admin.proto | 3 +- .../src/main/protobuf/MasterProcedure.proto | 11 + .../main/protobuf/RegionServerStatus.proto | 1 + .../master/assignment/AssignProcedure.java | 6 +- .../master/assignment/AssignmentManager.java | 12 +- .../assignment/CloseRegionProcedure.java | 26 ++- .../assignment/OpenRegionProcedure.java | 60 ++++- .../assignment/RegionRemoteProcedureBase.java | 214 +++++++++++++----- .../TransitRegionStateProcedure.java | 200 +++------------- .../master/assignment/UnassignProcedure.java | 4 +- .../procedure/RSProcedureDispatcher.java | 54 ++--- .../hbase/regionserver/HRegionServer.java | 10 +- .../hbase/regionserver/RSRpcServices.java | 8 +- .../regionserver/RegionServerServices.java | 31 ++- .../hbase/regionserver/SplitRequest.java | 5 +- .../handler/AssignRegionHandler.java | 14 +- .../handler/CloseRegionHandler.java | 3 +- .../handler/OpenRegionHandler.java | 14 +- .../handler/UnassignRegionHandler.java | 15 +- .../assignment/TestAssignmentManagerBase.java | 7 + .../TestCloseRegionWhileRSCrash.java | 14 +- .../procedure/TestServerRemoteProcedure.java | 16 +- 24 files changed, 421 insertions(+), 367 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java index 15a8c8a0a3..336c59cdc8 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java @@ -3003,28 +3003,32 @@ public final class ProtobufUtil { } /** - * Create a CloseRegionRequest for a given region name - * - * @param regionName the name of the region to close - * @return a CloseRegionRequest - */ - public static CloseRegionRequest buildCloseRegionRequest(ServerName server, - final byte[] regionName) { - return ProtobufUtil.buildCloseRegionRequest(server, regionName, null); - } + * Create a CloseRegionRequest for a given region name + * @param regionName the name of the region to close + * @return a CloseRegionRequest + */ + public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName) { + return ProtobufUtil.buildCloseRegionRequest(server, regionName, null); + } + + public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName, + ServerName destinationServer) { + return buildCloseRegionRequest(server, regionName, destinationServer, -1); + } - public static CloseRegionRequest buildCloseRegionRequest(ServerName server, - final byte[] regionName, ServerName destinationServer) { + public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName, + ServerName destinationServer, long closeProcId) { CloseRegionRequest.Builder builder = CloseRegionRequest.newBuilder(); - RegionSpecifier region = RequestConverter.buildRegionSpecifier( - RegionSpecifierType.REGION_NAME, regionName); + RegionSpecifier region = + RequestConverter.buildRegionSpecifier(RegionSpecifierType.REGION_NAME, regionName); builder.setRegion(region); - if (destinationServer != null){ + if (destinationServer != null) { builder.setDestinationServer(toServerName(destinationServer)); } if (server != null) { builder.setServerStartCode(server.getStartcode()); } + builder.setCloseProcId(closeProcId); return builder.build(); } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java index e7b6624c6e..5515b2f1b6 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java @@ -940,27 +940,6 @@ public final class RequestConverter { return builder.build(); } - /** - * Create a protocol buffer OpenRegionRequest to open a list of regions - * @param server the serverName for the RPC - * @param regionOpenInfos info of a list of regions to open - * @return a protocol buffer OpenRegionRequest - */ - public static OpenRegionRequest buildOpenRegionRequest(ServerName server, - final List>> regionOpenInfos) { - OpenRegionRequest.Builder builder = OpenRegionRequest.newBuilder(); - for (Pair> regionOpenInfo : regionOpenInfos) { - builder.addOpenInfo(buildRegionOpenInfo(regionOpenInfo.getFirst(), - regionOpenInfo.getSecond())); - } - if (server != null) { - builder.setServerStartCode(server.getStartcode()); - } - // send the master's wall clock time as well, so that the RS can refer to it - builder.setMasterSystemTime(EnvironmentEdgeManager.currentTime()); - return builder.build(); - } - /** * Create a protocol buffer OpenRegionRequest for a given region * @param server the serverName for the RPC @@ -971,7 +950,7 @@ public final class RequestConverter { public static OpenRegionRequest buildOpenRegionRequest(ServerName server, final RegionInfo region, List favoredNodes) { OpenRegionRequest.Builder builder = OpenRegionRequest.newBuilder(); - builder.addOpenInfo(buildRegionOpenInfo(region, favoredNodes)); + builder.addOpenInfo(buildRegionOpenInfo(region, favoredNodes, -1L)); if (server != null) { builder.setServerStartCode(server.getStartcode()); } @@ -1622,8 +1601,8 @@ public final class RequestConverter { /** * Create a RegionOpenInfo based on given region info and version of offline node */ - public static RegionOpenInfo buildRegionOpenInfo( - final RegionInfo region, final List favoredNodes) { + public static RegionOpenInfo buildRegionOpenInfo(RegionInfo region, List favoredNodes, + long openProcId) { RegionOpenInfo.Builder builder = RegionOpenInfo.newBuilder(); builder.setRegion(ProtobufUtil.toRegionInfo(region)); if (favoredNodes != null) { @@ -1631,6 +1610,7 @@ public final class RequestConverter { builder.addFavoredNodes(ProtobufUtil.toServerName(server)); } } + builder.setOpenProcId(openProcId); return builder.build(); } diff --git a/hbase-protocol-shaded/src/main/protobuf/Admin.proto b/hbase-protocol-shaded/src/main/protobuf/Admin.proto index c622d589c6..85b9113a27 100644 --- a/hbase-protocol-shaded/src/main/protobuf/Admin.proto +++ b/hbase-protocol-shaded/src/main/protobuf/Admin.proto @@ -88,6 +88,7 @@ message OpenRegionRequest { repeated ServerName favored_nodes = 3; // open region for distributedLogReplay // optional bool DEPRECATED_openForDistributedLogReplay = 4; + optional int64 open_proc_id = 5 [default = -1]; } } @@ -102,7 +103,6 @@ message OpenRegionResponse { } message WarmupRegionRequest { - required RegionInfo regionInfo = 1; } @@ -120,6 +120,7 @@ message CloseRegionRequest { optional ServerName destination_server = 4; // the intended server for this RPC. optional uint64 serverStartCode = 5; + optional int64 close_proc_id = 6 [default = -1]; } message CloseRegionResponse { diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto index 64ac39802d..85aa064649 100644 --- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto @@ -28,6 +28,7 @@ import "HBase.proto"; import "RPC.proto"; import "Snapshot.proto"; import "Replication.proto"; +import "RegionServerStatus.proto"; // ============================================================================ // WARNING - Compatibility rules @@ -548,9 +549,19 @@ message RegionStateTransitionStateData { required bool force_new_plan = 3; } +enum RegionRemoteProcedureBaseState { + REGION_REMOTE_PROCEDURE_DISPATCH = 1; + REGION_REMOTE_PROCEDURE_REPORT = 2; + REGION_REMOTE_PROCEDURE_FAIL = 3; + REGION_REMOTE_PROCEDURE_CRASH = 4; +} + message RegionRemoteProcedureBaseStateData { required RegionInfo region = 1; required ServerName target_server = 2; + required RegionRemoteProcedureBaseState state = 3; + optional RegionStateTransition.TransitionCode transition_code = 4; + optional int64 seq_id = 5; } message OpenRegionProcedureStateData { diff --git a/hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto b/hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto index 002432a2f2..0137cb1608 100644 --- a/hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto +++ b/hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto @@ -96,6 +96,7 @@ message RegionStateTransition { /** For newly opened region, the open seq num is needed */ optional uint64 open_seq_num = 3; + repeated int64 proc_id = 4; enum TransitionCode { OPENED = 0; FAILED_OPEN = 1; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index 33a35453e0..35510d6eef 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -21,7 +21,6 @@ import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; -import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; @@ -38,7 +37,6 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto * @deprecated Do not use any more. * @see TransitRegionStateProcedure */ -// TODO: Add being able to assign a region to open read-only. @Deprecated @InterfaceAudience.Private public class AssignProcedure extends RegionTransitionProcedure { @@ -121,9 +119,7 @@ public class AssignProcedure extends RegionTransitionProcedure { @Override public RemoteOperation remoteCallBuild(final MasterProcedureEnv env, final ServerName serverName) { - assert serverName.equals(getRegionState(env).getRegionLocation()); - return new RegionOpenOperation(this, getRegionInfo(), - env.getAssignmentManager().getFavoredNodes(getRegionInfo()), false); + return null; } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 2d0c3bee9f..5e43637bd8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -835,8 +835,10 @@ public class AssignmentManager { case CLOSED: assert transition.getRegionInfoCount() == 1 : transition; final RegionInfo hri = ProtobufUtil.toRegionInfo(transition.getRegionInfo(0)); + long procId = + transition.getProcIdCount() > 0 ? transition.getProcId(0) : Procedure.NO_PROC_ID; updateRegionTransition(serverName, transition.getTransitionCode(), hri, - transition.hasOpenSeqNum() ? transition.getOpenSeqNum() : HConstants.NO_SEQNUM); + transition.hasOpenSeqNum() ? transition.getOpenSeqNum() : HConstants.NO_SEQNUM, procId); break; case READY_TO_SPLIT: case SPLIT: @@ -903,7 +905,7 @@ public class AssignmentManager { } private void updateRegionTransition(ServerName serverName, TransitionCode state, - RegionInfo regionInfo, long seqId) throws IOException { + RegionInfo regionInfo, long seqId, long procId) throws IOException { checkMetaLoaded(regionInfo); RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo); @@ -919,7 +921,7 @@ public class AssignmentManager { ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); regionNode.lock(); try { - if (!reportTransition(regionNode, serverNode, state, seqId)) { + if (!reportTransition(regionNode, serverNode, state, seqId, procId)) { // Don't log WARN if shutting down cluster; during shutdown. Avoid the below messages: // 2018-08-13 10:45:10,551 WARN ...AssignmentManager: No matching procedure found for // rit=OPEN, location=ve0538.halxg.cloudera.com,16020,1533493000958, @@ -941,14 +943,14 @@ public class AssignmentManager { } private boolean reportTransition(RegionStateNode regionNode, ServerStateNode serverNode, - TransitionCode state, long seqId) throws IOException { + TransitionCode state, long seqId, long procId) throws IOException { ServerName serverName = serverNode.getServerName(); TransitRegionStateProcedure proc = regionNode.getProcedure(); if (proc == null) { return false; } proc.reportTransition(master.getMasterProcedureExecutor().getEnvironment(), regionNode, - serverName, state, seqId); + serverName, state, seqId, procId); return true; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java index f867e96459..4fd60f0ead 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java @@ -20,15 +20,17 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; -import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionCloseOperation; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation; import org.apache.yetus.audience.InterfaceAudience; + import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.CloseRegionProcedureStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** * The remote procedure used to close a region. @@ -46,9 +48,9 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { super(); } - public CloseRegionProcedure(RegionInfo region, ServerName targetServer, - ServerName assignCandidate) { - super(region, targetServer); + public CloseRegionProcedure(TransitRegionStateProcedure parent, RegionInfo region, + ServerName targetServer, ServerName assignCandidate) { + super(parent, region, targetServer); this.assignCandidate = assignCandidate; } @@ -59,7 +61,7 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { @Override public RemoteOperation remoteCallBuild(MasterProcedureEnv env, ServerName remote) { - return new RegionCloseOperation(this, region, assignCandidate); + return new RegionCloseOperation(this, region, getProcId(), assignCandidate); } @Override @@ -88,7 +90,17 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { } @Override - protected boolean shouldDispatch(RegionStateNode regionNode) { - return regionNode.isInState(RegionState.State.CLOSING); + protected void reportTransition(RegionStateNode regionNode, TransitionCode transitionCode, + long seqId) throws IOException { + if (transitionCode != TransitionCode.CLOSED) { + throw new UnexpectedStateException("Received report unexpected " + transitionCode + + " transition, " + regionNode.toShortString() + ", " + this + ", expected CLOSED."); + } + } + + @Override + protected void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode, + TransitionCode transitionCode, long seqId) throws IOException { + env.getAssignmentManager().regionClosed(regionNode, true); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java index 4b3a976f28..579b7570b0 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java @@ -20,15 +20,18 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; -import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation; import org.apache.yetus.audience.InterfaceAudience; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.OpenRegionProcedureStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** * The remote procedure used to open a region. @@ -36,12 +39,15 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.O @InterfaceAudience.Private public class OpenRegionProcedure extends RegionRemoteProcedureBase { + private static final Logger LOG = LoggerFactory.getLogger(OpenRegionProcedure.class); + public OpenRegionProcedure() { super(); } - public OpenRegionProcedure(RegionInfo region, ServerName targetServer) { - super(region, targetServer); + public OpenRegionProcedure(TransitRegionStateProcedure parent, RegionInfo region, + ServerName targetServer) { + super(parent, region, targetServer); } @Override @@ -51,8 +57,7 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase { @Override public RemoteOperation remoteCallBuild(MasterProcedureEnv env, ServerName remote) { - return new RegionOpenOperation(this, region, env.getAssignmentManager().getFavoredNodes(region), - false); + return new RegionOpenOperation(this, region, getProcId()); } @Override @@ -73,7 +78,48 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase { } @Override - protected boolean shouldDispatch(RegionStateNode regionNode) { - return regionNode.isInState(RegionState.State.OPENING); + protected void reportTransition(RegionStateNode regionNode, TransitionCode transitionCode, + long seqId) throws IOException { + switch (transitionCode) { + case OPENED: + // this is the openSeqNum + if (seqId < 0) { + throw new UnexpectedStateException("Received report unexpected " + TransitionCode.OPENED + + " transition openSeqNum=" + seqId + ", " + regionNode + ", proc=" + this); + } + break; + case FAILED_OPEN: + // nothing to check + break; + default: + throw new UnexpectedStateException( + "Received report unexpected " + transitionCode + " transition, " + + regionNode.toShortString() + ", " + this + ", expected OPENED or FAILED_OPEN."); + } + } + + @Override + protected void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode, + TransitionCode transitionCode, long openSeqNum) throws IOException { + switch (transitionCode) { + case OPENED: + if (openSeqNum < regionNode.getOpenSeqNum()) { + LOG.warn( + "Received report {} transition from {} for {}, pid={} but the new openSeqNum {}" + + " is less than the current one {}, ignoring...", + transitionCode, targetServer, regionNode, getProcId(), openSeqNum, + regionNode.getOpenSeqNum()); + } else { + regionNode.setOpenSeqNum(openSeqNum); + } + env.getAssignmentManager().regionOpened(regionNode); + break; + case FAILED_OPEN: + env.getAssignmentManager().regionFailedOpen(regionNode, false); + break; + default: + throw new UnexpectedStateException("Unexpected transition code: " + transitionCode); + } + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java index f6d3a2eaa5..b83986b3c1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java @@ -18,9 +18,11 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface; import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException; @@ -36,7 +38,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseState; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** * The base class for the remote procedures used to open/close a region. @@ -53,16 +57,23 @@ public abstract class RegionRemoteProcedureBase extends Procedure - * This could happen when master restarts. Since we do not know whether a request has already been - * sent to the region server after we add a remote operation to the dispatcher, so the safe way is - * to not persist the dispatched field and try to add the remote operation again. But it is - * possible that we do have already sent the request to region server and it has also sent back - * the response, so here we need to check the region state, if it is not in the expecting state, - * we should give up, otherwise we may hang for ever, as the region server will just ignore - * redundant calls. - */ - protected abstract boolean shouldDispatch(RegionStateNode regionNode); + // do some checks to see if the report is valid, without actually updating meta. + protected abstract void reportTransition(RegionStateNode regionNode, + TransitionCode transitionCode, long seqId) throws IOException; + + // A bit strange but the procedure store will throw RuntimeException if we can not persist the + // state, so upper layer should take care of this... + private void persistAndWake(MasterProcedureEnv env, RegionStateNode regionNode) { + env.getMasterServices().getMasterProcedureExecutor().getStore().update(this); + regionNode.getProcedureEvent().wake(env.getProcedureScheduler()); + } + + // should be called with RegionStateNode locked, to avoid race with the execute method below + void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName, + TransitionCode transitionCode, long seqId) throws IOException { + if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) { + // should be a retry + return; + } + if (!targetServer.equals(serverName)) { + throw new UnexpectedStateException("Received report from " + serverName + ", expected " + + targetServer + ", " + regionNode + ", proc=" + this); + } + reportTransition(regionNode, transitionCode, seqId); + // this state means we have received the report from RS, does not mean the result is fine, as we + // may received a FAILED_OPEN. + this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT; + this.transitionCode = transitionCode; + this.seqId = seqId; + // Persist the transition code and openSeqNum(if provided). + // We should not update the hbase:meta directly as this may cause races when master restarts, + // as the old active master may incorrectly report back to RS and cause the new master to hang + // on a OpenRegionProcedure forever. See HBASE-22060 and HBASE-22074 for more details. + boolean succ = false; + try { + persistAndWake(env, regionNode); + succ = true; + } finally { + if (!succ) { + this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH; + this.transitionCode = null; + this.seqId = HConstants.NO_SEQNUM; + } + } + } + + void serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName) { + if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) { + // should be a retry + return; + } + this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_CRASH; + boolean succ = false; + try { + persistAndWake(env, regionNode); + succ = true; + } finally { + if (!succ) { + this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH; + } + } + } + + private TransitRegionStateProcedure getParent(MasterProcedureEnv env) { + return (TransitRegionStateProcedure) env.getMasterServices().getMasterProcedureExecutor() + .getProcedure(getParentProcId()); + } + + private void unattach(MasterProcedureEnv env) { + getParent(env).unattachRemoteProc(this); + } + + // actually update the state to meta + protected abstract void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode, + TransitionCode transitionCode, long seqId) throws IOException; @Override protected Procedure[] execute(MasterProcedureEnv env) throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { - if (dispatched) { - // we are done, the parent procedure will check whether we are succeeded. - return null; - } RegionStateNode regionNode = getRegionNode(env); regionNode.lock(); try { - if (!shouldDispatch(regionNode)) { - return null; + switch (state) { + case REGION_REMOTE_PROCEDURE_DISPATCH: { + // The code which wakes us up also needs to lock the RSN so here we do not need to + // synchronize + // on the event. + ProcedureEvent event = regionNode.getProcedureEvent(); + try { + env.getRemoteDispatcher().addOperationToNode(targetServer, this); + } catch (FailedRemoteDispatchException e) { + LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " + + "because the server is alread dead, give up and mark the procedure as complete, " + + "the parent procedure will take care of this.", this, region, targetServer, e); + unattach(env); + return null; + } + event.suspend(); + event.suspendIfNotReady(this); + throw new ProcedureSuspendedException(); + } + case REGION_REMOTE_PROCEDURE_REPORT: + updateTransition(env, regionNode, transitionCode, seqId); + unattach(env); + return null; + case REGION_REMOTE_PROCEDURE_FAIL: + // the remote call is failed so we do not need to change the region state, just return. + unattach(env); + return null; + case REGION_REMOTE_PROCEDURE_CRASH: + env.getAssignmentManager().regionClosed(regionNode, false); + unattach(env); + return null; + default: + throw new IllegalStateException("Unknown state: " + state); } - // The code which wakes us up also needs to lock the RSN so here we do not need to synchronize - // on the event. - ProcedureEvent event = regionNode.getProcedureEvent(); - try { - env.getRemoteDispatcher().addOperationToNode(targetServer, this); - } catch (FailedRemoteDispatchException e) { - LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " + - "because the server is alread dead, give up and mark the procedure as complete, " + - "the parent procedure will take care of this.", this, region, targetServer, e); - return null; - } - dispatched = true; - event.suspend(); - event.suspendIfNotReady(this); - throw new ProcedureSuspendedException(); + } catch (IOException e) { + // TODO: retry with interval + LOG.warn("Failed to update transition", e); + throw new ProcedureYieldException(); } finally { regionNode.unlock(); } @@ -186,9 +279,14 @@ public abstract class RegionRemoteProcedureBase extends Procedure favoredNodes; - private final boolean openForReplay; - private boolean failedOpen; - public RegionOpenOperation(final RemoteProcedure remoteProcedure, - final RegionInfo regionInfo, final List favoredNodes, - final boolean openForReplay) { - super(remoteProcedure, regionInfo); - this.favoredNodes = favoredNodes; - this.openForReplay = openForReplay; - } - - protected void setFailedOpen(final boolean failedOpen) { - this.failedOpen = failedOpen; - } - - public boolean isFailedOpen() { - return failedOpen; + public RegionOpenOperation(RemoteProcedure remoteProcedure, RegionInfo regionInfo, + long procId) { + super(remoteProcedure, regionInfo, procId); } public OpenRegionRequest.RegionOpenInfo buildRegionOpenInfoRequest( final MasterProcedureEnv env) { - return RequestConverter.buildRegionOpenInfo(getRegionInfo(), - env.getAssignmentManager().getFavoredNodes(getRegionInfo())); + return RequestConverter.buildRegionOpenInfo(regionInfo, + env.getAssignmentManager().getFavoredNodes(regionInfo), procId); } } public static class RegionCloseOperation extends RegionOperation { private final ServerName destinationServer; - private boolean closed = false; - public RegionCloseOperation(final RemoteProcedure remoteProcedure, - final RegionInfo regionInfo, final ServerName destinationServer) { - super(remoteProcedure, regionInfo); + public RegionCloseOperation(RemoteProcedure remoteProcedure, RegionInfo regionInfo, long procId, + ServerName destinationServer) { + super(remoteProcedure, regionInfo, procId); this.destinationServer = destinationServer; } @@ -454,17 +436,9 @@ public class RSProcedureDispatcher return destinationServer; } - protected void setClosed(final boolean closed) { - this.closed = closed; - } - - public boolean isClosed() { - return closed; - } - public CloseRegionRequest buildCloseRegionRequest(final ServerName serverName) { - return ProtobufUtil.buildCloseRegionRequest(serverName, - getRegionInfo().getRegionName(), getDestinationServer()); + return ProtobufUtil.buildCloseRegionRequest(serverName, regionInfo.getRegionName(), + getDestinationServer(), procId); } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index e407285f9f..bcb1a07b12 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -2228,9 +2228,11 @@ public class HRegionServer extends HasThread implements @Override public void postOpenDeployTasks(final PostOpenDeployContext context) throws IOException { HRegion r = context.getRegion(); + long openProcId = context.getOpenProcId(); long masterSystemTime = context.getMasterSystemTime(); rpcServices.checkOpen(); - LOG.info("Post open deploy tasks for " + r.getRegionInfo().getRegionNameAsString()); + LOG.info("Post open deploy tasks for {}, openProcId={}, masterSystemTime={}", + r.getRegionInfo().getRegionNameAsString(), openProcId, masterSystemTime); // Do checks to see if we need to compact (references or too many files) for (HStore s : r.stores.values()) { if (s.hasReferences() || s.needsCompaction()) { @@ -2247,7 +2249,7 @@ public class HRegionServer extends HasThread implements // Notify master if (!reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.OPENED, - openSeqNum, masterSystemTime, r.getRegionInfo()))) { + openSeqNum, openProcId, masterSystemTime, r.getRegionInfo()))) { throw new IOException( "Failed to report opened region to master: " + r.getRegionInfo().getRegionNameAsString()); } @@ -2263,6 +2265,7 @@ public class HRegionServer extends HasThread implements long openSeqNum = context.getOpenSeqNum(); long masterSystemTime = context.getMasterSystemTime(); RegionInfo[] hris = context.getHris(); + long[] procIds = context.getProcIds(); if (TEST_SKIP_REPORTING_TRANSITION) { // This is for testing only in case there is no master @@ -2301,6 +2304,9 @@ public class HRegionServer extends HasThread implements for (RegionInfo hri: hris) { transition.addRegionInfo(ProtobufUtil.toRegionInfo(hri)); } + for (long procId: procIds) { + transition.addProcId(procId); + } ReportRegionStateTransitionRequest request = builder.build(); int tries = 0; long pauseTime = INIT_PAUSE_TIME_MS; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java index 9b99ff82d6..2054ac78a7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java @@ -3714,8 +3714,8 @@ public class RSRpcServices implements HBaseRPCErrorHandler, regionServer.updateRegionFavoredNodesMapping(regionInfo.getEncodedName(), regionOpenInfo.getFavoredNodesList()); } - regionServer.executorService - .submit(AssignRegionHandler.create(regionServer, regionInfo, tableDesc, masterSystemTime)); + regionServer.executorService.submit(AssignRegionHandler.create(regionServer, regionInfo, + regionOpenInfo.getOpenProcId(), tableDesc, masterSystemTime)); } } @@ -3729,8 +3729,8 @@ public class RSRpcServices implements HBaseRPCErrorHandler, ServerName destination = request.hasDestinationServer() ? ProtobufUtil.toServerName(request.getDestinationServer()) : null; - regionServer.executorService - .submit(UnassignRegionHandler.create(regionServer, encodedName, false, destination)); + regionServer.executorService.submit(UnassignRegionHandler.create(regionServer, encodedName, + request.getCloseProcId(), false, destination)); } private void executeProcedures(RemoteProcedureRequest request) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java index e0638acb16..17f318b227 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java @@ -100,16 +100,23 @@ public interface RegionServerServices extends Server, MutableOnlineRegions, Favo */ class PostOpenDeployContext { private final HRegion region; + private final long openProcId; private final long masterSystemTime; - @InterfaceAudience.Private - public PostOpenDeployContext(HRegion region, long masterSystemTime) { + public PostOpenDeployContext(HRegion region, long openProcId, long masterSystemTime) { this.region = region; + this.openProcId = openProcId; this.masterSystemTime = masterSystemTime; } + public HRegion getRegion() { return region; } + + public long getOpenProcId() { + return openProcId; + } + public long getMasterSystemTime() { return masterSystemTime; } @@ -125,28 +132,46 @@ public interface RegionServerServices extends Server, MutableOnlineRegions, Favo private final TransitionCode code; private final long openSeqNum; private final long masterSystemTime; + private final long[] procIds; private final RegionInfo[] hris; - @InterfaceAudience.Private public RegionStateTransitionContext(TransitionCode code, long openSeqNum, long masterSystemTime, RegionInfo... hris) { this.code = code; this.openSeqNum = openSeqNum; this.masterSystemTime = masterSystemTime; this.hris = hris; + this.procIds = new long[hris.length]; } + + public RegionStateTransitionContext(TransitionCode code, long openSeqNum, long procId, + long masterSystemTime, RegionInfo hri) { + this.code = code; + this.openSeqNum = openSeqNum; + this.masterSystemTime = masterSystemTime; + this.hris = new RegionInfo[] { hri }; + this.procIds = new long[] { procId }; + } + public TransitionCode getCode() { return code; } + public long getOpenSeqNum() { return openSeqNum; } + public long getMasterSystemTime() { return masterSystemTime; } + public RegionInfo[] getHris() { return hris; } + + public long[] getProcIds() { + return procIds; + } } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java index 9a0531c923..9e806bb611 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hbase.regionserver; import java.security.PrivilegedAction; - import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; @@ -30,7 +29,9 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; + import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** @@ -87,7 +88,7 @@ class SplitRequest implements Runnable { // hri_a and hri_b objects may not reflect the regions that will be created, those objects // are created just to pass the information to the reportRegionStateTransition(). if (!server.reportRegionStateTransition(new RegionStateTransitionContext( - TransitionCode.READY_TO_SPLIT, HConstants.NO_SEQNUM, -1, parent, hri_a, hri_b))) { + TransitionCode.READY_TO_SPLIT, HConstants.NO_SEQNUM, -1, parent, hri_a, hri_b))) { LOG.error("Unable to ask master to split " + parent.getRegionNameAsString()); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java index c6fee2e57a..bc2425ba97 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java @@ -51,16 +51,19 @@ public class AssignRegionHandler extends EventHandler { private final RegionInfo regionInfo; + private final long openProcId; + private final TableDescriptor tableDesc; private final long masterSystemTime; private final RetryCounter retryCounter; - public AssignRegionHandler(RegionServerServices server, RegionInfo regionInfo, + public AssignRegionHandler(RegionServerServices server, RegionInfo regionInfo, long openProcId, @Nullable TableDescriptor tableDesc, long masterSystemTime, EventType eventType) { super(server, eventType); this.regionInfo = regionInfo; + this.openProcId = openProcId; this.tableDesc = tableDesc; this.masterSystemTime = masterSystemTime; this.retryCounter = HandlerUtil.getRetryCounter(); @@ -76,7 +79,7 @@ public class AssignRegionHandler extends EventHandler { RegionServerServices rs = getServer(); rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes(), Boolean.TRUE); if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.FAILED_OPEN, - HConstants.NO_SEQNUM, masterSystemTime, regionInfo))) { + HConstants.NO_SEQNUM, openProcId, masterSystemTime, regionInfo))) { throw new IOException( "Failed to report failed open to master: " + regionInfo.getRegionNameAsString()); } @@ -133,7 +136,7 @@ public class AssignRegionHandler extends EventHandler { cleanUpAndReportFailure(e); return; } - rs.postOpenDeployTasks(new PostOpenDeployContext(region, masterSystemTime)); + rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime)); rs.addRegion(region); LOG.info("Opened {}", regionName); Boolean current = rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes()); @@ -156,7 +159,7 @@ public class AssignRegionHandler extends EventHandler { } public static AssignRegionHandler create(RegionServerServices server, RegionInfo regionInfo, - TableDescriptor tableDesc, long masterSystemTime) { + long openProcId, TableDescriptor tableDesc, long masterSystemTime) { EventType eventType; if (regionInfo.isMetaRegion()) { eventType = EventType.M_RS_CLOSE_META; @@ -166,6 +169,7 @@ public class AssignRegionHandler extends EventHandler { } else { eventType = EventType.M_RS_OPEN_REGION; } - return new AssignRegionHandler(server, regionInfo, tableDesc, masterSystemTime, eventType); + return new AssignRegionHandler(server, regionInfo, openProcId, tableDesc, masterSystemTime, + eventType); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java index 0e35a0be70..d4ea004cb2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.executor.EventHandler; import org.apache.hadoop.hbase.executor.EventType; +import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.RegionServerServices; import org.apache.hadoop.hbase.regionserver.RegionServerServices.RegionStateTransitionContext; @@ -123,7 +124,7 @@ public class CloseRegionHandler extends EventHandler { this.rsServices.removeRegion(region, destination); rsServices.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED, - HConstants.NO_SEQNUM, -1, regionInfo)); + HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo)); // Done! Region is closed on this RS LOG.debug("Closed " + region.getRegionInfo().getRegionNameAsString()); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java index 31177ef79c..8b644b0d77 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.executor.EventHandler; import org.apache.hadoop.hbase.executor.EventType; +import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.Region; import org.apache.hadoop.hbase.regionserver.RegionServerServices; @@ -156,15 +157,14 @@ public class OpenRegionHandler extends EventHandler { } } - private void doCleanUpOnFailedOpen(HRegion region) - throws IOException { + private void doCleanUpOnFailedOpen(HRegion region) throws IOException { try { if (region != null) { cleanupFailedOpen(region); } } finally { rsServices.reportRegionStateTransition(new RegionStateTransitionContext( - TransitionCode.FAILED_OPEN, HConstants.NO_SEQNUM, -1, regionInfo)); + TransitionCode.FAILED_OPEN, HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo)); } } @@ -248,19 +248,19 @@ public class OpenRegionHandler extends EventHandler { @Override public void run() { try { - this.services.postOpenDeployTasks(new PostOpenDeployContext(region, masterSystemTime)); + this.services.postOpenDeployTasks( + new PostOpenDeployContext(region, Procedure.NO_PROC_ID, masterSystemTime)); } catch (Throwable e) { String msg = "Exception running postOpenDeployTasks; region=" + this.region.getRegionInfo().getEncodedName(); this.exception = e; - if (e instanceof IOException - && isRegionStillOpening(region.getRegionInfo(), services)) { + if (e instanceof IOException && isRegionStillOpening(region.getRegionInfo(), services)) { server.abort(msg, e); } else { LOG.warn(msg, e); } } - // We're done. Set flag then wake up anyone waiting on thread to complete. + // We're done. Set flag then wake up anyone waiting on thread to complete. this.signaller.set(true); synchronized (this.signaller) { this.signaller.notify(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java index cd38db14c7..3ce7caa003 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java @@ -49,18 +49,22 @@ public class UnassignRegionHandler extends EventHandler { private static final Logger LOG = LoggerFactory.getLogger(UnassignRegionHandler.class); private final String encodedName; + + private final long closeProcId; // If true, the hosting server is aborting. Region close process is different // when we are aborting. + // TODO: not used yet, we still use the old CloseRegionHandler when aborting private final boolean abort; private final ServerName destination; private final RetryCounter retryCounter; - public UnassignRegionHandler(RegionServerServices server, String encodedName, boolean abort, - @Nullable ServerName destination, EventType eventType) { + public UnassignRegionHandler(RegionServerServices server, String encodedName, long closeProcId, + boolean abort, @Nullable ServerName destination, EventType eventType) { super(server, eventType); this.encodedName = encodedName; + this.closeProcId = closeProcId; this.abort = abort; this.destination = destination; this.retryCounter = HandlerUtil.getRetryCounter(); @@ -117,7 +121,7 @@ public class UnassignRegionHandler extends EventHandler { } rs.removeRegion(region, destination); if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED, - HConstants.NO_SEQNUM, -1, region.getRegionInfo()))) { + HConstants.NO_SEQNUM, closeProcId, -1, region.getRegionInfo()))) { throw new IOException("Failed to report close to master: " + regionName); } rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE); @@ -131,13 +135,14 @@ public class UnassignRegionHandler extends EventHandler { } public static UnassignRegionHandler create(RegionServerServices server, String encodedName, - boolean abort, @Nullable ServerName destination) { + long closeProcId, boolean abort, @Nullable ServerName destination) { // Just try our best to determine whether it is for closing meta. It is not the end of the world // if we put the handler into a wrong executor. Region region = server.getRegion(encodedName); EventType eventType = region != null && region.getRegionInfo().isMetaRegion() ? EventType.M_RS_CLOSE_META : EventType.M_RS_CLOSE_REGION; - return new UnassignRegionHandler(server, encodedName, abort, destination, eventType); + return new UnassignRegionHandler(server, encodedName, closeProcId, abort, destination, + eventType); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java index fb6668acb8..9f3acebd38 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java @@ -400,6 +400,13 @@ public abstract class TestAssignmentManagerBase { if (retries == timeoutTimes) { LOG.info("Mark server=" + server + " as dead. retries=" + retries); master.getServerManager().moveFromOnlineToDeadServers(server); + executor.schedule(new Runnable() { + @Override + public void run() { + LOG.info("Sending in CRASH of " + server); + doCrash(server); + } + }, 1, TimeUnit.SECONDS); } throw new SocketTimeoutException("simulate socket timeout"); } else { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java index d34bfbb4f2..0a29958d32 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; @@ -167,7 +168,7 @@ public class TestCloseRegionWhileRSCrash { HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs); ProcedureExecutor procExec = UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); - long dummyProcId = procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName())); + procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName())); ARRIVE.await(); UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName()); UTIL.waitFor(30000, @@ -185,13 +186,12 @@ public class TestCloseRegionWhileRSCrash { 30000); // wait until the timeout value increase three times ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3); - // let's close the connection to make sure that the SCP can not update meta successfully - UTIL.getMiniHBaseCluster().getMaster().getConnection().close(); + // close connection to make sure that we can not finish the TRSP + HMaster master = UTIL.getMiniHBaseCluster().getMaster(); + master.getConnection().close(); RESUME.countDown(); - UTIL.waitFor(30000, () -> procExec.isFinished(dummyProcId)); - Thread.sleep(2000); - // here we restart - UTIL.getMiniHBaseCluster().stopMaster(0).join(); + UTIL.waitFor(30000, () -> !master.isAlive()); + // here we start a new master UTIL.getMiniHBaseCluster().startMaster(); t.join(); // Make sure that the region is online, it may not on the original target server, as we will set diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java index d4745b9840..f03794a31d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure; import static org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface.ServerOperationType.SWITCH_RPC_THROTTLE; +import static org.junit.Assert.fail; import java.io.IOException; import java.util.NavigableMap; @@ -31,7 +32,6 @@ import java.util.concurrent.Future; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; - import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.ServerName; @@ -42,6 +42,7 @@ import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.MockMasterServices; import org.apache.hadoop.hbase.master.assignment.OpenRegionProcedure; +import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher; @@ -62,6 +63,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; + import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos; @Category({ MasterTests.class, MediumTests.class }) @@ -134,19 +136,21 @@ public class TestServerRemoteProcedure { } @Test - public void testRegionOpenProcedureIsNotHandledByDisPatcher() throws Exception { + public void testRegionOpenProcedureIsNotHandledByDispatcher() throws Exception { TableName tableName = TableName.valueOf("testRegionOpenProcedureIsNotHandledByDisPatcher"); RegionInfo hri = RegionInfoBuilder.newBuilder(tableName).setStartKey(Bytes.toBytes(1)) - .setEndKey(Bytes.toBytes(2)).setSplit(false).setRegionId(0).build(); - master.getMasterProcedureExecutor().getEnvironment().getAssignmentManager().getRegionStates() - .getOrCreateRegionStateNode(hri); + .setEndKey(Bytes.toBytes(2)).setSplit(false).setRegionId(0).build(); + MasterProcedureEnv env = master.getMasterProcedureExecutor().getEnvironment(); + env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(hri); + TransitRegionStateProcedure proc = TransitRegionStateProcedure.assign(env, hri, null); ServerName worker = master.getServerManager().getOnlineServersList().get(0); - OpenRegionProcedure openRegionProcedure = new OpenRegionProcedure(hri, worker); + OpenRegionProcedure openRegionProcedure = new OpenRegionProcedure(proc, hri, worker); Future future = submitProcedure(openRegionProcedure); Thread.sleep(2000); rsDispatcher.removeNode(worker); try { future.get(2000, TimeUnit.MILLISECONDS); + fail(); } catch (TimeoutException e) { LOG.info("timeout is expected"); } -- 2.17.1