From 6d34bec907def976ef0a66108907b2a6deab1fa5 Mon Sep 17 00:00:00 2001 From: zhangduo Date: Tue, 26 Mar 2019 11:28:41 +0800 Subject: [PATCH] HBASE-22074 Should use procedure store to persist the state in reportRegionStateTransition --- .../hbase/shaded/protobuf/ProtobufUtil.java | 32 +-- .../shaded/protobuf/RequestConverter.java | 28 +-- .../src/main/protobuf/Admin.proto | 3 +- .../src/main/protobuf/MasterProcedure.proto | 11 + .../main/protobuf/RegionServerStatus.proto | 1 + .../master/assignment/AssignProcedure.java | 6 +- .../master/assignment/AssignmentManager.java | 12 +- .../assignment/CloseRegionProcedure.java | 26 ++- .../assignment/OpenRegionProcedure.java | 60 ++++- .../assignment/RegionRemoteProcedureBase.java | 219 +++++++++++++----- .../TransitRegionStateProcedure.java | 208 +++-------------- .../master/assignment/UnassignProcedure.java | 4 +- .../procedure/RSProcedureDispatcher.java | 54 ++--- .../hbase/regionserver/HRegionServer.java | 10 +- .../hbase/regionserver/RSRpcServices.java | 8 +- .../regionserver/RegionServerServices.java | 31 ++- .../hbase/regionserver/SplitRequest.java | 5 +- .../handler/AssignRegionHandler.java | 14 +- .../handler/CloseRegionHandler.java | 3 +- .../handler/OpenRegionHandler.java | 14 +- .../handler/UnassignRegionHandler.java | 15 +- .../assignment/TestAssignmentManagerBase.java | 7 + .../TestCloseRegionWhileRSCrash.java | 14 +- .../TestOpenRegionProcedureHang.java | 209 +++++++++++++++++ .../procedure/TestServerRemoteProcedure.java | 16 +- 25 files changed, 636 insertions(+), 374 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestOpenRegionProcedureHang.java diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java index 15a8c8a0a3..336c59cdc8 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java @@ -3003,28 +3003,32 @@ public final class ProtobufUtil { } /** - * Create a CloseRegionRequest for a given region name - * - * @param regionName the name of the region to close - * @return a CloseRegionRequest - */ - public static CloseRegionRequest buildCloseRegionRequest(ServerName server, - final byte[] regionName) { - return ProtobufUtil.buildCloseRegionRequest(server, regionName, null); - } + * Create a CloseRegionRequest for a given region name + * @param regionName the name of the region to close + * @return a CloseRegionRequest + */ + public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName) { + return ProtobufUtil.buildCloseRegionRequest(server, regionName, null); + } + + public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName, + ServerName destinationServer) { + return buildCloseRegionRequest(server, regionName, destinationServer, -1); + } - public static CloseRegionRequest buildCloseRegionRequest(ServerName server, - final byte[] regionName, ServerName destinationServer) { + public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName, + ServerName destinationServer, long closeProcId) { CloseRegionRequest.Builder builder = CloseRegionRequest.newBuilder(); - RegionSpecifier region = RequestConverter.buildRegionSpecifier( - RegionSpecifierType.REGION_NAME, regionName); + RegionSpecifier region = + RequestConverter.buildRegionSpecifier(RegionSpecifierType.REGION_NAME, regionName); builder.setRegion(region); - if (destinationServer != null){ + if (destinationServer != null) { builder.setDestinationServer(toServerName(destinationServer)); } if (server != null) { builder.setServerStartCode(server.getStartcode()); } + builder.setCloseProcId(closeProcId); return builder.build(); } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java index 36c8fab23a..0c58d4b2f1 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java @@ -940,27 +940,6 @@ public final class RequestConverter { return builder.build(); } - /** - * Create a protocol buffer OpenRegionRequest to open a list of regions - * @param server the serverName for the RPC - * @param regionOpenInfos info of a list of regions to open - * @return a protocol buffer OpenRegionRequest - */ - public static OpenRegionRequest buildOpenRegionRequest(ServerName server, - final List>> regionOpenInfos) { - OpenRegionRequest.Builder builder = OpenRegionRequest.newBuilder(); - for (Pair> regionOpenInfo : regionOpenInfos) { - builder.addOpenInfo(buildRegionOpenInfo(regionOpenInfo.getFirst(), - regionOpenInfo.getSecond())); - } - if (server != null) { - builder.setServerStartCode(server.getStartcode()); - } - // send the master's wall clock time as well, so that the RS can refer to it - builder.setMasterSystemTime(EnvironmentEdgeManager.currentTime()); - return builder.build(); - } - /** * Create a protocol buffer OpenRegionRequest for a given region * @param server the serverName for the RPC @@ -971,7 +950,7 @@ public final class RequestConverter { public static OpenRegionRequest buildOpenRegionRequest(ServerName server, final RegionInfo region, List favoredNodes) { OpenRegionRequest.Builder builder = OpenRegionRequest.newBuilder(); - builder.addOpenInfo(buildRegionOpenInfo(region, favoredNodes)); + builder.addOpenInfo(buildRegionOpenInfo(region, favoredNodes, -1L)); if (server != null) { builder.setServerStartCode(server.getStartcode()); } @@ -1622,8 +1601,8 @@ public final class RequestConverter { /** * Create a RegionOpenInfo based on given region info and version of offline node */ - public static RegionOpenInfo buildRegionOpenInfo( - final RegionInfo region, final List favoredNodes) { + public static RegionOpenInfo buildRegionOpenInfo(RegionInfo region, List favoredNodes, + long openProcId) { RegionOpenInfo.Builder builder = RegionOpenInfo.newBuilder(); builder.setRegion(ProtobufUtil.toRegionInfo(region)); if (favoredNodes != null) { @@ -1631,6 +1610,7 @@ public final class RequestConverter { builder.addFavoredNodes(ProtobufUtil.toServerName(server)); } } + builder.setOpenProcId(openProcId); return builder.build(); } diff --git a/hbase-protocol-shaded/src/main/protobuf/Admin.proto b/hbase-protocol-shaded/src/main/protobuf/Admin.proto index c622d589c6..85b9113a27 100644 --- a/hbase-protocol-shaded/src/main/protobuf/Admin.proto +++ b/hbase-protocol-shaded/src/main/protobuf/Admin.proto @@ -88,6 +88,7 @@ message OpenRegionRequest { repeated ServerName favored_nodes = 3; // open region for distributedLogReplay // optional bool DEPRECATED_openForDistributedLogReplay = 4; + optional int64 open_proc_id = 5 [default = -1]; } } @@ -102,7 +103,6 @@ message OpenRegionResponse { } message WarmupRegionRequest { - required RegionInfo regionInfo = 1; } @@ -120,6 +120,7 @@ message CloseRegionRequest { optional ServerName destination_server = 4; // the intended server for this RPC. optional uint64 serverStartCode = 5; + optional int64 close_proc_id = 6 [default = -1]; } message CloseRegionResponse { diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto index 64ac39802d..d5a390ca44 100644 --- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto @@ -28,6 +28,7 @@ import "HBase.proto"; import "RPC.proto"; import "Snapshot.proto"; import "Replication.proto"; +import "RegionServerStatus.proto"; // ============================================================================ // WARNING - Compatibility rules @@ -548,9 +549,19 @@ message RegionStateTransitionStateData { required bool force_new_plan = 3; } +enum RegionRemoteProcedureBaseState { + REGION_REMOTE_PROCEDURE_DISPATCH = 1; + REGION_REMOTE_PROCEDURE_REPORT_SUCCEED = 2; + REGION_REMOTE_PROCEDURE_DISPATCH_FAIL = 3; + REGION_REMOTE_PROCEDURE_SERVER_CRASH = 4; +} + message RegionRemoteProcedureBaseStateData { required RegionInfo region = 1; required ServerName target_server = 2; + required RegionRemoteProcedureBaseState state = 3; + optional RegionStateTransition.TransitionCode transition_code = 4; + optional int64 seq_id = 5; } message OpenRegionProcedureStateData { diff --git a/hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto b/hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto index 002432a2f2..0137cb1608 100644 --- a/hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto +++ b/hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto @@ -96,6 +96,7 @@ message RegionStateTransition { /** For newly opened region, the open seq num is needed */ optional uint64 open_seq_num = 3; + repeated int64 proc_id = 4; enum TransitionCode { OPENED = 0; FAILED_OPEN = 1; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index 33a35453e0..35510d6eef 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -21,7 +21,6 @@ import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; -import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; @@ -38,7 +37,6 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProto * @deprecated Do not use any more. * @see TransitRegionStateProcedure */ -// TODO: Add being able to assign a region to open read-only. @Deprecated @InterfaceAudience.Private public class AssignProcedure extends RegionTransitionProcedure { @@ -121,9 +119,7 @@ public class AssignProcedure extends RegionTransitionProcedure { @Override public RemoteOperation remoteCallBuild(final MasterProcedureEnv env, final ServerName serverName) { - assert serverName.equals(getRegionState(env).getRegionLocation()); - return new RegionOpenOperation(this, getRegionInfo(), - env.getAssignmentManager().getFavoredNodes(getRegionInfo()), false); + return null; } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 2d0c3bee9f..5e43637bd8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -835,8 +835,10 @@ public class AssignmentManager { case CLOSED: assert transition.getRegionInfoCount() == 1 : transition; final RegionInfo hri = ProtobufUtil.toRegionInfo(transition.getRegionInfo(0)); + long procId = + transition.getProcIdCount() > 0 ? transition.getProcId(0) : Procedure.NO_PROC_ID; updateRegionTransition(serverName, transition.getTransitionCode(), hri, - transition.hasOpenSeqNum() ? transition.getOpenSeqNum() : HConstants.NO_SEQNUM); + transition.hasOpenSeqNum() ? transition.getOpenSeqNum() : HConstants.NO_SEQNUM, procId); break; case READY_TO_SPLIT: case SPLIT: @@ -903,7 +905,7 @@ public class AssignmentManager { } private void updateRegionTransition(ServerName serverName, TransitionCode state, - RegionInfo regionInfo, long seqId) throws IOException { + RegionInfo regionInfo, long seqId, long procId) throws IOException { checkMetaLoaded(regionInfo); RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo); @@ -919,7 +921,7 @@ public class AssignmentManager { ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); regionNode.lock(); try { - if (!reportTransition(regionNode, serverNode, state, seqId)) { + if (!reportTransition(regionNode, serverNode, state, seqId, procId)) { // Don't log WARN if shutting down cluster; during shutdown. Avoid the below messages: // 2018-08-13 10:45:10,551 WARN ...AssignmentManager: No matching procedure found for // rit=OPEN, location=ve0538.halxg.cloudera.com,16020,1533493000958, @@ -941,14 +943,14 @@ public class AssignmentManager { } private boolean reportTransition(RegionStateNode regionNode, ServerStateNode serverNode, - TransitionCode state, long seqId) throws IOException { + TransitionCode state, long seqId, long procId) throws IOException { ServerName serverName = serverNode.getServerName(); TransitRegionStateProcedure proc = regionNode.getProcedure(); if (proc == null) { return false; } proc.reportTransition(master.getMasterProcedureExecutor().getEnvironment(), regionNode, - serverName, state, seqId); + serverName, state, seqId, procId); return true; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java index f867e96459..4fd60f0ead 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java @@ -20,15 +20,17 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; -import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionCloseOperation; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation; import org.apache.yetus.audience.InterfaceAudience; + import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.CloseRegionProcedureStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** * The remote procedure used to close a region. @@ -46,9 +48,9 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { super(); } - public CloseRegionProcedure(RegionInfo region, ServerName targetServer, - ServerName assignCandidate) { - super(region, targetServer); + public CloseRegionProcedure(TransitRegionStateProcedure parent, RegionInfo region, + ServerName targetServer, ServerName assignCandidate) { + super(parent, region, targetServer); this.assignCandidate = assignCandidate; } @@ -59,7 +61,7 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { @Override public RemoteOperation remoteCallBuild(MasterProcedureEnv env, ServerName remote) { - return new RegionCloseOperation(this, region, assignCandidate); + return new RegionCloseOperation(this, region, getProcId(), assignCandidate); } @Override @@ -88,7 +90,17 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { } @Override - protected boolean shouldDispatch(RegionStateNode regionNode) { - return regionNode.isInState(RegionState.State.CLOSING); + protected void reportTransition(RegionStateNode regionNode, TransitionCode transitionCode, + long seqId) throws IOException { + if (transitionCode != TransitionCode.CLOSED) { + throw new UnexpectedStateException("Received report unexpected " + transitionCode + + " transition, " + regionNode.toShortString() + ", " + this + ", expected CLOSED."); + } + } + + @Override + protected void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode, + TransitionCode transitionCode, long seqId) throws IOException { + env.getAssignmentManager().regionClosed(regionNode, true); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java index 4b3a976f28..579b7570b0 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java @@ -20,15 +20,18 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; -import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation; import org.apache.yetus.audience.InterfaceAudience; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.OpenRegionProcedureStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** * The remote procedure used to open a region. @@ -36,12 +39,15 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.O @InterfaceAudience.Private public class OpenRegionProcedure extends RegionRemoteProcedureBase { + private static final Logger LOG = LoggerFactory.getLogger(OpenRegionProcedure.class); + public OpenRegionProcedure() { super(); } - public OpenRegionProcedure(RegionInfo region, ServerName targetServer) { - super(region, targetServer); + public OpenRegionProcedure(TransitRegionStateProcedure parent, RegionInfo region, + ServerName targetServer) { + super(parent, region, targetServer); } @Override @@ -51,8 +57,7 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase { @Override public RemoteOperation remoteCallBuild(MasterProcedureEnv env, ServerName remote) { - return new RegionOpenOperation(this, region, env.getAssignmentManager().getFavoredNodes(region), - false); + return new RegionOpenOperation(this, region, getProcId()); } @Override @@ -73,7 +78,48 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase { } @Override - protected boolean shouldDispatch(RegionStateNode regionNode) { - return regionNode.isInState(RegionState.State.OPENING); + protected void reportTransition(RegionStateNode regionNode, TransitionCode transitionCode, + long seqId) throws IOException { + switch (transitionCode) { + case OPENED: + // this is the openSeqNum + if (seqId < 0) { + throw new UnexpectedStateException("Received report unexpected " + TransitionCode.OPENED + + " transition openSeqNum=" + seqId + ", " + regionNode + ", proc=" + this); + } + break; + case FAILED_OPEN: + // nothing to check + break; + default: + throw new UnexpectedStateException( + "Received report unexpected " + transitionCode + " transition, " + + regionNode.toShortString() + ", " + this + ", expected OPENED or FAILED_OPEN."); + } + } + + @Override + protected void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode, + TransitionCode transitionCode, long openSeqNum) throws IOException { + switch (transitionCode) { + case OPENED: + if (openSeqNum < regionNode.getOpenSeqNum()) { + LOG.warn( + "Received report {} transition from {} for {}, pid={} but the new openSeqNum {}" + + " is less than the current one {}, ignoring...", + transitionCode, targetServer, regionNode, getProcId(), openSeqNum, + regionNode.getOpenSeqNum()); + } else { + regionNode.setOpenSeqNum(openSeqNum); + } + env.getAssignmentManager().regionOpened(regionNode); + break; + case FAILED_OPEN: + env.getAssignmentManager().regionFailedOpen(regionNode, false); + break; + default: + throw new UnexpectedStateException("Unexpected transition code: " + transitionCode); + } + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java index f6d3a2eaa5..c4af93751c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java @@ -18,9 +18,11 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface; import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException; @@ -28,6 +30,7 @@ import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureEvent; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureUtil; import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure; import org.apache.hadoop.hbase.procedure2.RemoteProcedureException; @@ -36,7 +39,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; +import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseState; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** * The base class for the remote procedures used to open/close a region. @@ -53,16 +59,25 @@ public abstract class RegionRemoteProcedureBase extends Procedure - * This could happen when master restarts. Since we do not know whether a request has already been - * sent to the region server after we add a remote operation to the dispatcher, so the safe way is - * to not persist the dispatched field and try to add the remote operation again. But it is - * possible that we do have already sent the request to region server and it has also sent back - * the response, so here we need to check the region state, if it is not in the expecting state, - * we should give up, otherwise we may hang for ever, as the region server will just ignore - * redundant calls. - */ - protected abstract boolean shouldDispatch(RegionStateNode regionNode); + // do some checks to see if the report is valid, without actually updating meta. + protected abstract void reportTransition(RegionStateNode regionNode, + TransitionCode transitionCode, long seqId) throws IOException; + + // A bit strange but the procedure store will throw RuntimeException if we can not persist the + // state, so upper layer should take care of this... + private void persistAndWake(MasterProcedureEnv env, RegionStateNode regionNode) { + env.getMasterServices().getMasterProcedureExecutor().getStore().update(this); + regionNode.getProcedureEvent().wake(env.getProcedureScheduler()); + } + + // should be called with RegionStateNode locked, to avoid race with the execute method below + void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName, + TransitionCode transitionCode, long seqId) throws IOException { + if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) { + // should be a retry + return; + } + if (!targetServer.equals(serverName)) { + throw new UnexpectedStateException("Received report from " + serverName + ", expected " + + targetServer + ", " + regionNode + ", proc=" + this); + } + reportTransition(regionNode, transitionCode, seqId); + // this state means we have received the report from RS, does not mean the result is fine, as we + // may received a FAILED_OPEN. + this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED; + this.transitionCode = transitionCode; + this.seqId = seqId; + // Persist the transition code and openSeqNum(if provided). + // We should not update the hbase:meta directly as this may cause races when master restarts, + // as the old active master may incorrectly report back to RS and cause the new master to hang + // on a OpenRegionProcedure forever. See HBASE-22060 and HBASE-22074 for more details. + boolean succ = false; + try { + persistAndWake(env, regionNode); + succ = true; + } finally { + if (!succ) { + this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH; + this.transitionCode = null; + this.seqId = HConstants.NO_SEQNUM; + } + } + } + + void serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName) { + if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) { + // should be a retry + return; + } + this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH; + boolean succ = false; + try { + persistAndWake(env, regionNode); + succ = true; + } finally { + if (!succ) { + this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH; + } + } + } + + private TransitRegionStateProcedure getParent(MasterProcedureEnv env) { + return (TransitRegionStateProcedure) env.getMasterServices().getMasterProcedureExecutor() + .getProcedure(getParentProcId()); + } + + private void unattach(MasterProcedureEnv env) { + getParent(env).unattachRemoteProc(this); + } + + // actually update the state to meta + protected abstract void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode, + TransitionCode transitionCode, long seqId) throws IOException; @Override protected Procedure[] execute(MasterProcedureEnv env) throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { - if (dispatched) { - // we are done, the parent procedure will check whether we are succeeded. - return null; - } RegionStateNode regionNode = getRegionNode(env); regionNode.lock(); try { - if (!shouldDispatch(regionNode)) { - return null; + switch (state) { + case REGION_REMOTE_PROCEDURE_DISPATCH: { + // The code which wakes us up also needs to lock the RSN so here we do not need to + // synchronize + // on the event. + ProcedureEvent event = regionNode.getProcedureEvent(); + try { + env.getRemoteDispatcher().addOperationToNode(targetServer, this); + } catch (FailedRemoteDispatchException e) { + LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " + + "because the server is alread dead, give up and mark the procedure as complete, " + + "the parent procedure will take care of this.", this, region, targetServer, e); + unattach(env); + return null; + } + event.suspend(); + event.suspendIfNotReady(this); + throw new ProcedureSuspendedException(); + } + case REGION_REMOTE_PROCEDURE_REPORT_SUCCEED: + updateTransition(env, regionNode, transitionCode, seqId); + unattach(env); + return null; + case REGION_REMOTE_PROCEDURE_DISPATCH_FAIL: + // the remote call is failed so we do not need to change the region state, just return. + unattach(env); + return null; + case REGION_REMOTE_PROCEDURE_SERVER_CRASH: + env.getAssignmentManager().regionClosed(regionNode, false); + unattach(env); + return null; + default: + throw new IllegalStateException("Unknown state: " + state); } - // The code which wakes us up also needs to lock the RSN so here we do not need to synchronize - // on the event. - ProcedureEvent event = regionNode.getProcedureEvent(); - try { - env.getRemoteDispatcher().addOperationToNode(targetServer, this); - } catch (FailedRemoteDispatchException e) { - LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " + - "because the server is alread dead, give up and mark the procedure as complete, " + - "the parent procedure will take care of this.", this, region, targetServer, e); - return null; - } - dispatched = true; - event.suspend(); - event.suspendIfNotReady(this); + } catch (IOException e) { + long backoff = ProcedureUtil.getBackoffTimeMs(this.attempt++); + LOG.warn("Failed updating meta, suspend {}secs {}; {};", backoff / 1000, this, regionNode, e); + setTimeout(Math.toIntExact(backoff)); + setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); + skipPersistence(); throw new ProcedureSuspendedException(); } finally { regionNode.unlock(); @@ -186,9 +286,14 @@ public abstract class RegionRemoteProcedureBase extends Procedure favoredNodes; - private final boolean openForReplay; - private boolean failedOpen; - public RegionOpenOperation(final RemoteProcedure remoteProcedure, - final RegionInfo regionInfo, final List favoredNodes, - final boolean openForReplay) { - super(remoteProcedure, regionInfo); - this.favoredNodes = favoredNodes; - this.openForReplay = openForReplay; - } - - protected void setFailedOpen(final boolean failedOpen) { - this.failedOpen = failedOpen; - } - - public boolean isFailedOpen() { - return failedOpen; + public RegionOpenOperation(RemoteProcedure remoteProcedure, RegionInfo regionInfo, + long procId) { + super(remoteProcedure, regionInfo, procId); } public OpenRegionRequest.RegionOpenInfo buildRegionOpenInfoRequest( final MasterProcedureEnv env) { - return RequestConverter.buildRegionOpenInfo(getRegionInfo(), - env.getAssignmentManager().getFavoredNodes(getRegionInfo())); + return RequestConverter.buildRegionOpenInfo(regionInfo, + env.getAssignmentManager().getFavoredNodes(regionInfo), procId); } } public static class RegionCloseOperation extends RegionOperation { private final ServerName destinationServer; - private boolean closed = false; - public RegionCloseOperation(final RemoteProcedure remoteProcedure, - final RegionInfo regionInfo, final ServerName destinationServer) { - super(remoteProcedure, regionInfo); + public RegionCloseOperation(RemoteProcedure remoteProcedure, RegionInfo regionInfo, long procId, + ServerName destinationServer) { + super(remoteProcedure, regionInfo, procId); this.destinationServer = destinationServer; } @@ -454,17 +436,9 @@ public class RSProcedureDispatcher return destinationServer; } - protected void setClosed(final boolean closed) { - this.closed = closed; - } - - public boolean isClosed() { - return closed; - } - public CloseRegionRequest buildCloseRegionRequest(final ServerName serverName) { - return ProtobufUtil.buildCloseRegionRequest(serverName, - getRegionInfo().getRegionName(), getDestinationServer()); + return ProtobufUtil.buildCloseRegionRequest(serverName, regionInfo.getRegionName(), + getDestinationServer(), procId); } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index e407285f9f..bcb1a07b12 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -2228,9 +2228,11 @@ public class HRegionServer extends HasThread implements @Override public void postOpenDeployTasks(final PostOpenDeployContext context) throws IOException { HRegion r = context.getRegion(); + long openProcId = context.getOpenProcId(); long masterSystemTime = context.getMasterSystemTime(); rpcServices.checkOpen(); - LOG.info("Post open deploy tasks for " + r.getRegionInfo().getRegionNameAsString()); + LOG.info("Post open deploy tasks for {}, openProcId={}, masterSystemTime={}", + r.getRegionInfo().getRegionNameAsString(), openProcId, masterSystemTime); // Do checks to see if we need to compact (references or too many files) for (HStore s : r.stores.values()) { if (s.hasReferences() || s.needsCompaction()) { @@ -2247,7 +2249,7 @@ public class HRegionServer extends HasThread implements // Notify master if (!reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.OPENED, - openSeqNum, masterSystemTime, r.getRegionInfo()))) { + openSeqNum, openProcId, masterSystemTime, r.getRegionInfo()))) { throw new IOException( "Failed to report opened region to master: " + r.getRegionInfo().getRegionNameAsString()); } @@ -2263,6 +2265,7 @@ public class HRegionServer extends HasThread implements long openSeqNum = context.getOpenSeqNum(); long masterSystemTime = context.getMasterSystemTime(); RegionInfo[] hris = context.getHris(); + long[] procIds = context.getProcIds(); if (TEST_SKIP_REPORTING_TRANSITION) { // This is for testing only in case there is no master @@ -2301,6 +2304,9 @@ public class HRegionServer extends HasThread implements for (RegionInfo hri: hris) { transition.addRegionInfo(ProtobufUtil.toRegionInfo(hri)); } + for (long procId: procIds) { + transition.addProcId(procId); + } ReportRegionStateTransitionRequest request = builder.build(); int tries = 0; long pauseTime = INIT_PAUSE_TIME_MS; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java index 9b99ff82d6..2054ac78a7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java @@ -3714,8 +3714,8 @@ public class RSRpcServices implements HBaseRPCErrorHandler, regionServer.updateRegionFavoredNodesMapping(regionInfo.getEncodedName(), regionOpenInfo.getFavoredNodesList()); } - regionServer.executorService - .submit(AssignRegionHandler.create(regionServer, regionInfo, tableDesc, masterSystemTime)); + regionServer.executorService.submit(AssignRegionHandler.create(regionServer, regionInfo, + regionOpenInfo.getOpenProcId(), tableDesc, masterSystemTime)); } } @@ -3729,8 +3729,8 @@ public class RSRpcServices implements HBaseRPCErrorHandler, ServerName destination = request.hasDestinationServer() ? ProtobufUtil.toServerName(request.getDestinationServer()) : null; - regionServer.executorService - .submit(UnassignRegionHandler.create(regionServer, encodedName, false, destination)); + regionServer.executorService.submit(UnassignRegionHandler.create(regionServer, encodedName, + request.getCloseProcId(), false, destination)); } private void executeProcedures(RemoteProcedureRequest request) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java index e0638acb16..17f318b227 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RegionServerServices.java @@ -100,16 +100,23 @@ public interface RegionServerServices extends Server, MutableOnlineRegions, Favo */ class PostOpenDeployContext { private final HRegion region; + private final long openProcId; private final long masterSystemTime; - @InterfaceAudience.Private - public PostOpenDeployContext(HRegion region, long masterSystemTime) { + public PostOpenDeployContext(HRegion region, long openProcId, long masterSystemTime) { this.region = region; + this.openProcId = openProcId; this.masterSystemTime = masterSystemTime; } + public HRegion getRegion() { return region; } + + public long getOpenProcId() { + return openProcId; + } + public long getMasterSystemTime() { return masterSystemTime; } @@ -125,28 +132,46 @@ public interface RegionServerServices extends Server, MutableOnlineRegions, Favo private final TransitionCode code; private final long openSeqNum; private final long masterSystemTime; + private final long[] procIds; private final RegionInfo[] hris; - @InterfaceAudience.Private public RegionStateTransitionContext(TransitionCode code, long openSeqNum, long masterSystemTime, RegionInfo... hris) { this.code = code; this.openSeqNum = openSeqNum; this.masterSystemTime = masterSystemTime; this.hris = hris; + this.procIds = new long[hris.length]; } + + public RegionStateTransitionContext(TransitionCode code, long openSeqNum, long procId, + long masterSystemTime, RegionInfo hri) { + this.code = code; + this.openSeqNum = openSeqNum; + this.masterSystemTime = masterSystemTime; + this.hris = new RegionInfo[] { hri }; + this.procIds = new long[] { procId }; + } + public TransitionCode getCode() { return code; } + public long getOpenSeqNum() { return openSeqNum; } + public long getMasterSystemTime() { return masterSystemTime; } + public RegionInfo[] getHris() { return hris; } + + public long[] getProcIds() { + return procIds; + } } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java index 9a0531c923..9e806bb611 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hbase.regionserver; import java.security.PrivilegedAction; - import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; @@ -30,7 +29,9 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import org.apache.hbase.thirdparty.com.google.common.base.Preconditions; + import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** @@ -87,7 +88,7 @@ class SplitRequest implements Runnable { // hri_a and hri_b objects may not reflect the regions that will be created, those objects // are created just to pass the information to the reportRegionStateTransition(). if (!server.reportRegionStateTransition(new RegionStateTransitionContext( - TransitionCode.READY_TO_SPLIT, HConstants.NO_SEQNUM, -1, parent, hri_a, hri_b))) { + TransitionCode.READY_TO_SPLIT, HConstants.NO_SEQNUM, -1, parent, hri_a, hri_b))) { LOG.error("Unable to ask master to split " + parent.getRegionNameAsString()); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java index c6fee2e57a..bc2425ba97 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java @@ -51,16 +51,19 @@ public class AssignRegionHandler extends EventHandler { private final RegionInfo regionInfo; + private final long openProcId; + private final TableDescriptor tableDesc; private final long masterSystemTime; private final RetryCounter retryCounter; - public AssignRegionHandler(RegionServerServices server, RegionInfo regionInfo, + public AssignRegionHandler(RegionServerServices server, RegionInfo regionInfo, long openProcId, @Nullable TableDescriptor tableDesc, long masterSystemTime, EventType eventType) { super(server, eventType); this.regionInfo = regionInfo; + this.openProcId = openProcId; this.tableDesc = tableDesc; this.masterSystemTime = masterSystemTime; this.retryCounter = HandlerUtil.getRetryCounter(); @@ -76,7 +79,7 @@ public class AssignRegionHandler extends EventHandler { RegionServerServices rs = getServer(); rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes(), Boolean.TRUE); if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.FAILED_OPEN, - HConstants.NO_SEQNUM, masterSystemTime, regionInfo))) { + HConstants.NO_SEQNUM, openProcId, masterSystemTime, regionInfo))) { throw new IOException( "Failed to report failed open to master: " + regionInfo.getRegionNameAsString()); } @@ -133,7 +136,7 @@ public class AssignRegionHandler extends EventHandler { cleanUpAndReportFailure(e); return; } - rs.postOpenDeployTasks(new PostOpenDeployContext(region, masterSystemTime)); + rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime)); rs.addRegion(region); LOG.info("Opened {}", regionName); Boolean current = rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes()); @@ -156,7 +159,7 @@ public class AssignRegionHandler extends EventHandler { } public static AssignRegionHandler create(RegionServerServices server, RegionInfo regionInfo, - TableDescriptor tableDesc, long masterSystemTime) { + long openProcId, TableDescriptor tableDesc, long masterSystemTime) { EventType eventType; if (regionInfo.isMetaRegion()) { eventType = EventType.M_RS_CLOSE_META; @@ -166,6 +169,7 @@ public class AssignRegionHandler extends EventHandler { } else { eventType = EventType.M_RS_OPEN_REGION; } - return new AssignRegionHandler(server, regionInfo, tableDesc, masterSystemTime, eventType); + return new AssignRegionHandler(server, regionInfo, openProcId, tableDesc, masterSystemTime, + eventType); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java index 0e35a0be70..d4ea004cb2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.executor.EventHandler; import org.apache.hadoop.hbase.executor.EventType; +import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.RegionServerServices; import org.apache.hadoop.hbase.regionserver.RegionServerServices.RegionStateTransitionContext; @@ -123,7 +124,7 @@ public class CloseRegionHandler extends EventHandler { this.rsServices.removeRegion(region, destination); rsServices.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED, - HConstants.NO_SEQNUM, -1, regionInfo)); + HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo)); // Done! Region is closed on this RS LOG.debug("Closed " + region.getRegionInfo().getRegionNameAsString()); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java index 31177ef79c..8b644b0d77 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.executor.EventHandler; import org.apache.hadoop.hbase.executor.EventType; +import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.Region; import org.apache.hadoop.hbase.regionserver.RegionServerServices; @@ -156,15 +157,14 @@ public class OpenRegionHandler extends EventHandler { } } - private void doCleanUpOnFailedOpen(HRegion region) - throws IOException { + private void doCleanUpOnFailedOpen(HRegion region) throws IOException { try { if (region != null) { cleanupFailedOpen(region); } } finally { rsServices.reportRegionStateTransition(new RegionStateTransitionContext( - TransitionCode.FAILED_OPEN, HConstants.NO_SEQNUM, -1, regionInfo)); + TransitionCode.FAILED_OPEN, HConstants.NO_SEQNUM, Procedure.NO_PROC_ID, -1, regionInfo)); } } @@ -248,19 +248,19 @@ public class OpenRegionHandler extends EventHandler { @Override public void run() { try { - this.services.postOpenDeployTasks(new PostOpenDeployContext(region, masterSystemTime)); + this.services.postOpenDeployTasks( + new PostOpenDeployContext(region, Procedure.NO_PROC_ID, masterSystemTime)); } catch (Throwable e) { String msg = "Exception running postOpenDeployTasks; region=" + this.region.getRegionInfo().getEncodedName(); this.exception = e; - if (e instanceof IOException - && isRegionStillOpening(region.getRegionInfo(), services)) { + if (e instanceof IOException && isRegionStillOpening(region.getRegionInfo(), services)) { server.abort(msg, e); } else { LOG.warn(msg, e); } } - // We're done. Set flag then wake up anyone waiting on thread to complete. + // We're done. Set flag then wake up anyone waiting on thread to complete. this.signaller.set(true); synchronized (this.signaller) { this.signaller.notify(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java index cd38db14c7..3ce7caa003 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java @@ -49,18 +49,22 @@ public class UnassignRegionHandler extends EventHandler { private static final Logger LOG = LoggerFactory.getLogger(UnassignRegionHandler.class); private final String encodedName; + + private final long closeProcId; // If true, the hosting server is aborting. Region close process is different // when we are aborting. + // TODO: not used yet, we still use the old CloseRegionHandler when aborting private final boolean abort; private final ServerName destination; private final RetryCounter retryCounter; - public UnassignRegionHandler(RegionServerServices server, String encodedName, boolean abort, - @Nullable ServerName destination, EventType eventType) { + public UnassignRegionHandler(RegionServerServices server, String encodedName, long closeProcId, + boolean abort, @Nullable ServerName destination, EventType eventType) { super(server, eventType); this.encodedName = encodedName; + this.closeProcId = closeProcId; this.abort = abort; this.destination = destination; this.retryCounter = HandlerUtil.getRetryCounter(); @@ -117,7 +121,7 @@ public class UnassignRegionHandler extends EventHandler { } rs.removeRegion(region, destination); if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED, - HConstants.NO_SEQNUM, -1, region.getRegionInfo()))) { + HConstants.NO_SEQNUM, closeProcId, -1, region.getRegionInfo()))) { throw new IOException("Failed to report close to master: " + regionName); } rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE); @@ -131,13 +135,14 @@ public class UnassignRegionHandler extends EventHandler { } public static UnassignRegionHandler create(RegionServerServices server, String encodedName, - boolean abort, @Nullable ServerName destination) { + long closeProcId, boolean abort, @Nullable ServerName destination) { // Just try our best to determine whether it is for closing meta. It is not the end of the world // if we put the handler into a wrong executor. Region region = server.getRegion(encodedName); EventType eventType = region != null && region.getRegionInfo().isMetaRegion() ? EventType.M_RS_CLOSE_META : EventType.M_RS_CLOSE_REGION; - return new UnassignRegionHandler(server, encodedName, abort, destination, eventType); + return new UnassignRegionHandler(server, encodedName, closeProcId, abort, destination, + eventType); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java index fb6668acb8..9f3acebd38 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManagerBase.java @@ -400,6 +400,13 @@ public abstract class TestAssignmentManagerBase { if (retries == timeoutTimes) { LOG.info("Mark server=" + server + " as dead. retries=" + retries); master.getServerManager().moveFromOnlineToDeadServers(server); + executor.schedule(new Runnable() { + @Override + public void run() { + LOG.info("Sending in CRASH of " + server); + doCrash(server); + } + }, 1, TimeUnit.SECONDS); } throw new SocketTimeoutException("simulate socket timeout"); } else { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java index d34bfbb4f2..0a29958d32 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; @@ -167,7 +168,7 @@ public class TestCloseRegionWhileRSCrash { HRegionServer dstRs = UTIL.getOtherRegionServer(srcRs); ProcedureExecutor procExec = UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); - long dummyProcId = procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName())); + procExec.submitProcedure(new DummyServerProcedure(srcRs.getServerName())); ARRIVE.await(); UTIL.getMiniHBaseCluster().killRegionServer(srcRs.getServerName()); UTIL.waitFor(30000, @@ -185,13 +186,12 @@ public class TestCloseRegionWhileRSCrash { 30000); // wait until the timeout value increase three times ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3); - // let's close the connection to make sure that the SCP can not update meta successfully - UTIL.getMiniHBaseCluster().getMaster().getConnection().close(); + // close connection to make sure that we can not finish the TRSP + HMaster master = UTIL.getMiniHBaseCluster().getMaster(); + master.getConnection().close(); RESUME.countDown(); - UTIL.waitFor(30000, () -> procExec.isFinished(dummyProcId)); - Thread.sleep(2000); - // here we restart - UTIL.getMiniHBaseCluster().stopMaster(0).join(); + UTIL.waitFor(30000, () -> !master.isAlive()); + // here we start a new master UTIL.getMiniHBaseCluster().startMaster(); t.join(); // Make sure that the region is online, it may not on the original target server, as we will set diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestOpenRegionProcedureHang.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestOpenRegionProcedureHang.java new file mode 100644 index 0000000000..0463721656 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestOpenRegionProcedureHang.java @@ -0,0 +1,209 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.assignment; + +import java.io.IOException; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.PleaseHoldException; +import org.apache.hadoop.hbase.StartMiniClusterOption; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; +import org.apache.hadoop.hbase.master.RegionPlan; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; +import org.apache.zookeeper.KeeperException; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse; + +/** + * See HBASE-22060 and HBASE-22074 for more details. + */ +@Category({ MasterTests.class, MediumTests.class }) +public class TestOpenRegionProcedureHang { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestOpenRegionProcedureHang.class); + + private static final Logger LOG = LoggerFactory.getLogger(TestOpenRegionProcedureHang.class); + + private static CountDownLatch ARRIVE; + private static CountDownLatch RESUME; + + private static CountDownLatch FINISH; + + private static CountDownLatch ABORT; + + private static final class AssignmentManagerForTest extends AssignmentManager { + + public AssignmentManagerForTest(MasterServices master) { + super(master); + } + + @Override + public ReportRegionStateTransitionResponse reportRegionStateTransition( + ReportRegionStateTransitionRequest req) throws PleaseHoldException { + RegionStateTransition transition = req.getTransition(0); + if (transition.getTransitionCode() == TransitionCode.OPENED && + ProtobufUtil.toTableName(transition.getRegionInfo(0).getTableName()).equals(NAME) && + ARRIVE != null) { + ARRIVE.countDown(); + try { + RESUME.await(); + RESUME = null; + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + try { + return super.reportRegionStateTransition(req); + } finally { + FINISH.countDown(); + } + } else { + return super.reportRegionStateTransition(req); + } + } + } + + public static final class HMasterForTest extends HMaster { + + public HMasterForTest(Configuration conf) throws IOException, KeeperException { + super(conf); + } + + @Override + protected AssignmentManager createAssignmentManager(MasterServices master) { + return new AssignmentManagerForTest(master); + } + + @Override + public void abort(String reason, Throwable cause) { + // hang here so we can finish the reportRegionStateTransition call, which is the most + // important part to reproduce the bug + if (ABORT != null) { + try { + ABORT.await(); + ABORT = null; + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + super.abort(reason, cause); + } + } + + private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + private static TableName NAME = TableName.valueOf("Open"); + + private static byte[] CF = Bytes.toBytes("cf"); + + @BeforeClass + public static void setUp() throws Exception { + Configuration conf = UTIL.getConfiguration(); + conf.setClass(HConstants.MASTER_IMPL, HMasterForTest.class, HMaster.class); + + // make sure we do not timeout when caling reportRegionStateTransition + conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 10 * 60 * 1000); + conf.setInt(HConstants.HBASE_RPC_SHORTOPERATION_TIMEOUT_KEY, 10 * 60 * 1000); + UTIL + .startMiniCluster(StartMiniClusterOption.builder().numMasters(2).numRegionServers(3).build()); + UTIL.createTable(NAME, CF); + UTIL.waitTableAvailable(NAME); + UTIL.getAdmin().balancerSwitch(false, true); + } + + @AfterClass + public static void tearDown() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void test() throws InterruptedException, KeeperException, IOException { + RegionInfo region = UTIL.getMiniHBaseCluster().getRegions(NAME).get(0).getRegionInfo(); + AssignmentManager am = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager(); + + HRegionServer rs1 = UTIL.getRSForFirstRegionInTable(NAME); + HRegionServer rs2 = UTIL.getOtherRegionServer(rs1); + + ARRIVE = new CountDownLatch(1); + RESUME = new CountDownLatch(1); + FINISH = new CountDownLatch(1); + ABORT = new CountDownLatch(1); + am.moveAsync(new RegionPlan(region, rs1.getServerName(), rs2.getServerName())); + + ARRIVE.await(); + ARRIVE = null; + HMaster master = UTIL.getMiniHBaseCluster().getMaster(); + master.getZooKeeper().close(); + UTIL.waitFor(30000, () -> { + for (MasterThread mt : UTIL.getMiniHBaseCluster().getMasterThreads()) { + if (mt.getMaster() != master && mt.getMaster().isActiveMaster()) { + return mt.getMaster().isInitialized(); + } + } + return false; + }); + ProcedureExecutor procExec = + UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); + UTIL.waitFor(30000, + () -> procExec.getProcedures().stream().filter(p -> p instanceof OpenRegionProcedure) + .map(p -> (OpenRegionProcedure) p).anyMatch(p -> p.region.getTable().equals(NAME))); + OpenRegionProcedure proc = procExec.getProcedures().stream() + .filter(p -> p instanceof OpenRegionProcedure).map(p -> (OpenRegionProcedure) p) + .filter(p -> p.region.getTable().equals(NAME)).findFirst().get(); + // wait a bit to let the OpenRegionProcedure send out the request + Thread.sleep(2000); + RESUME.countDown(); + if (!FINISH.await(15, TimeUnit.SECONDS)) { + LOG.info("Wait reportRegionStateTransition to finish timed out, this is possible if" + + " we update the procedure store, as the WALProcedureStore" + + " will retry forever to roll the writer if it is not closed"); + } + FINISH = null; + // if the reportRegionTransition is finished, wait a bit to let it return the data to RS + Thread.sleep(2000); + ABORT.countDown(); + + UTIL.waitFor(30000, () -> procExec.isFinished(proc.getProcId())); + UTIL.waitFor(30000, () -> procExec.isFinished(proc.getParentProcId())); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java index d4745b9840..f03794a31d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.master.procedure; import static org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface.ServerOperationType.SWITCH_RPC_THROTTLE; +import static org.junit.Assert.fail; import java.io.IOException; import java.util.NavigableMap; @@ -31,7 +32,6 @@ import java.util.concurrent.Future; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; - import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.ServerName; @@ -42,6 +42,7 @@ import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.MockMasterServices; import org.apache.hadoop.hbase.master.assignment.OpenRegionProcedure; +import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher; @@ -62,6 +63,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; + import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos; @Category({ MasterTests.class, MediumTests.class }) @@ -134,19 +136,21 @@ public class TestServerRemoteProcedure { } @Test - public void testRegionOpenProcedureIsNotHandledByDisPatcher() throws Exception { + public void testRegionOpenProcedureIsNotHandledByDispatcher() throws Exception { TableName tableName = TableName.valueOf("testRegionOpenProcedureIsNotHandledByDisPatcher"); RegionInfo hri = RegionInfoBuilder.newBuilder(tableName).setStartKey(Bytes.toBytes(1)) - .setEndKey(Bytes.toBytes(2)).setSplit(false).setRegionId(0).build(); - master.getMasterProcedureExecutor().getEnvironment().getAssignmentManager().getRegionStates() - .getOrCreateRegionStateNode(hri); + .setEndKey(Bytes.toBytes(2)).setSplit(false).setRegionId(0).build(); + MasterProcedureEnv env = master.getMasterProcedureExecutor().getEnvironment(); + env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(hri); + TransitRegionStateProcedure proc = TransitRegionStateProcedure.assign(env, hri, null); ServerName worker = master.getServerManager().getOnlineServersList().get(0); - OpenRegionProcedure openRegionProcedure = new OpenRegionProcedure(hri, worker); + OpenRegionProcedure openRegionProcedure = new OpenRegionProcedure(proc, hri, worker); Future future = submitProcedure(openRegionProcedure); Thread.sleep(2000); rsDispatcher.removeNode(worker); try { future.get(2000, TimeUnit.MILLISECONDS); + fail(); } catch (TimeoutException e) { LOG.info("timeout is expected"); } -- 2.17.1