From 9e75d1fbba1f140037361bab45d77912f723338b Mon Sep 17 00:00:00 2001 From: zhangduo Date: Thu, 21 Mar 2019 14:11:29 +0800 Subject: [PATCH] HBASE-22074 Should use procedure store to persist the state in reportRegionStateTransition --- .../src/main/protobuf/MasterProcedure.proto | 3 + .../assignment/CloseRegionProcedure.java | 24 ++- .../assignment/OpenRegionProcedure.java | 47 ++++- .../assignment/RegionRemoteProcedureBase.java | 100 ++++++++--- .../TransitRegionStateProcedure.java | 170 +++--------------- .../procedure/TestServerRemoteProcedure.java | 16 +- 6 files changed, 173 insertions(+), 187 deletions(-) diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto index 64ac39802d..39878d41f9 100644 --- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto @@ -28,6 +28,7 @@ import "HBase.proto"; import "RPC.proto"; import "Snapshot.proto"; import "Replication.proto"; +import "RegionServerStatus.proto"; // ============================================================================ // WARNING - Compatibility rules @@ -551,6 +552,8 @@ message RegionStateTransitionStateData { message RegionRemoteProcedureBaseStateData { required RegionInfo region = 1; required ServerName target_server = 2; + optional RegionStateTransition.TransitionCode transition_code = 3; + optional int64 seq_id = 4; } message OpenRegionProcedureStateData { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java index f867e96459..6c16450f33 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java @@ -20,15 +20,17 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; -import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionCloseOperation; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation; import org.apache.yetus.audience.InterfaceAudience; + import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.CloseRegionProcedureStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** * The remote procedure used to close a region. @@ -46,9 +48,9 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { super(); } - public CloseRegionProcedure(RegionInfo region, ServerName targetServer, - ServerName assignCandidate) { - super(region, targetServer); + public CloseRegionProcedure(TransitRegionStateProcedure parent, RegionInfo region, + ServerName targetServer, ServerName assignCandidate) { + super(parent, region, targetServer); this.assignCandidate = assignCandidate; } @@ -88,7 +90,17 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { } @Override - protected boolean shouldDispatch(RegionStateNode regionNode) { - return regionNode.isInState(RegionState.State.CLOSING); + protected void reportTransition(RegionStateNode regionNode, TransitionCode transitionCode, + long seqId) throws IOException { + if (transitionCode != TransitionCode.CLOSED) { + throw new UnexpectedStateException("Received report unexpected " + transitionCode + + " transition, " + regionNode.toShortString() + ", " + this + ", expected CLOSED."); + } + } + + @Override + protected void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode, + TransitionCode transitionCode, long seqId) throws IOException { + env.getAssignmentManager().regionClosed(regionNode, true); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java index 4b3a976f28..0e9edcf75d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java @@ -20,15 +20,18 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; -import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation; import org.apache.yetus.audience.InterfaceAudience; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.OpenRegionProcedureStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** * The remote procedure used to open a region. @@ -36,12 +39,15 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.O @InterfaceAudience.Private public class OpenRegionProcedure extends RegionRemoteProcedureBase { + private static final Logger LOG = LoggerFactory.getLogger(OpenRegionProcedure.class); + public OpenRegionProcedure() { super(); } - public OpenRegionProcedure(RegionInfo region, ServerName targetServer) { - super(region, targetServer); + public OpenRegionProcedure(TransitRegionStateProcedure parent, RegionInfo region, + ServerName targetServer) { + super(parent, region, targetServer); } @Override @@ -73,7 +79,38 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase { } @Override - protected boolean shouldDispatch(RegionStateNode regionNode) { - return regionNode.isInState(RegionState.State.OPENING); + protected void reportTransition(RegionStateNode regionNode, TransitionCode transitionCode, + long seqId) throws IOException { + switch (transitionCode) { + case OPENED: + // this is the openSeqNum + if (seqId < 0) { + throw new UnexpectedStateException("Received report unexpected " + TransitionCode.OPENED + + " transition openSeqNum=" + seqId + ", " + regionNode + ", proc=" + this); + } + break; + case FAILED_OPEN: + // nothing to check + break; + default: + throw new UnexpectedStateException( + "Received report unexpected " + transitionCode + " transition, " + + regionNode.toShortString() + ", " + this + ", expected OPENED or FAILED_OPEN."); + } + } + + @Override + protected void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode, + TransitionCode transitionCode, long openSeqNum) throws IOException { + if (openSeqNum < regionNode.getOpenSeqNum()) { + LOG.warn( + "Received report {} transition from {} for {}, pid={} but the new openSeqNum {}" + + " is less than the current one {}, ignoring...", + TransitionCode.OPENED, targetServer, regionNode, getProcId(), openSeqNum, + regionNode.getOpenSeqNum()); + } else { + regionNode.setOpenSeqNum(openSeqNum); + } + env.getAssignmentManager().regionOpened(regionNode); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java index f6d3a2eaa5..65f01e2f78 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java @@ -18,9 +18,11 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface; import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException; @@ -37,6 +39,7 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseStateData; +import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** * The base class for the remote procedures used to open/close a region. @@ -53,16 +56,20 @@ public abstract class RegionRemoteProcedureBase extends Procedure - * This could happen when master restarts. Since we do not know whether a request has already been - * sent to the region server after we add a remote operation to the dispatcher, so the safe way is - * to not persist the dispatched field and try to add the remote operation again. But it is - * possible that we do have already sent the request to region server and it has also sent back - * the response, so here we need to check the region state, if it is not in the expecting state, - * we should give up, otherwise we may hang for ever, as the region server will just ignore - * redundant calls. - */ - protected abstract boolean shouldDispatch(RegionStateNode regionNode); + // do some checks to see if the report is valid, without actually updating meta. + protected abstract void reportTransition(RegionStateNode regionNode, + TransitionCode transitionCode, long seqId) throws IOException; + + // should be called with RegionStateNode locked, to avoid race with the execute method below + public void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode, + ServerName serverName, TransitionCode transitionCode, long seqId) throws IOException { + if (this.transitionCode != null) { + // should be a retry. + return; + } + if (!targetServer.equals(serverName)) { + throw new UnexpectedStateException("Received report from " + serverName + ", expected " + + targetServer + ", " + regionNode + ", proc=" + this); + } + this.transitionCode = transitionCode; + this.seqId = seqId; + boolean succ = false; + try { + // Persist the transition code and openSeqNum(if provided). + // We should not update the hbase:meta directly as this may cause races when master restarts, + // as the old active master may incorrectly report back to RS and cause the new master to hang + // on a OpenRegionProcedure forever. See HBASE-22060 and HBASE-22074 for more details. + env.getMasterServices().getMasterProcedureExecutor().getStore().update(this); + regionNode.getProcedureEvent().wake(env.getProcedureScheduler()); + succ = true; + } finally { + if (!succ) { + // unset the state + this.transitionCode = null; + this.seqId = HConstants.NO_SEQNUM; + } + } + } + + private TransitRegionStateProcedure getParent(MasterProcedureEnv env) { + return (TransitRegionStateProcedure) env.getMasterServices().getMasterProcedureExecutor() + .getProcedure(getParentProcId()); + } + + // actually update the state to meta + protected abstract void updateTransition(MasterProcedureEnv env, RegionStateNode regionNode, + TransitionCode transitionCode, long seqId) throws IOException; @Override protected Procedure[] execute(MasterProcedureEnv env) throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { - if (dispatched) { - // we are done, the parent procedure will check whether we are succeeded. - return null; - } RegionStateNode regionNode = getRegionNode(env); regionNode.lock(); try { - if (!shouldDispatch(regionNode)) { + if (transitionCode != null) { + // TODO: this could fail, and we need to implement the retry with interval logic. + updateTransition(env, regionNode, transitionCode, seqId); + getParent(env).unattachRemoteProc(this); return null; } // The code which wakes us up also needs to lock the RSN so here we do not need to synchronize @@ -170,10 +207,13 @@ public abstract class RegionRemoteProcedureBase extends Procedure