From cf3f6bf59db47553eab7192062bac4a67e3efa19 Mon Sep 17 00:00:00 2001 From: zhangduo Date: Wed, 20 Jun 2018 17:18:53 +0800 Subject: [PATCH] HBASE-20752 Make sure the regions are truly reopened after ReopenTableRegionsProcedure --- .../hadoop/hbase/shaded/protobuf/ProtobufUtil.java | 17 ++++ .../src/main/protobuf/HBase.proto | 6 ++ .../src/main/protobuf/MasterProcedure.proto | 5 +- .../hbase/master/assignment/AssignmentManager.java | 23 ++---- .../master/assignment/MoveRegionProcedure.java | 12 ++- .../hbase/master/assignment/RegionStateStore.java | 8 +- .../hbase/master/assignment/RegionStates.java | 91 +++++++++++++++++++++- .../master/procedure/ModifyTableProcedure.java | 21 +---- .../procedure/ReopenTableRegionsProcedure.java | 78 +++++++++++++++---- 9 files changed, 200 insertions(+), 61 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java index 717ddab..24d2ab7 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java @@ -57,6 +57,7 @@ import org.apache.hadoop.hbase.ExtendedCellBuilderFactory; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseIOException; import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.NamespaceDescriptor; import org.apache.hadoop.hbase.ServerName; @@ -3130,6 +3131,22 @@ public final class ProtobufUtil { return rib.build(); } + public static HBaseProtos.RegionLocation toRegionLocation(HRegionLocation loc) { + HBaseProtos.RegionLocation.Builder builder = HBaseProtos.RegionLocation.newBuilder(); + builder.setRegionInfo(toRegionInfo(loc.getRegion())); + if (loc.getServerName() != null) { + builder.setServerName(toServerName(loc.getServerName())); + } + builder.setSeqNum(loc.getSeqNum()); + return builder.build(); + } + + public static HRegionLocation toRegionLocation(HBaseProtos.RegionLocation proto) { + org.apache.hadoop.hbase.client.RegionInfo regionInfo = toRegionInfo(proto.getRegionInfo()); + ServerName serverName = proto.hasServerName() ? toServerName(proto.getServerName()) : null; + return new HRegionLocation(regionInfo, serverName, proto.getSeqNum()); + } + public static List toSnapshotDescriptionList( GetCompletedSnapshotsResponse response, Pattern pattern) { return response.getSnapshotsList().stream().map(ProtobufUtil::createSnapshotDesc) diff --git a/hbase-protocol-shaded/src/main/protobuf/HBase.proto b/hbase-protocol-shaded/src/main/protobuf/HBase.proto index 0af2ffd..d06bc8b 100644 --- a/hbase-protocol-shaded/src/main/protobuf/HBase.proto +++ b/hbase-protocol-shaded/src/main/protobuf/HBase.proto @@ -267,4 +267,10 @@ message FlushedRegionSequenceId { message FlushedSequenceId { repeated FlushedRegionSequenceId regionSequenceId = 1; +} + +message RegionLocation { + required RegionInfo region_info = 1; + optional ServerName server_name = 2; + required int64 seq_num = 3; } \ No newline at end of file diff --git a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto index 0b4e1d7..39d2824 100644 --- a/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto +++ b/hbase-protocol-shaded/src/main/protobuf/MasterProcedure.proto @@ -441,11 +441,14 @@ message DisablePeerStateData { } enum ReopenTableRegionsState { - REOPEN_TABLE_REGIONS_REOPEN_ALL_REGIONS = 1; + REOPEN_TABLE_REGIONS_GET_REGIONS = 1; + REOPEN_TABLE_REGIONS_REOPEN_REGIONS = 2; + REOPEN_TABLE_REGIONS_CONFIRM_REOPENED = 3; } message ReopenTableRegionsStateData { required TableName table_name = 1; + repeated RegionLocation region = 2; } enum InitMetaState { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 0736435..dbfb6d1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -695,18 +695,6 @@ public class AssignmentManager implements ServerListener { return procs.toArray(UNASSIGN_PROCEDURE_ARRAY_TYPE); } - public MoveRegionProcedure[] createReopenProcedures(final Collection regionInfo) - throws IOException { - final MoveRegionProcedure[] procs = new MoveRegionProcedure[regionInfo.size()]; - int index = 0; - for (RegionInfo hri: regionInfo) { - final ServerName serverName = regionStates.getRegionServerOfRegion(hri); - final RegionPlan plan = new RegionPlan(hri, serverName, serverName); - procs[index++] = createMoveRegionProcedure(plan); - } - return procs; - } - /** * Called by things like DisableTableProcedure to get a list of UnassignProcedure * to unassign the regions of the table. @@ -745,22 +733,21 @@ public class AssignmentManager implements ServerListener { return proc; } - public MoveRegionProcedure createMoveRegionProcedure(final RegionPlan plan) - throws HBaseIOException { + private MoveRegionProcedure createMoveRegionProcedure(RegionPlan plan) throws HBaseIOException { if (plan.getRegionInfo().getTable().isSystemTable()) { List exclude = getExcludedServersForSystemTable(); if (plan.getDestination() != null && exclude.contains(plan.getDestination())) { try { - LOG.info("Can not move " + plan.getRegionInfo() + " to " + plan.getDestination() - + " because the server is not with highest version"); + LOG.info("Can not move " + plan.getRegionInfo() + " to " + plan.getDestination() + + " because the server is not with highest version"); plan.setDestination(getBalancer().randomAssignment(plan.getRegionInfo(), - this.master.getServerManager().createDestinationServersList(exclude))); + this.master.getServerManager().createDestinationServersList(exclude))); } catch (HBaseIOException e) { LOG.warn(e.toString(), e); } } } - return new MoveRegionProcedure(getProcedureEnvironment(), plan); + return new MoveRegionProcedure(getProcedureEnvironment(), plan, true); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java index 6fb73cd..4f9d5a1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MoveRegionProcedure.java @@ -56,11 +56,15 @@ public class MoveRegionProcedure extends AbstractStateMachineRegionProcedure + * Notice that the {@code openSeqNum} in the returned HRegionLocation is also used to indicate the + * state of this region, positive means the region is in {@link State#OPEN}, -1 means + * {@link State#OPENING}. And for regions in other states we do not need reopen them. + */ + public List getRegionsOfTableForReopen(TableName tableName) { + return getTableRegionStateNodes(tableName).stream().map(this::createRegionForReopen) + .filter(r -> r != null).collect(Collectors.toList()); + } + + /** + * Check whether the region has been reopened. The meaning of the {@link HRegionLocation} is the + * same with {@link #getRegionsOfTableForReopen(TableName)}. + *

+ * For a region which is in {@link State#OPEN} before, if the region state is changed or the open + * seq num is changed, we can confirm that it has been reopened. + *

+ * For a region which is in {@link State#OPENING} before, usually it will be in {@link State#OPEN} + * now and we will schedule a MRP to reopen it. But there are several exceptions: + *

    + *
  • The region is in state other than {@link State#OPEN} or {@link State#OPENING}.
  • + *
  • The location of the region has been changed
  • + *
+ * Of course the region could still be in {@link State#OPENING} state and still on the same + * server, then here we will still return a {@link HRegionLocation} for it, just like + * {@link #getRegionsOfTableForReopen(TableName)}. + * @param oldLoc the previous state/location of this region + * @return null if the region has been reopened, otherwise a new {@link HRegionLocation} which + * means we still need to reopen the region. + * @see #getRegionsOfTableForReopen(TableName) + */ + public HRegionLocation checkReopened(HRegionLocation oldLoc) { + RegionStateNode node = getRegionStateNode(oldLoc.getRegion()); + synchronized (node) { + if (oldLoc.getSeqNum() >= 0) { + // in OPEN state before + if (node.isInState(State.OPEN)) { + if (node.getOpenSeqNum() > oldLoc.getSeqNum()) { + // normal case, the region has been reopened + return null; + } else { + // the open seq num does not change, need to reopen again + return new HRegionLocation(node.getRegionInfo(), node.getRegionLocation(), + node.getOpenSeqNum()); + } + } else { + // the state has been changed so we can make sure that the region has been reopened(not + // finished maybe, but not a problem). + return null; + } + } else { + // in OPENING state before + if (!node.isInState(State.OPEN, State.OPENING)) { + // not in OPEN or OPENING state, then we can make sure that the region has been + // reopened(not finished maybe, but not a problem) + return null; + } else { + if (!node.getRegionLocation().equals(oldLoc.getServerName())) { + // the region has been moved, so we can make sure that the region has been reopened. + return null; + } + // normal case, we are still in OPENING state, or the reopen has been opened and the state + // is changed to OPEN. + long openSeqNum = node.isInState(State.OPEN) ? node.getOpenSeqNum() : -1; + return new HRegionLocation(node.getRegionInfo(), node.getRegionLocation(), openSeqNum); + } + } + } + } + /** * @return Return the regions of the table; does not include OFFLINE unless you set * offline to true. Does not include regions that are in the diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java index 6fb9caa..9a45f56 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ModifyTableProcedure.java @@ -124,8 +124,7 @@ public class ModifyTableProcedure break; case MODIFY_TABLE_REOPEN_ALL_REGIONS: if (env.getAssignmentManager().isTableEnabled(getTableName())) { - addChildProcedure(env.getAssignmentManager() - .createReopenProcedures(getRegionInfoList(env))); + addChildProcedure(new ReopenTableRegionsProcedure(getTableName())); } return Flow.NO_MORE_STATE; default: @@ -174,7 +173,7 @@ public class ModifyTableProcedure @Override protected ModifyTableState getState(final int stateId) { - return ModifyTableState.valueOf(stateId); + return ModifyTableState.forNumber(stateId); } @Override @@ -296,22 +295,6 @@ public class ModifyTableProcedure } /** - * Undo the descriptor change (for rollback) - * @param env MasterProcedureEnv - * @throws IOException - **/ - private void restoreTableDescriptor(final MasterProcedureEnv env) throws IOException { - env.getMasterServices().getTableDescriptors().add(unmodifiedTableDescriptor); - - // delete any new column families from the modifiedTableDescriptor. - deleteFromFs(env, modifiedTableDescriptor, unmodifiedTableDescriptor); - - // Make sure regions are opened after table descriptor is updated. - //reOpenAllRegionsIfTableIsOnline(env); - // TODO: NUKE ROLLBACK!!!! - } - - /** * Removes from hdfs the families that are not longer present in the new table descriptor. * @param env MasterProcedureEnv * @throws IOException diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java index 133d6f4..7928c5b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java @@ -18,7 +18,14 @@ package org.apache.hadoop.hbase.master.procedure; import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.hadoop.hbase.HBaseIOException; +import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.master.RegionPlan; +import org.apache.hadoop.hbase.master.assignment.MoveRegionProcedure; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; @@ -31,8 +38,9 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.R import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.ReopenTableRegionsStateData; /** - * Used for non table procedures to reopen the regions for a table. For example, - * {@link org.apache.hadoop.hbase.master.replication.ModifyPeerProcedure}. + * Used for reopening the regions for a table. + *

+ * Currently we use {@link MoveRegionProcedure} to reopen regions. */ @InterfaceAudience.Private public class ReopenTableRegionsProcedure @@ -42,6 +50,8 @@ public class ReopenTableRegionsProcedure private TableName tableName; + private List regions = Collections.emptyList(); + public ReopenTableRegionsProcedure() { } @@ -59,19 +69,53 @@ public class ReopenTableRegionsProcedure return TableOperationType.REGION_EDIT; } + private MoveRegionProcedure createReopenProcedure(MasterProcedureEnv env, HRegionLocation loc) { + try { + return new MoveRegionProcedure(env, + new RegionPlan(loc.getRegion(), loc.getServerName(), loc.getServerName()), false); + } catch (HBaseIOException e) { + // we skip the checks so this should not happen + throw new AssertionError(e); + } + } + @Override protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState state) throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { switch (state) { - case REOPEN_TABLE_REGIONS_REOPEN_ALL_REGIONS: - try { - addChildProcedure(env.getAssignmentManager().createReopenProcedures( - env.getAssignmentManager().getRegionStates().getRegionsOfTable(tableName))); - } catch (IOException e) { - LOG.warn("Failed to schedule reopen procedures for {}", tableName, e); - throw new ProcedureSuspendedException(); + case REOPEN_TABLE_REGIONS_GET_REGIONS: + if (!env.getAssignmentManager().isTableEnabled(tableName)) { + LOG.info("Table {} is disabled, give up reopening its regions"); + return Flow.NO_MORE_STATE; + } + regions = + env.getAssignmentManager().getRegionStates().getRegionsOfTableForReopen(tableName); + setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); + return Flow.HAS_MORE_STATE; + case REOPEN_TABLE_REGIONS_REOPEN_REGIONS: + addChildProcedure(regions.stream().filter(l -> l.getSeqNum() >= 0) + .map(l -> createReopenProcedure(env, l)).toArray(MoveRegionProcedure[]::new)); + setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_CONFIRM_REOPENED); + return Flow.HAS_MORE_STATE; + case REOPEN_TABLE_REGIONS_CONFIRM_REOPENED: + regions = regions.stream().map(env.getAssignmentManager().getRegionStates()::checkReopened) + .filter(l -> l != null).collect(Collectors.toList()); + if (regions.isEmpty()) { + return Flow.NO_MORE_STATE; + } + if (regions.stream().anyMatch(l -> l.getSeqNum() >= 0)) { + setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS); + return Flow.HAS_MORE_STATE; } - return Flow.NO_MORE_STATE; + LOG.info("There are still {} region(s) which need to be reopened for table {} are in " + + "OPENING state, try again later", regions.size(), tableName); + // All the regions need to reopen are in OPENING state which means we can not schedule any + // MRPs. Then sleep for one second, and yield the procedure to let other procedures run + // first and hope next time we can get some regions in other state to make progress. + // TODO: add a delay for ProcedureYieldException so that we do not need to sleep here which + // blocks a procedure worker. + Thread.sleep(1000); + throw new ProcedureYieldException(); default: throw new UnsupportedOperationException("unhandled state=" + state); } @@ -95,20 +139,24 @@ public class ReopenTableRegionsProcedure @Override protected ReopenTableRegionsState getInitialState() { - return ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_ALL_REGIONS; + return ReopenTableRegionsState.REOPEN_TABLE_REGIONS_GET_REGIONS; } @Override protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException { super.serializeStateData(serializer); - serializer.serialize(ReopenTableRegionsStateData.newBuilder() - .setTableName(ProtobufUtil.toProtoTableName(tableName)).build()); + ReopenTableRegionsStateData.Builder builder = ReopenTableRegionsStateData.newBuilder() + .setTableName(ProtobufUtil.toProtoTableName(tableName)); + regions.stream().map(ProtobufUtil::toRegionLocation).forEachOrdered(builder::addRegion); + serializer.serialize(builder.build()); } @Override protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException { super.deserializeStateData(serializer); - tableName = ProtobufUtil - .toTableName(serializer.deserialize(ReopenTableRegionsStateData.class).getTableName()); + ReopenTableRegionsStateData data = serializer.deserialize(ReopenTableRegionsStateData.class); + tableName = ProtobufUtil.toTableName(data.getTableName()); + regions = data.getRegionList().stream().map(ProtobufUtil::toRegionLocation) + .collect(Collectors.toList()); } } -- 2.7.4