From 1fc1f6692dd945ad7a4ca2ff1497b0adeb304b41 Mon Sep 17 00:00:00 2001
From: Jingyun Tian
Date: Tue, 19 Mar 2019 14:45:00 +0800
Subject: [PATCH] HBASE-21965 Fix failed split and merge transactions that
 have failed to roll back

---
 .../apache/hadoop/hbase/client/HBaseHbck.java |  16 ++
 .../org/apache/hadoop/hbase/client/Hbck.java  |   6 +
 .../shaded/protobuf/RequestConverter.java     |   9 +
 .../src/main/protobuf/Master.proto            |  16 ++
 .../hbase/master/MasterRpcServices.java       | 175 +++++++++++++++++
 .../assignment/SplitTableRegionProcedure.java |  60 +++---
 .../apache/hadoop/hbase/client/TestHbck.java  | 180 +++++++++++++++++-
 .../TestSplitTransactionOnCluster.java        |   2 +-
 8 files changed, 435 insertions(+), 29 deletions(-)

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
index a276017b0c..087c78e80b 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
@@ -19,10 +19,12 @@ package org.apache.hadoop.hbase.client;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.Callable;
 import java.util.stream.Collectors;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
@@ -172,4 +174,18 @@ public class HBaseHbck implements Hbck {
       throw new IOException(se);
     }
   }
+
+  @Override
+  public Map<String, MasterProtos.REGION_ERROR_TYPE>
+      getFailedSplitMergeLegacyRegions(List<TableName> tableNames) throws IOException {
+    try {
+      MasterProtos.GetFailedSplitMergeLegacyRegionsResponse response =
+          this.hbck.getFailedSplitMergeLegacyRegions(rpcControllerFactory.newController(),
+            RequestConverter.toGetFailedSplitMergeLegacyRegionsRequest(tableNames));
+      return response.getErrorsMap();
+    } catch (ServiceException se) {
+      LOG.debug("check region consistency failed", se);
+      throw new IOException(se);
+    }
+  }
 }

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
index e88805cdcc..f5664e5b47 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
@@ -20,12 +20,15 @@ package org.apache.hadoop.hbase.client;
 
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.hadoop.hbase.Abortable;
 import org.apache.hadoop.hbase.HBaseInterfaceAudience;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.yetus.audience.InterfaceAudience;
 
 import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
 
 /**
  * Hbck fixup tool APIs. Obtain an instance from {@link ClusterConnection#getHbck()} and call
@@ -106,4 +109,7 @@ public interface Hbck extends Abortable, Closeable {
   List<Long> scheduleServerCrashProcedure(List<HBaseProtos.ServerName> serverNames)
       throws IOException;
+
+  Map<String, MasterProtos.REGION_ERROR_TYPE>
+      getFailedSplitMergeLegacyRegions(List<TableName> tableNames) throws IOException;
 }
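For orientation, here is a minimal client-side sketch of the new API. The connection cast, table name, and main-class scaffolding are illustrative assumptions, not part of this patch; the entry point itself is ClusterConnection#getHbck() as the javadoc above states.

    import java.util.Arrays;
    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.TableName;
    import org.apache.hadoop.hbase.client.ClusterConnection;
    import org.apache.hadoop.hbase.client.ConnectionFactory;
    import org.apache.hadoop.hbase.client.Hbck;
    import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;

    public final class FindFailedSplitMergeRegions {
      public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // ClusterConnection is the internal connection type that exposes getHbck().
        try (ClusterConnection conn =
                (ClusterConnection) ConnectionFactory.createConnection(conf);
            Hbck hbck = conn.getHbck()) {
          // Encoded region name -> error type, as returned by the new RPC.
          Map<String, MasterProtos.REGION_ERROR_TYPE> errors =
              hbck.getFailedSplitMergeLegacyRegions(Arrays.asList(TableName.valueOf("t1")));
          errors.forEach((encodedName, type) -> System.out.println(encodedName + " => " + type));
        }
      }
    }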
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java
index 36c8fab23a..e7b6624c6e 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java
@@ -1913,6 +1913,15 @@ public final class RequestConverter {
     return b.addAllServerName(serverNames).build();
   }
 
+  public static MasterProtos.GetFailedSplitMergeLegacyRegionsRequest
+      toGetFailedSplitMergeLegacyRegionsRequest(List<TableName> tableNames) {
+    MasterProtos.GetFailedSplitMergeLegacyRegionsRequest.Builder b =
+        MasterProtos.GetFailedSplitMergeLegacyRegionsRequest.newBuilder();
+    List<HBaseProtos.TableName> protoTableNames = tableNames.stream()
+        .map(tableName -> ProtobufUtil.toProtoTableName(tableName)).collect(Collectors.toList());
+    return b.addAllTable(protoTableNames).build();
+  }
+
   private static List<RegionSpecifier> toEncodedRegionNameRegionSpecifiers(
       List<String> encodedRegionNames) {
     return encodedRegionNames.stream().

diff --git a/hbase-protocol-shaded/src/main/protobuf/Master.proto b/hbase-protocol-shaded/src/main/protobuf/Master.proto
index 4ed0ad5c0e..52a89f0a22 100644
--- a/hbase-protocol-shaded/src/main/protobuf/Master.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/Master.proto
@@ -1093,6 +1093,19 @@ message ScheduleServerCrashProcedureResponse {
   repeated uint64 pid = 1;
 }
 
+message GetFailedSplitMergeLegacyRegionsRequest {
+  repeated TableName table = 1;
+}
+
+enum REGION_ERROR_TYPE {
+  daughter_merged_region_not_online = 0;
+  orphan_region_on_hdfs = 1;
+}
+
+message GetFailedSplitMergeLegacyRegionsResponse {
+  map<string, REGION_ERROR_TYPE> errors = 1;
+}
+
 service HbckService {
   /** Update state of the table in meta only*/
   rpc SetTableStateInMeta(SetTableStateInMetaRequest)
@@ -1123,4 +1136,7 @@ service HbckService {
   /** Schedule a ServerCrashProcedure to help recover a crash server */
   rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
     returns(ScheduleServerCrashProcedureResponse);
+
+  rpc getFailedSplitMergeLegacyRegions(GetFailedSplitMergeLegacyRegionsRequest)
+    returns(GetFailedSplitMergeLegacyRegionsResponse);
 }
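The new messages round-trip through the RequestConverter helper above exactly as HBaseHbck does internally; building a request by hand looks like this (the table names are placeholders for illustration):

    // Sketch: build the request the same way HBaseHbck does.
    MasterProtos.GetFailedSplitMergeLegacyRegionsRequest req =
        RequestConverter.toGetFailedSplitMergeLegacyRegionsRequest(
            Arrays.asList(TableName.valueOf("t1"), TableName.valueOf("t2")));
    // The response's "errors" field maps encoded region name -> REGION_ERROR_TYPE.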
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
index b943000f6e..a1ac1d1231 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
@@ -25,14 +25,17 @@ import java.net.BindException;
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.stream.Collectors;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.ClusterMetricsBuilder;
 import org.apache.hadoop.hbase.DoNotRetryIOException;
@@ -66,7 +69,10 @@ import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
 import org.apache.hadoop.hbase.ipc.RpcServerFactory;
 import org.apache.hadoop.hbase.ipc.RpcServerInterface;
 import org.apache.hadoop.hbase.ipc.ServerRpcController;
+import org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure;
+import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
 import org.apache.hadoop.hbase.master.assignment.RegionStates;
+import org.apache.hadoop.hbase.master.assignment.SplitTableRegionProcedure;
 import org.apache.hadoop.hbase.master.locking.LockProcedure;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
@@ -103,8 +109,10 @@ import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.ForeignExceptionUtil;
 import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.PairOfSameType;
 import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
 import org.apache.yetus.audience.InterfaceAudience;
@@ -2494,6 +2502,173 @@ public class MasterRpcServices extends RSRpcServices
     }
   }
 
+  @Override
+  public MasterProtos.GetFailedSplitMergeLegacyRegionsResponse getFailedSplitMergeLegacyRegions(
+      RpcController controller, MasterProtos.GetFailedSplitMergeLegacyRegionsRequest request)
+      throws ServiceException {
+    List<HBaseProtos.TableName> tables = request.getTableList();
+
+    Map<String, MasterProtos.REGION_ERROR_TYPE> errorRegions = new HashMap<>();
+    try {
+      for (HBaseProtos.TableName tableName : tables) {
+        errorRegions.putAll(checkRegionConsistency(ProtobufUtil.toTableName(tableName)));
+      }
+    } catch (IOException e) {
+      throw new ServiceException(e);
+    }
+    return MasterProtos.GetFailedSplitMergeLegacyRegionsResponse.newBuilder()
+        .putAllErrors(errorRegions).build();
+  }
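A caller of this RPC would typically branch on the error type. A hedged sketch of the follow-up actions (the assign step mirrors what the tests later in this patch do; how to handle an orphan directory is an operator decision, not something this patch automates):

    // errors is the map returned by hbck.getFailedSplitMergeLegacyRegions(...)
    for (Map.Entry<String, MasterProtos.REGION_ERROR_TYPE> e : errors.entrySet()) {
      switch (e.getValue()) {
        case daughter_merged_region_not_online:
          // The region is in meta but never came online: schedule an assign via hbck.
          hbck.assigns(Arrays.asList(e.getKey()));
          break;
        case orphan_region_on_hdfs:
          // A directory exists on HDFS with no meta entry: inspect it and decide
          // whether to sideline or delete it by hand.
          System.out.println("orphan region dir: " + e.getKey());
          break;
        default:
          break;
      }
    }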
+
+  private Map<String, MasterProtos.REGION_ERROR_TYPE> checkRegionConsistency(TableName tableName)
+      throws IOException {
+    if (!MetaTableAccessor.tableExists(master.getConnection(), tableName)) {
+      throw new IOException("table " + tableName.getNameAsString() + " doesn't exist");
+    }
+    if (!MetaTableAccessor.getTableState(master.getConnection(), tableName).isEnabled()) {
+      throw new IOException("table " + tableName.getNameAsString() + " is not enabled yet");
+    }
+    final Map<String, MasterProtos.REGION_ERROR_TYPE> problemRegions = new HashMap<>();
+
+    // Case 1. find orphan regions on HDFS
+    // Orphan regions can be left over from a failed split procedure whose daughter region
+    // directories were created before the procedure aborted, or from a failed merge procedure
+    // whose merged region directory was created before the procedure aborted.
+    List<String> orphanRegions = findOrphanRegionOnHDFS(tableName);
+    orphanRegions.stream().forEach(
+      region -> problemRegions.put(region, MasterProtos.REGION_ERROR_TYPE.orphan_region_on_hdfs));
+
+    // Case 2. find unassigned daughter regions or merged regions
+    List<String> unassignedDaughterOrMergedRegions =
+        findUnassignedDaughterOrMergedRegions(tableName);
+    unassignedDaughterOrMergedRegions.stream().forEach(region -> problemRegions.put(region,
+      MasterProtos.REGION_ERROR_TYPE.daughter_merged_region_not_online));
+
+    // If any of these regions are currently handled by a live SplitTableRegionProcedure or
+    // MergeTableRegionsProcedure, remove them from the map: the procedure may still finish.
+    master.getProcedures().stream().filter(p -> !(p.isFinished() || p.isBypass()))
+        .filter(
+          p -> (p instanceof SplitTableRegionProcedure || p instanceof MergeTableRegionsProcedure))
+        .forEach(p -> {
+          if (p instanceof SplitTableRegionProcedure) {
+            problemRegions
+                .remove(((SplitTableRegionProcedure) p).getDaughterOneRI().getEncodedName());
+            problemRegions
+                .remove(((SplitTableRegionProcedure) p).getDaughterTwoRI().getEncodedName());
+          } else {
+            problemRegions
+                .remove(((MergeTableRegionsProcedure) p).getMergedRegion().getEncodedName());
+          }
+        });
+
+    // check if these regions are still problematic now
+    checkRegionStillProblematic(problemRegions, tableName);
+    return problemRegions;
+  }
+
+  private void checkRegionStillProblematic(
+      Map<String, MasterProtos.REGION_ERROR_TYPE> problemRegions, TableName tableName)
+      throws IOException {
+    MetaTableAccessor.scanMeta(master.getConnection(), tableName,
+      MetaTableAccessor.QueryType.REGION, Integer.MAX_VALUE, r -> {
+        RegionInfo regionInfo = MetaTableAccessor.getRegionInfo(r);
+        if (problemRegions.containsKey(regionInfo.getEncodedName())) {
+          MasterProtos.REGION_ERROR_TYPE errorType =
+              problemRegions.get(regionInfo.getEncodedName());
+          switch (errorType) {
+            case orphan_region_on_hdfs:
+              // a region has been built for this directory, so it is not an orphan any more
+              problemRegions.remove(regionInfo.getEncodedName());
+              break;
+            case daughter_merged_region_not_online:
+              RegionState.State state = RegionStateStore.getRegionState(r, 0);
+              if (!state.matches(RegionState.State.CLOSED, RegionState.State.SPLITTING_NEW,
+                RegionState.State.MERGED)) {
+                problemRegions.remove(regionInfo.getEncodedName());
+              }
+              break;
+            default:
+              throw new IOException("there should be no problematic region of this type");
+          }
+        }
+        return true;
+      });
+  }
+
+  private List<String> findUnassignedDaughterOrMergedRegions(TableName tableName)
+      throws IOException {
+    Set<RegionInfo> mergeSet = new HashSet<>();
+    Map<RegionInfo, PairOfSameType<RegionInfo>> splitMap = new HashMap<>();
+    Map<String, RegionState.State> regionStates = new HashMap<>();
+
+    MetaTableAccessor.scanMeta(master.getConnection(), tableName,
+      MetaTableAccessor.QueryType.REGION, Integer.MAX_VALUE, r -> {
+        RegionInfo regionInfo = MetaTableAccessor.getRegionInfo(r);
+        RegionState.State state = RegionStateStore.getRegionState(r, 0);
+        regionStates.put(regionInfo.getEncodedName(), state);
+        if (regionInfo.isSplit()) {
+          if (regionInfo.isSplitParent()) {
+            splitMap.put(regionInfo, MetaTableAccessor.getDaughterRegions(r));
+          }
+        } else if (r.getValue(HConstants.CATALOG_FAMILY, HConstants.MERGEA_QUALIFIER) != null
+            || r.getValue(HConstants.CATALOG_FAMILY, HConstants.MERGEB_QUALIFIER) != null) {
+          mergeSet.add(regionInfo);
+        }
+        return true;
+      });
+
+    // find unassigned daughter regions under each split parent, accumulating across parents
+    ArrayList<String> problematicRegions = new ArrayList<>();
+    for (Map.Entry<RegionInfo, PairOfSameType<RegionInfo>> entry : splitMap.entrySet()) {
+      findUnassignedDaughterRegions(entry.getKey(), splitMap, regionStates, problematicRegions);
+    }
+
+    // find unassigned merged regions
+    problematicRegions.addAll(mergeSet.stream()
+        .filter(regionInfo ->
+          !regionStates.get(regionInfo.getEncodedName()).matches(RegionState.State.OPEN))
+        .map(regionInfo -> regionInfo.getEncodedName()).collect(Collectors.toList()));
+    return problematicRegions;
+  }
+
+  private List<String> findUnassignedDaughterRegions(RegionInfo regionInfo,
+      Map<RegionInfo, PairOfSameType<RegionInfo>> splitMap,
+      Map<String, RegionState.State> regionStates, ArrayList<String> problemRegions) {
+    if (!regionInfo.isSplit()) {
+      if (!regionStates.get(regionInfo.getEncodedName()).matches(RegionState.State.OPEN)) {
+        problemRegions.add(regionInfo.getEncodedName());
+      }
+      return problemRegions;
+    }
+    findUnassignedDaughterRegions(splitMap.get(regionInfo).getFirst(), splitMap, regionStates,
+      problemRegions);
+    findUnassignedDaughterRegions(splitMap.get(regionInfo).getSecond(), splitMap, regionStates,
+      problemRegions);
+    return problemRegions;
+  }
+
+  private List<String> findOrphanRegionOnHDFS(TableName tableName) throws IOException {
+    List<RegionInfo> regionInfos =
+        MetaTableAccessor.getTableRegions(master.getConnection(), tableName);
+    // get region directory names from hdfs
+    Path tableDir = FSUtils.getTableDir(master.getMasterFileSystem().getRootDir(), tableName);
+    FileStatus[] regions =
+        master.getFileSystem().listStatus(tableDir, path -> !path.getName().startsWith("."));
+    List<String> regionNames = Arrays.stream(regions).map(region -> region.getPath().getName())
+        .collect(Collectors.toList());
+    HashSet<String> regionsInMeta = new HashSet<>();
+    regionInfos.stream().forEach(regionInfo -> regionsInMeta.add(regionInfo.getEncodedName()));
+    // whatever is on HDFS but not in meta is an orphan
+    Iterator<String> regionIterator = regionNames.iterator();
+    while (regionIterator.hasNext()) {
+      String region = regionIterator.next();
+      if (regionsInMeta.contains(region)) {
+        regionIterator.remove();
+      }
+    }
+    return regionNames;
+  }
+
   @Override
   public SwitchRpcThrottleResponse switchRpcThrottle(RpcController controller,
       SwitchRpcThrottleRequest request) throws ServiceException {
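The meta re-check above reduces to a single rule: a daughter or merged region counts as stuck only while its state is CLOSED, SPLITTING_NEW, or MERGED. Distilled into a standalone predicate for clarity (illustrative only; the class and method names are not part of the patch):

    import java.util.EnumSet;
    import org.apache.hadoop.hbase.master.RegionState;

    final class StuckRegionRule {
      // States a daughter/merged region can be left in when a split/merge fails after
      // the meta update; any other state means the region recovered on its own.
      private static final EnumSet<RegionState.State> STUCK = EnumSet.of(
          RegionState.State.CLOSED, RegionState.State.SPLITTING_NEW, RegionState.State.MERGED);

      static boolean isStuck(RegionState.State state) {
        return STUCK.contains(state);
      }
    }

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
index 8e0dcd3fde..920dd3ed33 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
@@ -93,8 +93,8 @@ public class SplitTableRegionProcedure
     extends AbstractStateMachineRegionProcedure<SplitTableRegionState> {
   private static final Logger LOG = LoggerFactory.getLogger(SplitTableRegionProcedure.class);
   private Boolean traceEnabled = null;
-  private RegionInfo daughter_1_RI;
-  private RegionInfo daughter_2_RI;
+  private RegionInfo daughterOneRI;
+  private RegionInfo daughterTwoRI;
   private byte[] bestSplitRow;
   private RegionSplitPolicy splitPolicy;
 
@@ -113,13 +113,13 @@ public class SplitTableRegionProcedure
     checkSplittable(env, regionToSplit, bestSplitRow);
     final TableName table = regionToSplit.getTable();
     final long rid = getDaughterRegionIdTimestamp(regionToSplit);
-    this.daughter_1_RI = RegionInfoBuilder.newBuilder(table)
+    this.daughterOneRI = RegionInfoBuilder.newBuilder(table)
         .setStartKey(regionToSplit.getStartKey())
         .setEndKey(bestSplitRow)
         .setSplit(false)
         .setRegionId(rid)
         .build();
-    this.daughter_2_RI = RegionInfoBuilder.newBuilder(table)
+    this.daughterTwoRI = RegionInfoBuilder.newBuilder(table)
         .setStartKey(bestSplitRow)
         .setEndKey(regionToSplit.getEndKey())
         .setSplit(false)
@@ -137,6 +137,16 @@ public class SplitTableRegionProcedure
     }
   }
 
+  @VisibleForTesting
+  public RegionInfo getDaughterOneRI() {
+    return daughterOneRI;
+  }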
+
+  @VisibleForTesting
+  public RegionInfo getDaughterTwoRI() {
+    return daughterTwoRI;
+  }
+
   /**
    * Check whether the region is splittable
    * @param env MasterProcedureEnv
@@ -396,8 +406,8 @@ public class SplitTableRegionProcedure
         MasterProcedureProtos.SplitTableRegionStateData.newBuilder()
             .setUserInfo(MasterProcedureUtil.toProtoUserInfo(getUser()))
             .setParentRegionInfo(ProtobufUtil.toRegionInfo(getRegion()))
-            .addChildRegionInfo(ProtobufUtil.toRegionInfo(daughter_1_RI))
-            .addChildRegionInfo(ProtobufUtil.toRegionInfo(daughter_2_RI));
+            .addChildRegionInfo(ProtobufUtil.toRegionInfo(daughterOneRI))
+            .addChildRegionInfo(ProtobufUtil.toRegionInfo(daughterTwoRI));
     serializer.serialize(splitTableRegionMsg.build());
   }
 
@@ -411,8 +421,8 @@ public class SplitTableRegionProcedure
     setUser(MasterProcedureUtil.toUserInfo(splitTableRegionsMsg.getUserInfo()));
     setRegion(ProtobufUtil.toRegionInfo(splitTableRegionsMsg.getParentRegionInfo()));
     assert(splitTableRegionsMsg.getChildRegionInfoCount() == 2);
-    daughter_1_RI = ProtobufUtil.toRegionInfo(splitTableRegionsMsg.getChildRegionInfo(0));
-    daughter_2_RI = ProtobufUtil.toRegionInfo(splitTableRegionsMsg.getChildRegionInfo(1));
+    daughterOneRI = ProtobufUtil.toRegionInfo(splitTableRegionsMsg.getChildRegionInfo(0));
+    daughterTwoRI = ProtobufUtil.toRegionInfo(splitTableRegionsMsg.getChildRegionInfo(1));
   }
 
@@ -423,9 +433,9 @@ public class SplitTableRegionProcedure
     sb.append(", parent=");
     sb.append(getParentRegion().getShortNameToLog());
     sb.append(", daughterA=");
-    sb.append(daughter_1_RI.getShortNameToLog());
+    sb.append(daughterOneRI.getShortNameToLog());
     sb.append(", daughterB=");
-    sb.append(daughter_2_RI.getShortNameToLog());
+    sb.append(daughterTwoRI.getShortNameToLog());
   }
 
   private RegionInfo getParentRegion() {
@@ -443,7 +453,7 @@ public class SplitTableRegionProcedure
   }
 
   private byte[] getSplitRow() {
-    return daughter_2_RI.getStartKey();
+    return daughterTwoRI.getStartKey();
   }
 
   private static final State[] EXPECTED_SPLIT_STATES = new State[] { State.OPEN, State.CLOSED };
@@ -575,17 +585,17 @@ public class SplitTableRegionProcedure
     Pair<Integer, Integer> expectedReferences = splitStoreFiles(env, regionFs);
 
     assertReferenceFileCount(fs, expectedReferences.getFirst(),
-      regionFs.getSplitsDir(daughter_1_RI));
+      regionFs.getSplitsDir(daughterOneRI));
     //Move the files from the temporary .splits to the final /table/region directory
-    regionFs.commitDaughterRegion(daughter_1_RI);
+    regionFs.commitDaughterRegion(daughterOneRI);
     assertReferenceFileCount(fs, expectedReferences.getFirst(),
-      new Path(tabledir, daughter_1_RI.getEncodedName()));
+      new Path(tabledir, daughterOneRI.getEncodedName()));
 
     assertReferenceFileCount(fs, expectedReferences.getSecond(),
-      regionFs.getSplitsDir(daughter_2_RI));
-    regionFs.commitDaughterRegion(daughter_2_RI);
+      regionFs.getSplitsDir(daughterTwoRI));
+    regionFs.commitDaughterRegion(daughterTwoRI);
     assertReferenceFileCount(fs, expectedReferences.getSecond(),
-      new Path(tabledir, daughter_2_RI.getEncodedName()));
+      new Path(tabledir, daughterTwoRI.getEncodedName()));
   }
 
   /**
@@ -725,9 +735,9 @@ public class SplitTableRegionProcedure
     final byte[] splitRow = getSplitRow();
     final String familyName = Bytes.toString(family);
-    final Path path_first = regionFs.splitStoreFile(this.daughter_1_RI, familyName, sf, splitRow,
+    final Path path_first = regionFs.splitStoreFile(this.daughterOneRI, familyName, sf, splitRow,
       false, splitPolicy);
-    final Path
 path_second = regionFs.splitStoreFile(this.daughter_2_RI, familyName, sf, splitRow,
+    final Path path_second = regionFs.splitStoreFile(this.daughterTwoRI, familyName, sf, splitRow,
       true, splitPolicy);
     if (LOG.isDebugEnabled()) {
       LOG.debug("pid=" + getProcId() + " splitting complete for store file: " +
@@ -792,7 +802,7 @@ public class SplitTableRegionProcedure
    */
   private void updateMeta(final MasterProcedureEnv env) throws IOException {
     env.getAssignmentManager().markRegionAsSplit(getParentRegion(), getParentRegionServerName(env),
-      daughter_1_RI, daughter_2_RI);
+      daughterOneRI, daughterTwoRI);
   }
 
   /**
@@ -814,7 +824,7 @@ public class SplitTableRegionProcedure
   private void postSplitRegion(final MasterProcedureEnv env) throws IOException {
     final MasterCoprocessorHost cpHost = env.getMasterCoprocessorHost();
     if (cpHost != null) {
-      cpHost.postCompletedSplitRegionAction(daughter_1_RI, daughter_2_RI, getUser());
+      cpHost.postCompletedSplitRegionAction(daughterOneRI, daughterTwoRI, getUser());
     }
   }
 
@@ -832,8 +842,8 @@ public class SplitTableRegionProcedure
   private TransitRegionStateProcedure[] createAssignProcedures(MasterProcedureEnv env)
       throws IOException {
     List<RegionInfo> hris = new ArrayList<RegionInfo>(2);
-    hris.add(daughter_1_RI);
-    hris.add(daughter_2_RI);
+    hris.add(daughterOneRI);
+    hris.add(daughterTwoRI);
     return AssignmentManagerUtil.createAssignProceduresForOpeningNewRegions(env, hris,
       getRegionReplication(env), getParentRegionServerName(env));
   }
@@ -848,9 +858,9 @@ public class SplitTableRegionProcedure
     long maxSequenceId =
       WALSplitter.getMaxRegionSequenceId(walFS, getWALRegionDir(env, getParentRegion()));
     if (maxSequenceId > 0) {
-      WALSplitter.writeRegionSequenceIdFile(walFS, getWALRegionDir(env, daughter_1_RI),
+      WALSplitter.writeRegionSequenceIdFile(walFS, getWALRegionDir(env, daughterOneRI),
         maxSequenceId);
-      WALSplitter.writeRegionSequenceIdFile(walFS, getWALRegionDir(env, daughter_2_RI),
+      WALSplitter.writeRegionSequenceIdFile(walFS, getWALRegionDir(env, daughterTwoRI),
         maxSequenceId);
     }
   }

diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java
index 8318757cef..4359e51ba5 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java
@@ -18,17 +18,33 @@ package org.apache.hadoop.hbase.client;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
+
+import org.apache.hadoop.hbase.Coprocessor;
+import org.apache.hadoop.hbase.CoprocessorEnvironment;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor;
+import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
+import org.apache.hadoop.hbase.coprocessor.MasterObserver;
+import org.apache.hadoop.hbase.coprocessor.ObserverContext;
+import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure;
+import org.apache.hadoop.hbase.master.assignment.SplitTableRegionProcedure;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface;
 import org.apache.hadoop.hbase.procedure2.Procedure;
@@ -40,6 +56,7 @@ import org.apache.hadoop.hbase.testclassification.ClientTests;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.junit.AfterClass;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.ClassRule;
@@ -57,6 +74,7 @@ import org.slf4j.LoggerFactory;
 import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
 
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
 
 /**
  * Class to test HBaseHbck. Spins up the minicluster once at test start and then takes it down
@@ -102,6 +120,12 @@ public class TestHbck {
     TEST_UTIL.createMultiRegionTable(TABLE_NAME, Bytes.toBytes("family1"), 5);
     procExec = TEST_UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
     ASYNC_CONN = ConnectionFactory.createAsyncConnection(TEST_UTIL.getConfiguration()).get();
+    TEST_UTIL.getHBaseCluster().getMaster().getMasterCoprocessorHost().load(
+      FailingMergeAfterMetaUpdatedMasterObserver.class, Coprocessor.PRIORITY_USER,
+      TEST_UTIL.getHBaseCluster().getMaster().getConfiguration());
+    TEST_UTIL.getHBaseCluster().getMaster().getMasterCoprocessorHost().load(
+      FailingSplitAfterMetaUpdatedMasterObserver.class, Coprocessor.PRIORITY_USER,
+      TEST_UTIL.getHBaseCluster().getMaster().getConfiguration());
   }
 
   @AfterClass
@@ -204,25 +228,175 @@ public class TestHbck {
     }
   }
 
+  @Test
+  public void testRecoverMergeAfterMetaUpdated() throws Exception {
+    String testTable = async ? "mergeTestAsync" : "mergeTestSync";
+    TEST_UTIL.createMultiRegionTable(TableName.valueOf(testTable), Bytes.toBytes("family1"), 5);
+    TEST_UTIL.loadTable(TEST_UTIL.getConnection().getTable(TableName.valueOf(testTable)),
+      Bytes.toBytes("family1"), true);
+    HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
+    Hbck hbck = getHbck();
+    FailingMergeAfterMetaUpdatedMasterObserver observer = master.getMasterCoprocessorHost()
+        .findCoprocessor(FailingMergeAfterMetaUpdatedMasterObserver.class);
+    assertNotNull(observer);
+    try (Admin admin = TEST_UTIL.getConnection().getAdmin()) {
+      List<RegionInfo> regions = admin.getRegions(TableName.valueOf(testTable));
+      admin.mergeRegionsAsync(regions.get(0).getRegionName(), regions.get(1).getRegionName(),
+        true);
+      observer.latch.await(5000, TimeUnit.MILLISECONDS);
+      Map<String, MasterProtos.REGION_ERROR_TYPE> result =
+          hbck.getFailedSplitMergeLegacyRegions(Arrays.asList(TableName.valueOf(testTable)));
+      // Since a merge procedure is still working on the region, this check returns 2 orphan
+      // regions on HDFS but no daughter/merged region that is not online.
+      Assert.assertTrue(result.values().stream()
+          .noneMatch(state -> state
+              .getNumber() == MasterProtos.REGION_ERROR_TYPE.daughter_merged_region_not_online
+                  .getNumber()));
+      Optional<Procedure<?>> procedure = TEST_UTIL.getHBaseCluster().getMaster().getProcedures()
+          .stream().filter(p -> p instanceof MergeTableRegionsProcedure).findAny();
+      Assert.assertTrue(procedure.isPresent());
+      hbck.bypassProcedure(Arrays.asList(procedure.get().getProcId()), 5, true, false);
+      result = hbck.getFailedSplitMergeLegacyRegions(Arrays.asList(TableName.valueOf(testTable)));
+      Assert.assertTrue(result.size() == 3);
+      hbck.assigns(Arrays.asList(result.keySet().toArray(new String[0])));
+      ProcedureTestingUtility.waitNoProcedureRunning(master.getMasterProcedureExecutor());
+      // now the state should be fixed
+      result = hbck.getFailedSplitMergeLegacyRegions(Arrays.asList(TableName.valueOf(testTable)));
+      Assert.assertTrue(result.values().stream()
+          .noneMatch(state -> state
+              .getNumber() == MasterProtos.REGION_ERROR_TYPE.daughter_merged_region_not_online
+                  .getNumber()));
+    } catch (InterruptedException ie) {
+      throw new IOException(ie);
+    } finally {
+      observer.resetLatch();
+    }
+  }
+
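Both this test and the split test that follows apply the same repair recipe once the observer has forced the procedure to stick. Condensed for reference (stuckPid and table stand in for the values the tests look up):

    // 1. Wait until the procedure has updated meta and then failed.
    observer.latch.await(5000, TimeUnit.MILLISECONDS);
    // 2. Give up on the stuck procedure so it stops holding the regions.
    hbck.bypassProcedure(Arrays.asList(stuckPid), 5, true, false);
    // 3. Ask the master which regions were left behind...
    Map<String, MasterProtos.REGION_ERROR_TYPE> leftover =
        hbck.getFailedSplitMergeLegacyRegions(Arrays.asList(table));
    // 4. ...and assign the ones that are in meta but not online.
    hbck.assigns(new ArrayList<>(leftover.keySet()));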
+  @Test
+  public void testRecoverSplitAfterMetaUpdated() throws Exception {
+    String testTable = async ? "splitTestAsync" : "splitTestSync";
+    TEST_UTIL.createMultiRegionTable(TableName.valueOf(testTable), Bytes.toBytes("family1"), 5);
+    TEST_UTIL.loadTable(TEST_UTIL.getConnection().getTable(TableName.valueOf(testTable)),
+      Bytes.toBytes("family1"), true);
+    HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
+    Hbck hbck = getHbck();
+    FailingSplitAfterMetaUpdatedMasterObserver observer = master.getMasterCoprocessorHost()
+        .findCoprocessor(FailingSplitAfterMetaUpdatedMasterObserver.class);
+    assertNotNull(observer);
+    try (Admin admin = TEST_UTIL.getConnection().getAdmin()) {
+      Random random = new Random();
+      byte[] splitKey = new byte[16];
+      random.nextBytes(splitKey);
+      admin.split(TableName.valueOf(testTable), splitKey);
+      observer.latch.await(5000, TimeUnit.MILLISECONDS);
+      Map<String, MasterProtos.REGION_ERROR_TYPE> result =
+          hbck.getFailedSplitMergeLegacyRegions(Arrays.asList(TableName.valueOf(testTable)));
+      // Since a split procedure is still working on the region, this check should report no
+      // daughter/merged region that is not online.
+      Assert.assertTrue(result.values().stream()
+          .noneMatch(state -> state
+              .getNumber() == MasterProtos.REGION_ERROR_TYPE.daughter_merged_region_not_online
+                  .getNumber()));
+
+      Optional<Procedure<?>> procedure = TEST_UTIL.getHBaseCluster().getMaster().getProcedures()
+          .stream().filter(p -> p instanceof SplitTableRegionProcedure).findAny();
+      Assert.assertTrue(procedure.isPresent());
+      hbck.bypassProcedure(Arrays.asList(procedure.get().getProcId()), 5, true, false);
+      result = hbck.getFailedSplitMergeLegacyRegions(Arrays.asList(TableName.valueOf(testTable)));
+      Assert.assertTrue(result.values().stream()
+          .filter(state -> state
+              .getNumber() == MasterProtos.REGION_ERROR_TYPE.daughter_merged_region_not_online
+                  .getNumber())
+          .count() == 2);
+      hbck.assigns(Arrays.asList(result.keySet().toArray(new String[0])));
+      ProcedureTestingUtility.waitNoProcedureRunning(master.getMasterProcedureExecutor());
+      // now the state should be fixed
+      result = hbck.getFailedSplitMergeLegacyRegions(Arrays.asList(TableName.valueOf(testTable)));
+      Assert.assertTrue(result.values().stream()
+          .noneMatch(state -> state
+              .getNumber() == MasterProtos.REGION_ERROR_TYPE.daughter_merged_region_not_online
+                  .getNumber()));
+    } catch (InterruptedException ie) {
+      throw new IOException(ie);
+    } finally {
+      observer.resetLatch();
+    }
+  }
+
   @Test
   public void testScheduleSCP() throws Exception {
     HRegionServer testRs = TEST_UTIL.getRSForFirstRegionInTable(TABLE_NAME);
     TEST_UTIL.loadTable(TEST_UTIL.getConnection().getTable(TABLE_NAME), Bytes.toBytes("family1"),
-        true);
+      true);
     ServerName serverName = testRs.getServerName();
     Hbck hbck = getHbck();
     List<Long> pids =
-        hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName)));
+      hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName)));
     assertTrue(pids.get(0) > 0);
     LOG.info("pid is {}", pids.get(0));
     List<Long> newPids =
-        hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName)));
+      hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName)));
     assertTrue(newPids.get(0) < 0);
     LOG.info("pid is {}", newPids.get(0));
     waitOnPids(pids);
   }
 
+  public static class FailingSplitAfterMetaUpdatedMasterObserver
+      implements MasterCoprocessor, MasterObserver {
+    public volatile CountDownLatch latch;
+
+    @Override
+    public void start(CoprocessorEnvironment e) throws IOException {
+      resetLatch();
+    }
+
+    @Override
+    public Optional<MasterObserver> getMasterObserver() {
+      return Optional.of(this);
+    }
+
+    @Override
+    public void preSplitRegionAfterMETAAction(ObserverContext<MasterCoprocessorEnvironment> ctx)
+        throws IOException {
+      LOG.info("Failing the split after the META update");
+      latch.countDown();
+      throw new IOException("this procedure will fail here forever");
+    }
+
+    public void resetLatch() {
+      this.latch = new CountDownLatch(1);
+    }
+  }
+
+  public static class FailingMergeAfterMetaUpdatedMasterObserver
+      implements MasterCoprocessor, MasterObserver {
+    public volatile CountDownLatch latch;
+
+    @Override
+    public void start(CoprocessorEnvironment e) throws IOException {
+      resetLatch();
+    }
+
+    @Override
+    public Optional<MasterObserver> getMasterObserver() {
+      return Optional.of(this);
+    }
+
+    public void resetLatch() {
+      this.latch = new CountDownLatch(1);
+    }
+
+    @Override
+    public void postMergeRegionsCommitAction(
+        final ObserverContext<MasterCoprocessorEnvironment> ctx, final RegionInfo[] regionsToMerge,
+        final RegionInfo mergedRegion) throws IOException {
+      latch.countDown();
+      throw new IOException("this procedure will fail here forever");
+    }
+  }
+
   private void waitOnPids(List<Long> pids) {
     TEST_UTIL.waitFor(60000, () -> pids.stream().allMatch(procExec::isFinished));
   }

diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
index b91bf11dae..e09cf75f4c 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
@@ -275,7 +275,7 @@ public class TestSplitTransactionOnCluster {
   }
 
   public static class FailingSplitMasterObserver implements MasterCoprocessor, MasterObserver {
-    volatile CountDownLatch latch;
+    public volatile CountDownLatch latch;
 
     @Override
     public void start(CoprocessorEnvironment e) throws IOException {
-- 
2.17.1