From 0e8d96745f7d9f916e28f6d2c5874d14b0f79a00 Mon Sep 17 00:00:00 2001
From: Jingyun Tian
Date: Mon, 18 Mar 2019 20:23:34 +0800
Subject: [PATCH] HBASE-21965 Fix failed split and merge transactions that
 have failed to roll back

---
 .../apache/hadoop/hbase/client/HBaseHbck.java |  16 ++
 .../org/apache/hadoop/hbase/client/Hbck.java  |   6 +
 .../shaded/protobuf/RequestConverter.java     |   9 +
 .../src/main/protobuf/Master.proto            |  17 ++
 .../hbase/master/MasterRpcServices.java       | 174 ++++++++++++++++++
 .../assignment/SplitTableRegionProcedure.java |  32 +++-
 .../apache/hadoop/hbase/client/TestHbck.java  | 172 ++++++++++++++++-
 .../TestSplitTransactionOnCluster.java        |   2 +-
 8 files changed, 423 insertions(+), 5 deletions(-)

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
index a276017b0c..e6ea33faa8 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
@@ -19,10 +19,12 @@ package org.apache.hadoop.hbase.client;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.Callable;
 import java.util.stream.Collectors;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
@@ -172,4 +174,18 @@ public class HBaseHbck implements Hbck {
       throw new IOException(se);
     }
   }
+
+  @Override
+  public Map<String, MasterProtos.REGION_ERROR_TYPE>
+      checkSplitMergeConsistency(List<TableName> tableNames) throws IOException {
+    try {
+      MasterProtos.checkSplitMergeConsistencyResponse response =
+          this.hbck.checkSplitMergeConsistency(rpcControllerFactory.newController(),
+            RequestConverter.toCheckSplitMergeConsistencyRequest(tableNames));
+      return response.getErrorsMap();
+    } catch (ServiceException se) {
+      LOG.debug("check split/merge consistency failed", se);
+      throw new IOException(se);
+    }
+  }
 }
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
index e88805cdcc..390984866d 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
@@ -20,12 +20,15 @@ package org.apache.hadoop.hbase.client;
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.hadoop.hbase.Abortable;
 import org.apache.hadoop.hbase.HBaseInterfaceAudience;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.yetus.audience.InterfaceAudience;
 
 import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
 
 /**
  * Hbck fixup tool APIs. Obtain an instance from {@link ClusterConnection#getHbck()} and call
@@ -106,4 +109,7 @@ public interface Hbck extends Abortable, Closeable {
   List<Long> scheduleServerCrashProcedure(List<HBaseProtos.ServerName> serverNames)
       throws IOException;
+
+  Map<String, MasterProtos.REGION_ERROR_TYPE> checkSplitMergeConsistency(
+      List<TableName> tableNames) throws IOException;
 }
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java
index 36c8fab23a..c3343ecee6 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java
@@ -1913,6 +1913,15 @@ public final class RequestConverter {
     return b.addAllServerName(serverNames).build();
   }
 
+  public static MasterProtos.checkSplitMergeConsistencyRequest
+      toCheckSplitMergeConsistencyRequest(List<TableName> tableNames) {
+    MasterProtos.checkSplitMergeConsistencyRequest.Builder b =
+        MasterProtos.checkSplitMergeConsistencyRequest.newBuilder();
+    List<HBaseProtos.TableName> protoTableNames = tableNames.stream()
+        .map(tableName -> ProtobufUtil.toProtoTableName(tableName)).collect(Collectors.toList());
+    return b.addAllTable(protoTableNames).build();
+  }
+
   private static List<RegionSpecifier> toEncodedRegionNameRegionSpecifiers(
       List<String> encodedRegionNames) {
     return encodedRegionNames.stream().
diff --git a/hbase-protocol-shaded/src/main/protobuf/Master.proto b/hbase-protocol-shaded/src/main/protobuf/Master.proto
index 4ed0ad5c0e..393d30145d 100644
--- a/hbase-protocol-shaded/src/main/protobuf/Master.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/Master.proto
@@ -1093,6 +1093,20 @@ message ScheduleServerCrashProcedureResponse {
   repeated uint64 pid = 1;
 }
 
+message checkSplitMergeConsistencyRequest {
+  repeated TableName table = 1;
+}
+
+enum REGION_ERROR_TYPE {
+  merged_region_not_online = 0;
+  daughter_region_not_online = 1;
+  orphan_region_on_hdfs = 2;
+}
+
+message checkSplitMergeConsistencyResponse {
+  map<string, REGION_ERROR_TYPE> errors = 1;
+}
+
 service HbckService {
   /** Update state of the table in meta only*/
   rpc SetTableStateInMeta(SetTableStateInMetaRequest)
@@ -1123,4 +1137,7 @@ service HbckService {
   /** Schedule a ServerCrashProcedure to help recover a crash server */
   rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
     returns(ScheduleServerCrashProcedureResponse);
+
+  rpc checkSplitMergeConsistency(checkSplitMergeConsistencyRequest)
+    returns(checkSplitMergeConsistencyResponse);
 }
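Reviewer's sketch, not part of the patch: one way a client might call the new API once this
lands. The configuration setup, the cast, and the table name below are assumptions for
illustration; only checkSplitMergeConsistency itself comes from this change.

    import java.util.Arrays;
    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.TableName;
    import org.apache.hadoop.hbase.client.ClusterConnection;
    import org.apache.hadoop.hbase.client.ConnectionFactory;
    import org.apache.hadoop.hbase.client.Hbck;
    import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;

    public class CheckSplitMergeConsistencyExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // getHbck() is exposed on ClusterConnection, per the Hbck javadoc above.
        try (ClusterConnection conn =
               (ClusterConnection) ConnectionFactory.createConnection(conf);
            Hbck hbck = conn.getHbck()) {
          Map<String, MasterProtos.REGION_ERROR_TYPE> errors =
              hbck.checkSplitMergeConsistency(Arrays.asList(TableName.valueOf("t1")));
          // Each entry maps an encoded region name to the reason it is inconsistent.
          errors.forEach((region, type) -> System.out.println(region + " => " + type));
        }
      }
    }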
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
index 063a353237..76d0e9540a 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
@@ -25,14 +25,17 @@ import java.net.BindException;
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.stream.Collectors;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.ClusterMetricsBuilder;
 import org.apache.hadoop.hbase.DoNotRetryIOException;
@@ -66,8 +69,12 @@ import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
 import org.apache.hadoop.hbase.ipc.RpcServerFactory;
 import org.apache.hadoop.hbase.ipc.RpcServerInterface;
 import org.apache.hadoop.hbase.ipc.ServerRpcController;
+import org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure;
+import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
 import org.apache.hadoop.hbase.master.assignment.RegionStates;
+import org.apache.hadoop.hbase.master.assignment.SplitTableRegionProcedure;
 import org.apache.hadoop.hbase.master.locking.LockProcedure;
+import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil.NonceProcedureRunnable;
@@ -103,8 +110,10 @@ import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.ForeignExceptionUtil;
 import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.PairOfSameType;
 import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
 import org.apache.yetus.audience.InterfaceAudience;
@@ -2491,6 +2500,171 @@ public class MasterRpcServices extends RSRpcServices
     }
   }
 
+  @Override
+  public MasterProtos.checkSplitMergeConsistencyResponse checkSplitMergeConsistency(
+      RpcController controller, MasterProtos.checkSplitMergeConsistencyRequest request)
+      throws ServiceException {
+    List<HBaseProtos.TableName> tables = request.getTableList();
+
+    Map<String, MasterProtos.REGION_ERROR_TYPE> errorRegions = new HashMap<>();
+    try {
+      for (HBaseProtos.TableName tableName : tables) {
+        errorRegions.putAll(checkRegionConsistency(ProtobufUtil.toTableName(tableName)));
+      }
+    } catch (IOException e) {
+      throw new ServiceException(e);
+    }
+    return MasterProtos.checkSplitMergeConsistencyResponse.newBuilder().putAllErrors(errorRegions)
+        .build();
+  }
+
+  private Map<String, MasterProtos.REGION_ERROR_TYPE> checkRegionConsistency(TableName tableName)
+      throws IOException {
+    if (!MetaTableAccessor.tableExists(master.getConnection(), tableName)) {
+      throw new IOException("table " + tableName.getNameAsString() + " does not exist");
+    }
+    if (!MetaTableAccessor.getTableState(master.getConnection(), tableName).isEnabled()) {
+      throw new IOException(
+          "table " + tableName.getNameAsString() + " is not enabled yet, please check assignments");
+    }
+    final Map<String, MasterProtos.REGION_ERROR_TYPE> problemRegions = new HashMap<>();
+
+    // Case 1. find orphan regions on HDFS
+    // Orphan regions may be left by a failed split procedure, where daughter region directories
+    // are created and the procedure is then aborted, or by a failed merge, where the merged
+    // region directory is created and the procedure is then aborted.
+    List<String> orphanRegions = findOrphanRegionOnHDFS(tableName);
+    orphanRegions.stream().forEach(
+      region -> problemRegions.put(region, MasterProtos.REGION_ERROR_TYPE.orphan_region_on_hdfs));
+
+    // Case 2. find unassigned daughter regions or merged regions
+    findUnassignedDaughterOrMergedRegions(tableName, problemRegions);
+
+    // Regions that still have a live split/merge procedure attached are not problematic; the
+    // procedure will finish or roll back the work, so drop them from the report.
+    master.getProcedures().stream().filter(p -> !(p.isFinished() || p.isBypass()))
+        .filter(
+          p -> (p instanceof SplitTableRegionProcedure || p instanceof MergeTableRegionsProcedure))
+        .forEach(p -> {
+          if (p instanceof SplitTableRegionProcedure) {
+            problemRegions
+                .remove(((SplitTableRegionProcedure) p).getDaughter_1_RI().getEncodedName());
+            problemRegions
+                .remove(((SplitTableRegionProcedure) p).getDaughter_2_RI().getEncodedName());
+          } else {
+            problemRegions
+                .remove(((MergeTableRegionsProcedure) p).getMergedRegion().getEncodedName());
+          }
+        });
+
+    // check if the remaining regions are still problematic now
+    checkRegionStillProblematic(problemRegions, tableName);
+    return problemRegions;
+  }
+
+  private void checkRegionStillProblematic(
+      Map<String, MasterProtos.REGION_ERROR_TYPE> problemRegions, TableName tableName)
+      throws IOException {
+    MetaTableAccessor.scanMeta(master.getConnection(), tableName,
+      MetaTableAccessor.QueryType.REGION, Integer.MAX_VALUE, r -> {
+        RegionInfo regionInfo = MetaTableAccessor.getRegionInfo(r);
+        if (problemRegions.containsKey(regionInfo.getEncodedName())) {
+          MasterProtos.REGION_ERROR_TYPE errorType =
+              problemRegions.get(regionInfo.getEncodedName());
+          switch (errorType) {
+            case orphan_region_on_hdfs:
+              // a meta row now exists for this region directory, so it is no longer an orphan
+              problemRegions.remove(regionInfo.getEncodedName());
+              break;
+            case merged_region_not_online:
+            case daughter_region_not_online:
+              RegionState.State state = RegionStateStore.getRegionState(r, 0);
+              if (!state.matches(RegionState.State.CLOSED, RegionState.State.SPLITTING_NEW,
+                RegionState.State.MERGED)) {
+                problemRegions.remove(regionInfo.getEncodedName());
+              }
+              break;
+            default:
+              throw new IOException("there should be no problematic region of this type");
+          }
+        }
+        return true;
+      });
+  }
+
+  private void findUnassignedDaughterOrMergedRegions(TableName tableName,
+      Map<String, MasterProtos.REGION_ERROR_TYPE> problemRegions) throws IOException {
+    Set<RegionInfo> mergeSet = new HashSet<>();
+    Map<RegionInfo, PairOfSameType<RegionInfo>> splitMap = new HashMap<>();
+    Map<String, RegionState.State> regionStates = new HashMap<>();
+
+    MetaTableAccessor.scanMeta(master.getConnection(), tableName,
+      MetaTableAccessor.QueryType.REGION, Integer.MAX_VALUE, r -> {
+        RegionInfo regionInfo = MetaTableAccessor.getRegionInfo(r);
+        RegionState.State state = RegionStateStore.getRegionState(r, 0);
+        regionStates.put(regionInfo.getEncodedName(), state);
+        if (regionInfo.isSplit()) {
+          if (regionInfo.isSplitParent()) {
+            splitMap.put(regionInfo, MetaTableAccessor.getDaughterRegions(r));
+          }
+        } else if (r.getValue(HConstants.CATALOG_FAMILY, HConstants.MERGEA_QUALIFIER) != null
+            || r.getValue(HConstants.CATALOG_FAMILY, HConstants.MERGEB_QUALIFIER) != null) {
+          mergeSet.add(regionInfo);
+        }
+        return true;
+      });
+
+    // find unassigned daughter regions, accumulating results across all split parents
+    List<String> unassignedDaughterRegions = new ArrayList<>();
+    for (Map.Entry<RegionInfo, PairOfSameType<RegionInfo>> entry : splitMap.entrySet()) {
+      unassignedDaughterRegions.addAll(
+        findUnassignedDaughterRegions(entry.getKey(), splitMap, regionStates, new ArrayList<>()));
+    }
+    unassignedDaughterRegions.stream().forEach(region -> problemRegions.put(region,
+      MasterProtos.REGION_ERROR_TYPE.daughter_region_not_online));
+
+    // find unassigned merged regions
+    mergeSet.stream().filter(
+      regionInfo -> !regionStates.get(regionInfo.getEncodedName()).matches(RegionState.State.OPEN))
+        .forEach(region -> problemRegions.put(region.getEncodedName(),
+          MasterProtos.REGION_ERROR_TYPE.merged_region_not_online));
+  }
+
+  private List<String> findUnassignedDaughterRegions(RegionInfo regionInfo,
+      Map<RegionInfo, PairOfSameType<RegionInfo>> splitMap,
+      Map<String, RegionState.State> regionStates, ArrayList<String> problemRegions) {
+    if (!regionInfo.isSplit()) {
+      if (!regionStates.get(regionInfo.getEncodedName()).matches(RegionState.State.OPEN)) {
+        problemRegions.add(regionInfo.getEncodedName());
+      }
+      return problemRegions;
+    }
+    // a daughter may itself have split again, so recurse until reaching leaf regions
+    findUnassignedDaughterRegions(splitMap.get(regionInfo).getFirst(), splitMap, regionStates,
+      problemRegions);
+    findUnassignedDaughterRegions(splitMap.get(regionInfo).getSecond(), splitMap, regionStates,
+      problemRegions);
+    return problemRegions;
+  }
+
+  private List<String> findOrphanRegionOnHDFS(TableName tableName) throws IOException {
+    List<RegionInfo> regionInfos =
+        MetaTableAccessor.getTableRegions(master.getConnection(), tableName);
+    // get region directory names from HDFS
+    Path tableDir = FSUtils.getTableDir(master.getMasterFileSystem().getRootDir(), tableName);
+    FileStatus[] regions =
+        master.getFileSystem().listStatus(tableDir, path -> !path.getName().startsWith("."));
+    List<String> regionNames = Arrays.stream(regions).map(region -> region.getPath().getName())
+        .collect(Collectors.toList());
+    HashSet<String> regionsInMeta = new HashSet<>();
+    regionInfos.stream().forEach(regionInfo -> regionsInMeta.add(regionInfo.getEncodedName()));
+    // whatever is on HDFS but has no row in meta is an orphan
+    Iterator<String> regionIterator = regionNames.iterator();
+    while (regionIterator.hasNext()) {
+      String region = regionIterator.next();
+      if (regionsInMeta.contains(region)) {
+        regionIterator.remove();
+      }
+    }
+    return regionNames;
+  }
+
   @Override
   public SwitchRpcThrottleResponse switchRpcThrottle(RpcController controller,
       SwitchRpcThrottleRequest request) throws ServiceException {
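Reviewer's sketch, not part of the patch: a compact, self-contained restatement of the
classification rules that checkRegionConsistency above implements, to make the three
REGION_ERROR_TYPE cases easy to review. The class, enum, and method names here are
illustrative stand-ins.

    // Mirrors the three REGION_ERROR_TYPE values added to Master.proto.
    public class SplitMergeConsistencyRules {
      enum ErrorType { MERGED_REGION_NOT_ONLINE, DAUGHTER_REGION_NOT_ONLINE, ORPHAN_REGION_ON_HDFS }

      /**
       * @param hasMetaRow            hbase:meta has a row for the region directory
       * @param unassignedDaughter    region came from a committed split but never reached OPEN,
       *                              and no live split procedure still references it
       * @param unassignedMergeResult region came from a committed merge but never reached OPEN,
       *                              and no live merge procedure still references it
       * @return the error to report, or null if the region is consistent
       */
      static ErrorType classify(boolean hasMetaRow, boolean unassignedDaughter,
          boolean unassignedMergeResult) {
        if (!hasMetaRow) {
          return ErrorType.ORPHAN_REGION_ON_HDFS;      // Case 1
        } else if (unassignedDaughter) {
          return ErrorType.DAUGHTER_REGION_NOT_ONLINE; // Case 2, split
        } else if (unassignedMergeResult) {
          return ErrorType.MERGED_REGION_NOT_ONLINE;   // Case 2, merge
        }
        return null;
      }
    }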
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
index 8e0dcd3fde..5a57f8aaab 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
@@ -103,7 +103,7 @@ public class SplitTableRegionProcedure
   }
 
   public SplitTableRegionProcedure(final MasterProcedureEnv env,
-    final RegionInfo regionToSplit, final byte[] splitRow) throws IOException {
+      final RegionInfo regionToSplit, final byte[] splitRow) throws IOException {
     super(env, regionToSplit);
     preflightChecks(env, true);
     // When procedure goes to run in its prepare step, it also does these checkOnline checks. Here
@@ -137,6 +137,36 @@ public class SplitTableRegionProcedure
     }
   }
 
+  @VisibleForTesting
+  public RegionInfo getDaughter_1_RI() {
+    return daughter_1_RI;
+  }
+
+  @VisibleForTesting
+  public RegionInfo getDaughter_2_RI() {
+    return daughter_2_RI;
+  }
+
+  @Override
+  protected LockState acquireLock(final MasterProcedureEnv env) {
+    // lock the daughter regions as well as the parent, so no concurrent procedure can touch
+    // them while the split is in flight
+    if (env.getProcedureScheduler().waitRegions(this, getTableName(),
+      getParentRegion(), daughter_1_RI, daughter_2_RI)) {
+      try {
+        LOG.debug(LockState.LOCK_EVENT_WAIT + " " + env.getProcedureScheduler().dumpLocks());
+      } catch (IOException e) {
+        // Ignore, just for logging
+      }
+      return LockState.LOCK_EVENT_WAIT;
+    }
+    return LockState.LOCK_ACQUIRED;
+  }
+
+  @Override
+  protected void releaseLock(final MasterProcedureEnv env) {
+    env.getProcedureScheduler().wakeRegions(this, getTableName(),
+      getParentRegion(), daughter_1_RI, daughter_2_RI);
+  }
+
   /**
    * Check whether the region is splittable
    * @param env MasterProcedureEnv
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java
index 8318757cef..4b106f2ba7 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestHbck.java
@@ -18,17 +18,33 @@ package org.apache.hadoop.hbase.client;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Random;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
+
+import org.apache.hadoop.hbase.Coprocessor;
+import org.apache.hadoop.hbase.CoprocessorEnvironment;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor;
+import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
+import org.apache.hadoop.hbase.coprocessor.MasterObserver;
+import org.apache.hadoop.hbase.coprocessor.ObserverContext;
+import org.apache.hadoop.hbase.master.HMaster;
 import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure;
+import org.apache.hadoop.hbase.master.assignment.SplitTableRegionProcedure;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface;
 import org.apache.hadoop.hbase.procedure2.Procedure;
@@ -36,10 +52,12 @@ import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
 import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.TestSplitTransactionOnCluster;
 import org.apache.hadoop.hbase.testclassification.ClientTests;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.junit.AfterClass;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.ClassRule;
@@ -57,6 +75,7 @@ import org.slf4j.LoggerFactory;
 import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
 
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
 
 /**
  * Class to test HBaseHbck. Spins up the minicluster once at test start and then takes it down
@@ -102,6 +121,12 @@ public class TestHbck {
     TEST_UTIL.createMultiRegionTable(TABLE_NAME, Bytes.toBytes("family1"), 5);
     procExec = TEST_UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
     ASYNC_CONN = ConnectionFactory.createAsyncConnection(TEST_UTIL.getConfiguration()).get();
+    TEST_UTIL.getHBaseCluster().getMaster().getMasterCoprocessorHost().load(
+      FailingMergeAfterMetaUpdatedMasterObserver.class, Coprocessor.PRIORITY_USER,
+      TEST_UTIL.getHBaseCluster().getMaster().getConfiguration());
+    TEST_UTIL.getHBaseCluster().getMaster().getMasterCoprocessorHost().load(
+      FailingSplitAfterMetaUpdatedMasterObserver.class, Coprocessor.PRIORITY_USER,
+      TEST_UTIL.getHBaseCluster().getMaster().getConfiguration());
   }
 
   @AfterClass
@@ -204,25 +229,166 @@ public class TestHbck {
     }
   }
 
+  @Test
+  public void testRecoverMergeAfterMetaUpdated() throws Exception {
+    String testTable = async ? "mergeTestAsync" : "mergeTestSync";
+    TEST_UTIL.createMultiRegionTable(TableName.valueOf(testTable), Bytes.toBytes("family1"), 5);
+    TEST_UTIL.loadTable(TEST_UTIL.getConnection().getTable(TableName.valueOf(testTable)),
+      Bytes.toBytes("family1"), true);
+    HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
+    Hbck hbck = getHbck();
+    FailingMergeAfterMetaUpdatedMasterObserver observer = master.getMasterCoprocessorHost()
+        .findCoprocessor(FailingMergeAfterMetaUpdatedMasterObserver.class);
+    assertNotNull(observer);
+    try (Admin admin = TEST_UTIL.getConnection().getAdmin()) {
+      List<RegionInfo> regions = admin.getRegions(TableName.valueOf(testTable));
+      admin.mergeRegionsAsync(regions.get(0).getRegionName(), regions.get(1).getRegionName(),
+        true);
+      observer.latch.await(5000, TimeUnit.MILLISECONDS);
+      Map<String, MasterProtos.REGION_ERROR_TYPE> result =
+          hbck.checkSplitMergeConsistency(Arrays.asList(TableName.valueOf(testTable)));
+      // A merge procedure is still attached to the region, so merged_region_not_online must be
+      // suppressed; the check reports only the two parent directories left on HDFS as orphans.
+      Assert.assertTrue(result.values().stream().noneMatch(state -> state
+          .getNumber() == MasterProtos.REGION_ERROR_TYPE.merged_region_not_online.getNumber()));
+      Optional<Procedure<?>> procedure = TEST_UTIL.getHBaseCluster().getMaster().getProcedures()
+          .stream().filter(p -> p instanceof MergeTableRegionsProcedure).findAny();
+      Assert.assertTrue(procedure.isPresent());
+      hbck.bypassProcedure(Arrays.asList(procedure.get().getProcId()), 5, true, false);
+      result = hbck.checkSplitMergeConsistency(Arrays.asList(TableName.valueOf(testTable)));
+      // after the bypass: two orphan directories plus the merged region that never came online
+      Assert.assertTrue(result.size() == 3);
+      hbck.assigns(Arrays.asList(result.keySet().toArray(new String[0])));
+      ProcedureTestingUtility.waitNoProcedureRunning(master.getMasterProcedureExecutor());
+      // now the state should be fixed
+      result = hbck.checkSplitMergeConsistency(Arrays.asList(TableName.valueOf(testTable)));
+      Assert.assertTrue(result.values().stream().noneMatch(state -> state
+          .getNumber() == MasterProtos.REGION_ERROR_TYPE.merged_region_not_online.getNumber()));
+    } catch (InterruptedException ie) {
+      throw new IOException(ie);
+    } finally {
+      observer.resetLatch();
+    }
+  }
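+
+  // The split test below follows the same recovery recipe as the merge test above:
+  // fail the procedure after META is updated, bypass it through Hbck, then re-run
+  // checkSplitMergeConsistency and assign whatever regions are still reported.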
"splitTestAsync" : "splitTestSync"; + TEST_UTIL.createMultiRegionTable(TableName.valueOf(testTable), Bytes.toBytes("family1"), 5); + TEST_UTIL.loadTable(TEST_UTIL.getConnection().getTable(TableName.valueOf(testTable)), + Bytes.toBytes("family1"), true); + HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); + Hbck hbck = getHbck(); + FailingSplitAfterMetaUpdatedMasterObserver observer = master.getMasterCoprocessorHost() + .findCoprocessor(FailingSplitAfterMetaUpdatedMasterObserver.class); + assertNotNull(observer); + try (Admin admin = TEST_UTIL.getConnection().getAdmin()) { + Random random = new Random(); + byte[] splitKey = new byte[16]; + random.nextBytes(splitKey); + admin.split(TableName.valueOf(testTable), splitKey); + observer.latch.await(5000, TimeUnit.MILLISECONDS); + Map result = + hbck.checkSplitMergeConsistency(Arrays.asList(TableName.valueOf(testTable))); + // since there is a split procedure work on the region, thus this check should return a empty + // map. + Assert.assertTrue(result.values().stream().noneMatch(state -> state + .getNumber() == MasterProtos.REGION_ERROR_TYPE.daughter_region_not_online.getNumber())); + + Optional> procedure = TEST_UTIL.getHBaseCluster().getMaster().getProcedures() + .stream().filter(p -> p instanceof SplitTableRegionProcedure).findAny(); + Assert.assertTrue(procedure.isPresent()); + hbck.bypassProcedure(Arrays.asList(procedure.get().getProcId()), 5, true, false); + result = hbck.checkSplitMergeConsistency(Arrays.asList(TableName.valueOf(testTable))); + Assert.assertTrue(result.values().stream() + .filter(state -> state + .getNumber() == MasterProtos.REGION_ERROR_TYPE.daughter_region_not_online.getNumber()) + .count() == 2); + hbck.assigns(Arrays.asList(result.keySet().toArray(new String[0]))); + ProcedureTestingUtility.waitNoProcedureRunning(master.getMasterProcedureExecutor()); + // now the state should be fixed + result = hbck.checkSplitMergeConsistency(Arrays.asList(TableName.valueOf(testTable))); + Assert.assertTrue(result.values().stream().noneMatch(state -> state + .getNumber() == MasterProtos.REGION_ERROR_TYPE.daughter_region_not_online.getNumber())); + } catch (InterruptedException ie) { + throw new IOException(ie); + } finally { + observer.resetLatch(); + } + } + + @Test public void testScheduleSCP() throws Exception { HRegionServer testRs = TEST_UTIL.getRSForFirstRegionInTable(TABLE_NAME); TEST_UTIL.loadTable(TEST_UTIL.getConnection().getTable(TABLE_NAME), Bytes.toBytes("family1"), - true); + true); ServerName serverName = testRs.getServerName(); Hbck hbck = getHbck(); List pids = - hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName))); + hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName))); assertTrue(pids.get(0) > 0); LOG.info("pid is {}", pids.get(0)); List newPids = - hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName))); + hbck.scheduleServerCrashProcedure(Arrays.asList(ProtobufUtil.toServerName(serverName))); assertTrue(newPids.get(0) < 0); LOG.info("pid is {}", newPids.get(0)); waitOnPids(pids); } + public static class FailingSplitAfterMetaUpdatedMasterObserver + implements MasterCoprocessor, MasterObserver { + public volatile CountDownLatch latch; + + @Override + public void start(CoprocessorEnvironment e) throws IOException { + resetLatch(); + } + + @Override + public Optional getMasterObserver() { + return Optional.of(this); + } + + @Override + public void preSplitRegionAfterMETAAction(ObserverContext ctx) + throws 
IOException { + LOG.info("I'm here"); + latch.countDown(); + throw new IOException("this procedure will fail at here forever"); + } + + public void resetLatch() { + this.latch = new CountDownLatch(1); + } + } + + public static class FailingMergeAfterMetaUpdatedMasterObserver + implements MasterCoprocessor, MasterObserver { + public volatile CountDownLatch latch; + + @Override + public void start(CoprocessorEnvironment e) throws IOException { + resetLatch(); + } + + @Override + public Optional getMasterObserver() { + return Optional.of(this); + } + + public void resetLatch() { + this.latch = new CountDownLatch(1); + } + + @Override + public void postMergeRegionsCommitAction( + final ObserverContext ctx, final RegionInfo[] regionsToMerge, + final RegionInfo mergedRegion) throws IOException { + latch.countDown(); + throw new IOException("this procedure will fail at here forever"); + } + } + private void waitOnPids(List pids) { TEST_UTIL.waitFor(60000, () -> pids.stream().allMatch(procExec::isFinished)); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java index b91bf11dae..e09cf75f4c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java @@ -275,7 +275,7 @@ public class TestSplitTransactionOnCluster { } public static class FailingSplitMasterObserver implements MasterCoprocessor, MasterObserver { - volatile CountDownLatch latch; + public volatile CountDownLatch latch; @Override public void start(CoprocessorEnvironment e) throws IOException { -- 2.17.1
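Reviewer's sketch, not part of the patch: the operator-side recovery flow that the two new
tests automate, distilled into one method. The Hbck handle, the stuck procedure id, and the
table name are assumptions for illustration.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Map;
    import org.apache.hadoop.hbase.TableName;
    import org.apache.hadoop.hbase.client.Hbck;
    import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;

    public class SplitMergeRecoveryExample {
      // Recovery recipe distilled from the tests; caller supplies the stuck procedure id.
      static void recover(Hbck hbck, TableName tableName, long stuckProcId) throws Exception {
        // 1. Report inconsistencies left behind by a failed split/merge.
        Map<String, MasterProtos.REGION_ERROR_TYPE> errors =
            hbck.checkSplitMergeConsistency(Arrays.asList(tableName));
        if (!errors.isEmpty()) {
          // 2. Bypass the stuck procedure (waitTime=5, override=true, recursive=false),
          //    since it can no longer roll back on its own.
          hbck.bypassProcedure(Arrays.asList(stuckProcId), 5, true, false);
          // 3. Re-run the check and assign whatever regions are still reported.
          errors = hbck.checkSplitMergeConsistency(Arrays.asList(tableName));
          hbck.assigns(new ArrayList<>(errors.keySet()));
        }
      }
    }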