From 4a3087d4c783632811e2831408a302d6738b77b1 Mon Sep 17 00:00:00 2001
From: Jingyun Tian
Date: Fri, 15 Mar 2019 15:53:18 +0800
Subject: [PATCH] HBASE-21965 Fix failed split and merge transactions that
 have failed to roll back

---
 .../apache/hadoop/hbase/client/HBaseHbck.java |  16 +++
 .../org/apache/hadoop/hbase/client/Hbck.java  |   6 +
 .../shaded/protobuf/RequestConverter.java     |   9 ++
 .../src/main/protobuf/Master.proto            |  17 +++
 .../hbase/master/MasterRpcServices.java       | 124 ++++++++++++++++++
 5 files changed, 172 insertions(+)

diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
index a276017b0c..e6ea33faa8 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseHbck.java
@@ -19,10 +19,12 @@ package org.apache.hadoop.hbase.client;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.Callable;
 import java.util.stream.Collectors;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.shaded.protobuf.RequestConverter;
@@ -172,4 +174,18 @@ public class HBaseHbck implements Hbck {
       throw new IOException(se);
     }
   }
+
+  @Override
+  public Map<String, MasterProtos.REGION_ERROR_TYPE>
+      checkSplitMergeConsistency(List<TableName> tableNames) throws IOException {
+    try {
+      MasterProtos.checkSplitMergeConsistencyResponse response =
+          this.hbck.checkSplitMergeConsistency(rpcControllerFactory.newController(),
+            RequestConverter.toCheckSplitMergeConsistencyRequest(tableNames));
+      return response.getErrorsMap();
+    } catch (ServiceException se) {
+      LOG.debug("check region consistency failed", se);
+      throw new IOException(se);
+    }
+  }
 }
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
index e88805cdcc..390984866d 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/Hbck.java
@@ -20,12 +20,15 @@ package org.apache.hadoop.hbase.client;
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.hadoop.hbase.Abortable;
 import org.apache.hadoop.hbase.HBaseInterfaceAudience;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.yetus.audience.InterfaceAudience;
 
 import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;
 
 /**
  * Hbck fixup tool APIs. Obtain an instance from {@link ClusterConnection#getHbck()} and call
@@ -106,4 +109,7 @@ public interface Hbck extends Abortable, Closeable {
 
   List<Long> scheduleServerCrashProcedure(List<HBaseProtos.ServerName> serverNames)
       throws IOException;
+
+  Map<String, MasterProtos.REGION_ERROR_TYPE>
+      checkSplitMergeConsistency(List<TableName> tableNames) throws IOException;
 }
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java
index 36c8fab23a..c3343ecee6 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/RequestConverter.java
@@ -1913,6 +1913,15 @@ public final class RequestConverter {
     return b.addAllServerName(serverNames).build();
   }
 
+  public static MasterProtos.checkSplitMergeConsistencyRequest
+      toCheckSplitMergeConsistencyRequest(List<TableName> tableNames) {
+    MasterProtos.checkSplitMergeConsistencyRequest.Builder b =
+        MasterProtos.checkSplitMergeConsistencyRequest.newBuilder();
+    List<HBaseProtos.TableName> protoTableNames = tableNames.stream()
+        .map(tableName -> ProtobufUtil.toProtoTableName(tableName)).collect(Collectors.toList());
+    return b.addAllTable(protoTableNames).build();
+  }
+
   private static List<RegionSpecifier> toEncodedRegionNameRegionSpecifiers(
       List<String> encodedRegionNames) {
     return encodedRegionNames.stream().
diff --git a/hbase-protocol-shaded/src/main/protobuf/Master.proto b/hbase-protocol-shaded/src/main/protobuf/Master.proto
index 4ed0ad5c0e..393d30145d 100644
--- a/hbase-protocol-shaded/src/main/protobuf/Master.proto
+++ b/hbase-protocol-shaded/src/main/protobuf/Master.proto
@@ -1093,6 +1093,20 @@ message ScheduleServerCrashProcedureResponse {
   repeated uint64 pid = 1;
 }
 
+message checkSplitMergeConsistencyRequest {
+  repeated TableName table = 1;
+}
+
+enum REGION_ERROR_TYPE {
+  merged_region_not_online = 0;
+  daughter_region_not_online = 1;
+  orphan_region_on_hdfs = 2;
+}
+
+message checkSplitMergeConsistencyResponse {
+  map<string, REGION_ERROR_TYPE> errors = 1;
+}
+
 service HbckService {
   /** Update state of the table in meta only*/
   rpc SetTableStateInMeta(SetTableStateInMetaRequest)
@@ -1123,4 +1137,7 @@ service HbckService {
   /** Schedule a ServerCrashProcedure to help recover a crash server */
   rpc ScheduleServerCrashProcedure(ScheduleServerCrashProcedureRequest)
     returns(ScheduleServerCrashProcedureResponse);
+
+  rpc checkSplitMergeConsistency(checkSplitMergeConsistencyRequest)
+    returns(checkSplitMergeConsistencyResponse);
 }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
index 063a353237..eb5b2d4309 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
@@ -25,14 +25,17 @@ import java.net.BindException;
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.stream.Collectors;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.ClusterMetricsBuilder;
 import org.apache.hadoop.hbase.DoNotRetryIOException;
@@ -66,6 +69,7 @@ import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
 import org.apache.hadoop.hbase.ipc.RpcServerFactory;
 import org.apache.hadoop.hbase.ipc.RpcServerInterface;
 import org.apache.hadoop.hbase.ipc.ServerRpcController;
+import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
 import org.apache.hadoop.hbase.master.assignment.RegionStates;
 import org.apache.hadoop.hbase.master.locking.LockProcedure;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
@@ -103,8 +107,10 @@ import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.ForeignExceptionUtil;
 import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.PairOfSameType;
 import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
 import org.apache.yetus.audience.InterfaceAudience;
@@ -2491,6 +2497,124 @@ public class MasterRpcServices extends RSRpcServices
     }
   }
 
+  @Override
+  public MasterProtos.checkSplitMergeConsistencyResponse checkSplitMergeConsistency(
+      RpcController controller, MasterProtos.checkSplitMergeConsistencyRequest request)
+      throws ServiceException {
+    List<HBaseProtos.TableName> tables = request.getTableList();
+
+    Map<String, MasterProtos.REGION_ERROR_TYPE> errorRegions = new HashMap<>();
+    try {
+      for (HBaseProtos.TableName tableName : tables) {
+        errorRegions.putAll(checkRegionConsistency(ProtobufUtil.toTableName(tableName)));
+      }
+    } catch (IOException e) {
+      throw new ServiceException(e);
+    }
+    return MasterProtos.checkSplitMergeConsistencyResponse.newBuilder().putAllErrors(errorRegions)
+        .build();
+  }
+
+  private Map<String, MasterProtos.REGION_ERROR_TYPE> checkRegionConsistency(TableName tableName)
+      throws IOException {
+    if (!MetaTableAccessor.tableExists(master.getConnection(), tableName)) {
+      throw new IOException("table " + tableName.getNameAsString() + " doesn't exist");
+    }
+    if (!MetaTableAccessor.getTableState(master.getConnection(), tableName).isEnabled()) {
+      throw new IOException("table " + tableName.getNameAsString()
+          + " is not enabled yet, please check assignments");
+    }
+    Map<String, MasterProtos.REGION_ERROR_TYPE> problemRegions = new HashMap<>();
+    List<RegionInfo> regionInfos =
+        MetaTableAccessor.getTableRegions(master.getConnection(), tableName);
+
+    // Case 1. find orphan regions on HDFS
+    // Orphan regions can be left by a failed split procedure (daughter region directories were
+    // created, then the procedure was aborted) or by a failed merge procedure (the merged region
+    // directory was created, then the procedure was aborted).
+    List<String> orphanRegions = findOrphanRegionOnHDFS(regionInfos, tableName);
+    orphanRegions.stream().forEach(
+      region -> problemRegions.put(region, MasterProtos.REGION_ERROR_TYPE.orphan_region_on_hdfs));
+
+    // Case 2. find unassigned daughter regions or merged regions
+    findUnassignedDaughterOrMergedRegions(tableName, problemRegions);
+
+    return problemRegions;
+  }
+
+  private void findUnassignedDaughterOrMergedRegions(TableName tableName,
+      Map<String, MasterProtos.REGION_ERROR_TYPE> problemRegions) throws IOException {
+    Set<RegionInfo> mergeSet = new HashSet<>();
+    Map<RegionInfo, PairOfSameType<RegionInfo>> splitMap = new HashMap<>();
+    Map<String, RegionState.State> regionStates = new HashMap<>();
+
+    MetaTableAccessor.scanMeta(master.getConnection(), tableName,
+      MetaTableAccessor.QueryType.REGION, Integer.MAX_VALUE, r -> {
+        RegionInfo regionInfo = MetaTableAccessor.getRegionInfo(r);
+        RegionState.State state = RegionStateStore.getRegionState(r, 0);
+        regionStates.put(regionInfo.getEncodedName(), state);
+        if (regionInfo.isSplit()) {
+          if (regionInfo.isSplitParent()) {
+            splitMap.put(regionInfo, MetaTableAccessor.getDaughterRegions(r));
+          }
+        } else if (r.getValue(HConstants.CATALOG_FAMILY, HConstants.MERGEA_QUALIFIER) != null
+            || r.getValue(HConstants.CATALOG_FAMILY, HConstants.MERGEB_QUALIFIER) != null) {
+          mergeSet.add(regionInfo);
+        }
+        return true;
+      });
+
+    // find unassigned daughter regions of split parents; accumulate the results of every
+    // split parent rather than keeping only the last iteration's result
+    List<String> unassignedDaughterRegions = new ArrayList<>();
+    for (Map.Entry<RegionInfo, PairOfSameType<RegionInfo>> entry : splitMap.entrySet()) {
+      unassignedDaughterRegions.addAll(
+        findUnassignedDaughterRegions(entry.getKey(), splitMap, regionStates, new ArrayList<>()));
+    }
+    unassignedDaughterRegions.stream().forEach(region -> problemRegions.put(region,
+      MasterProtos.REGION_ERROR_TYPE.daughter_region_not_online));
+
+    // find unassigned merged regions; regionStates is keyed by encoded region name
+    mergeSet.stream()
+        .filter(regionInfo -> !regionStates.get(regionInfo.getEncodedName())
+            .matches(RegionState.State.OPEN))
+        .map(regionInfo -> regionInfo.getEncodedName()).forEach(region -> problemRegions
+          .put(region, MasterProtos.REGION_ERROR_TYPE.merged_region_not_online));
+  }
+
+  private List<String> findUnassignedDaughterRegions(RegionInfo regionInfo,
+      Map<RegionInfo, PairOfSameType<RegionInfo>> splitMap,
+      Map<String, RegionState.State> regionStates, ArrayList<String> problemRegions) {
+    if (!regionInfo.isSplit()) {
+      // leaf region of the split chain: it should be online
+      if (!regionStates.get(regionInfo.getEncodedName()).matches(RegionState.State.OPEN)) {
+        problemRegions.add(regionInfo.getEncodedName());
+      }
+      return problemRegions;
+    }
+    findUnassignedDaughterRegions(splitMap.get(regionInfo).getFirst(), splitMap, regionStates,
+      problemRegions);
+    findUnassignedDaughterRegions(splitMap.get(regionInfo).getSecond(), splitMap, regionStates,
+      problemRegions);
+    return problemRegions;
+  }
+
+  private List<String> findOrphanRegionOnHDFS(List<RegionInfo> regionInfos, TableName tableName)
+      throws IOException {
+    // list region directories under the table dir on HDFS, skipping non-region entries
+    Path tableDir = FSUtils.getTableDir(master.getMasterFileSystem().getRootDir(), tableName);
+    FileStatus[] regions =
+        master.getFileSystem().listStatus(tableDir, path -> !path.getName().startsWith("."));
+    List<String> regionNames = Arrays.stream(regions).map(region -> region.getPath().getName())
+        .collect(Collectors.toList());
+    HashSet<String> regionsInMeta = new HashSet<>();
+    regionInfos.stream().forEach(regionInfo -> regionsInMeta.add(regionInfo.getEncodedName()));
+    // whatever is on HDFS but not in meta is an orphan region
+    Iterator<String> regionIterator = regionNames.iterator();
+    while (regionIterator.hasNext()) {
+      String region = regionIterator.next();
+      if (regionsInMeta.contains(region)) {
+        regionIterator.remove();
+      }
+    }
+    return regionNames;
+  }
+
   @Override
   public SwitchRpcThrottleResponse switchRpcThrottle(RpcController controller,
       SwitchRpcThrottleRequest request) throws ServiceException {
-- 
2.17.1
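
For reviewers who want to try the new API, below is a minimal usage sketch. It assumes an
already-open ClusterConnection (named `conn` here) and an illustrative table name; the only
pieces taken from the codebase are ClusterConnection#getHbck() (referenced in the Hbck javadoc
above) and the checkSplitMergeConsistency method this patch adds.

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.Hbck;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos;

public class CheckSplitMergeConsistencyExample {
  // `conn` is assumed to be an open ClusterConnection; "usertable" is illustrative only.
  public static void report(ClusterConnection conn) throws Exception {
    List<TableName> tables = Arrays.asList(TableName.valueOf("usertable"));
    try (Hbck hbck = conn.getHbck()) {
      // Returns encoded region name -> error type: merged_region_not_online,
      // daughter_region_not_online, or orphan_region_on_hdfs.
      Map<String, MasterProtos.REGION_ERROR_TYPE> errors =
          hbck.checkSplitMergeConsistency(tables);
      errors.forEach((region, type) -> System.out.println(region + " => " + type));
    }
  }
}

Note that the check requires each table to exist and be enabled; otherwise the master wraps the
IOException in a ServiceException, which surfaces client-side as an IOException.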