From effef4ed14ae771c053c87e74ae16f9b281a4d99 Mon Sep 17 00:00:00 2001 From: Mike Drob Date: Mon, 8 Oct 2018 14:28:23 -0500 Subject: [PATCH] HBASE-21073 Create repair mode startup option for HMaster --- .../org/apache/hadoop/hbase/master/HMaster.java | 17 ++++++++- .../apache/hadoop/hbase/master/LoadBalancer.java | 17 +++++++-- .../hadoop/hbase/master/MasterRpcServices.java | 2 +- .../apache/hadoop/hbase/master/ServerManager.java | 22 ++++++----- .../hbase/master/balancer/BaseLoadBalancer.java | 16 ++++++-- .../hadoop/hbase/regionserver/HRegionServer.java | 3 +- .../hadoop/hbase/regionserver/RSRpcServices.java | 6 ++- .../org/apache/hadoop/hbase/master/TestMaster.java | 44 ++++++++++++++++++++++ 8 files changed, 105 insertions(+), 22 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 8ae8be35ed..6179cfde21 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -440,6 +440,11 @@ public class HMaster extends HRegionServer implements MasterServices { /** jetty server for master to redirect requests to regionserver infoServer */ private Server masterJettyServer; + // Determine if we should do normal startup or minimal "single-user" mode with no region + // servers and no user tables. 
Useful for repair and recovery of hbase:meta + private boolean repairMode; + static final String REPAIR_MODE = "hbase.master.repairmode"; + public static class RedirectServlet extends HttpServlet { private static final long serialVersionUID = 2894774810058302473L; private final int regionServerInfoPort; @@ -499,6 +504,7 @@ public class HMaster extends HRegionServer implements MasterServices { super(conf); TraceUtil.initTracer(conf); try { + this.repairMode = conf.getBoolean(REPAIR_MODE, false); this.rsFatals = new MemoryBoundedLogMessageBuffer( conf.getLong("hbase.master.buffer.for.rs.fatals", 1 * 1024 * 1024)); LOG.info("hbase.rootdir=" + getRootDir() + @@ -683,6 +689,9 @@ public class HMaster extends HRegionServer implements MasterServices { */ @Override protected void waitForMasterActive(){ + if (repairMode) { + return; + } boolean tablesOnMaster = LoadBalancer.isTablesOnMaster(conf); while (!(tablesOnMaster && activeMaster) && !isStopped() && !isAborted()) { sleeper.sleep(); @@ -1066,7 +1075,7 @@ public class HMaster extends HRegionServer implements MasterServices { // The below depends on hbase:meta being online. this.tableStateManager.start(); // Initialize after meta is up as below scans meta - if (favoredNodesManager != null) { + if (favoredNodesManager != null && !repairMode) { SnapshotOfRegionAssignmentFromMeta snapshotOfRegionAssignment = new SnapshotOfRegionAssignmentFromMeta(getConnection()); snapshotOfRegionAssignment.initialize(); @@ -1115,6 +1124,12 @@ public class HMaster extends HRegionServer implements MasterServices { configurationManager.registerObserver(this.logCleaner); // Set master as 'initialized'. 
setInitialized(true); + + if (repairMode) { + LOG.info("Detected repair mode, skipping final initialization steps."); + return; + } + assignmentManager.checkIfShouldMoveSystemRegionAsync(); status.setStatus("Assign meta replicas"); MasterMetaBootstrap metaBootstrap = createMetaBootstrap(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java index 9a894e1f09..63a789b6f7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/LoadBalancer.java @@ -53,13 +53,12 @@ public interface LoadBalancer extends Configurable, Stoppable, ConfigurationObse * By default, it carries no tables. * TODO: Add any | system as flags to indicate what it can do. */ - public static final String TABLES_ON_MASTER = "hbase.balancer.tablesOnMaster"; + String TABLES_ON_MASTER = "hbase.balancer.tablesOnMaster"; /** * Master carries system tables. */ - public static final String SYSTEM_TABLES_ON_MASTER = - "hbase.balancer.tablesOnMaster.systemTablesOnly"; + String SYSTEM_TABLES_ON_MASTER = "hbase.balancer.tablesOnMaster.systemTablesOnly"; // Used to signal to the caller that the region(s) cannot be assigned // We deliberately use 'localhost' so the operation will fail fast @@ -167,6 +166,14 @@ public interface LoadBalancer extends Configurable, Stoppable, ConfigurationObse /*Updates balancer status tag reported to JMX*/ void updateBalancerStatus(boolean status); + /** + * @return true if starting up in master repair mode. Will load only system tables onto master + * and no user tables. 
+ */ + static boolean isRepairMode(Configuration conf) { + return conf.getBoolean(HMaster.REPAIR_MODE, false); + } + /** * @return true if Master carries regions */ @@ -177,4 +184,8 @@ public interface LoadBalancer extends Configurable, Stoppable, ConfigurationObse static boolean isSystemTablesOnlyOnMaster(Configuration conf) { return conf.getBoolean(SYSTEM_TABLES_ON_MASTER, false); } + + static boolean isMasterCanHostUserRegions(Configuration conf) { + return isTablesOnMaster(conf) && !isSystemTablesOnlyOnMaster(conf); + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java index ecd6e0b76d..53eeac1fb1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java @@ -355,7 +355,7 @@ public class MasterRpcServices extends RSRpcServices throws IOException { // RpcServer at HM by default enable ByteBufferPool iff HM having user table region in it boolean reservoirEnabled = conf.getBoolean(RESERVOIR_ENABLED_KEY, - (LoadBalancer.isTablesOnMaster(conf) && !LoadBalancer.isSystemTablesOnlyOnMaster(conf))); + LoadBalancer.isMasterCanHostUserRegions(conf)); try { return RpcServerFactory.createRpcServer(server, name, getServices(), bindAddress, // use final bindAddress for this server. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 68b8e79964..34d0168c06 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -761,19 +761,21 @@ public class ServerManager { * RegionServers to check-in. */ private int getMinToStart() { - // One server should be enough to get us off the ground. 
- int requiredMinToStart = 1; - if (LoadBalancer.isTablesOnMaster(master.getConfiguration())) { - if (LoadBalancer.isSystemTablesOnlyOnMaster(master.getConfiguration())) { - // If Master is carrying regions but NOT user-space regions, it - // still shows as a 'server'. We need at least one more server to check - // in before we can start up so set defaultMinToStart to 2. - requiredMinToStart = requiredMinToStart + 1; - } + int minimumRequired = 1; + + if (master.getConfiguration().getBoolean(HMaster.REPAIR_MODE, false)) { + // Only need master up here, it will show up as a 'server' and will host meta + minimumRequired = 1; + } else if (LoadBalancer.isTablesOnMaster(master.getConfiguration()) && + LoadBalancer.isSystemTablesOnlyOnMaster(master.getConfiguration())) { + // If Master is carrying regions it will show up as a 'server', but is not handling user- + // space regions, so we need a second server. + minimumRequired = 2; } + int minToStart = this.master.getConfiguration().getInt(WAIT_ON_REGIONSERVERS_MINTOSTART, -1); // Ensure we are never less than requiredMinToStart else stuff won't work. - return minToStart == -1 || minToStart < requiredMinToStart? requiredMinToStart: minToStart; + return minToStart == -1 || minToStart < minimumRequired ? 
minimumRequired : minToStart; } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java index 52a284e3c0..d149175411 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java @@ -1013,6 +1013,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { protected MasterServices services; protected boolean tablesOnMaster; protected boolean onlySystemTablesOnMaster; + protected boolean repairMode; @Override public void setConf(Configuration conf) { @@ -1024,8 +1025,15 @@ public abstract class BaseLoadBalancer implements LoadBalancer { if (overallSlop < 0) overallSlop = 0; else if (overallSlop > 1) overallSlop = 1; + this.repairMode = LoadBalancer.isRepairMode(this.config); this.tablesOnMaster = LoadBalancer.isTablesOnMaster(this.config); this.onlySystemTablesOnMaster = LoadBalancer.isSystemTablesOnlyOnMaster(this.config); + + if (this.repairMode) { + this.tablesOnMaster = true; + this.onlySystemTablesOnMaster = true; + } + // If system tables on master, implies tablesOnMaster = true. if (this.onlySystemTablesOnMaster && !this.tablesOnMaster) { LOG.warn("Set " + TABLES_ON_MASTER + "=true because " + SYSTEM_TABLES_ON_MASTER + "=true"); @@ -1036,8 +1044,8 @@ public abstract class BaseLoadBalancer implements LoadBalancer { regionFinder.setConf(conf); } // Print out base configs. Don't print overallSlop since it for simple balancer exclusively. 
- LOG.info("slop=" + this.slop + ", tablesOnMaster=" + this.tablesOnMaster + - ", systemTablesOnMaster=" + this.onlySystemTablesOnMaster); + LOG.info("slop={}, tablesOnMaster={}, systemTablesOnMaster={}, repairMode={}", + this.slop, this.tablesOnMaster, this.onlySystemTablesOnMaster, this.repairMode); } protected void setSlop(Configuration conf) { @@ -1241,7 +1249,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { regions.removeAll(masterRegions); } } - if (regions == null || regions.isEmpty()) { + if (repairMode || regions == null || regions.isEmpty()) { return assignments; } @@ -1414,7 +1422,7 @@ public abstract class BaseLoadBalancer implements LoadBalancer { regions = regions.entrySet().stream().filter(e -> !masterRegions.contains(e.getKey())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } - if (regions.isEmpty()) { + if (repairMode || regions.isEmpty()) { return assignments; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index d5e9832504..7d0073d046 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -945,6 +945,7 @@ public class HRegionServer extends HasThread implements // Try and register with the Master; tell it we are here. Break if server is stopped or the // clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and start // up all Services. Use RetryCounter to get backoff in case Master is struggling to come up. 
+ LOG.debug("About to register with Master."); RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5); RetryCounter rc = rcf.create(); @@ -1809,7 +1810,7 @@ public class HRegionServer extends HasThread implements */ private void setupWALAndReplication() throws IOException { boolean isMasterNoTableOrSystemTableOnly = this instanceof HMaster && - (!LoadBalancer.isTablesOnMaster(conf) || LoadBalancer.isSystemTablesOnlyOnMaster(conf)); + !LoadBalancer.isMasterCanHostUserRegions(conf); WALFactory factory = new WALFactory(conf, serverName.toString(), !isMasterNoTableOrSystemTableOnly); if (!isMasterNoTableOrSystemTableOnly) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java index 8bb2e9c0da..17c582df67 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java @@ -2485,8 +2485,10 @@ public class RSRpcServices implements HBaseRPCErrorHandler, } private boolean shouldRejectRequestsFromClient(HRegion region) { - return regionServer.getReplicationSourceService().getSyncReplicationPeerInfoProvider() - .checkState(region.getRegionInfo().getTable(), RejectRequestsFromClientStateChecker.get()); + TableName table = region.getRegionInfo().getTable(); + ReplicationSourceService service = regionServer.getReplicationSourceService(); + return service != null && service.getSyncReplicationPeerInfoProvider() + .checkState(table, RejectRequestsFromClientStateChecker.get()); } private void rejectIfInStandByState(HRegion region) throws DoNotRetryIOException { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMaster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMaster.java index 81bdb023bd..5fbeb9f2ae 100644 --- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMaster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMaster.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.master; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -38,11 +39,17 @@ import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.MiniHBaseCluster; import org.apache.hadoop.hbase.PleaseHoldException; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.StartMiniClusterOption; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.UnknownRegionException; import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.testclassification.MasterTests; @@ -263,5 +270,42 @@ public class TestMaster { // Assert lock gets put in place again. 
assertTrue(fs.exists(hbckLockPath)); } + + @Test + public void testRepairMode() throws Exception { + TableName testRepairMode = TableName.valueOf(name.getMethodName()); + Table t = TEST_UTIL.createTable(testRepairMode, FAMILYNAME); + Put p = new Put(Bytes.toBytes("r")); + p.addColumn(FAMILYNAME, Bytes.toBytes("c"), new byte[0]); + t.put(p); + + TEST_UTIL.shutdownMiniHBaseCluster(); + + Configuration c = TEST_UTIL.getConfiguration(); + c.setBoolean(HMaster.REPAIR_MODE, true); + c.setInt("hbase.master.init.timeout.localHBaseCluster", 30000); + + LOG.info("Starting master-only"); + + TEST_UTIL.startMiniHBaseCluster(StartMiniClusterOption.builder() + .numRegionServers(0).createRootDir(false).build()); + + Connection conn = TEST_UTIL.getConnection(); + Scan scan = new Scan(); + scan.addFamily(HConstants.TABLE_FAMILY); + + try (ResultScanner scanner = conn.getTable(TableName.META_TABLE_NAME).getScanner(scan)) { + assertArrayEquals(TableName.NAMESPACE_TABLE_NAME.getName(), scanner.next().getRow()); + assertArrayEquals(testRepairMode.getName(), scanner.next().getRow()); + assertNull(scanner.next()); + } + + try { + conn.getTable(testRepairMode).getScanner(new Scan()).next(); + fail("Should not be able to access user-space tables in repair mode."); + } catch (Exception e) { + // Expected + } + } } -- 2.16.1