From 725bba7f4b40124b02f21689b8a688af9fa70f0d Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 12 Sep 2018 10:47:33 -0700 Subject: [PATCH] HBASE-21191 Add a holding-pattern if no assign for meta or namespace (Can happen if masterprocwals have been cleared). Add a check for hbase:meta being online before we go to read it. If not online, move into a holding-pattern until rectified, probably by external operator. Incorporates bulk of patch made by Allan Yang over on HBASE-21035. M hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java Add a constructor for the case where retries go on forever. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Move stuff around so that the first hbase:meta read is the AM#loadMeta. Previously, checking table state and/or favored nodes could end up trying to read a meta that was not onlined, holding up master startup. Do similar for the namespace table. Adds new methods isMeta and isNamespace which check that the regions/tables are online; if not, we wait, logging with a back-off that assigns need to be run. 
--- .../hadoop/hbase/util/RetryCounterFactory.java | 4 + .../org/apache/hadoop/hbase/master/HMaster.java | 126 +++++++++++++++++++-- .../hadoop/hbase/master/MasterRpcServices.java | 5 +- .../hadoop/hbase/master/MasterWalManager.java | 44 +++++-- .../hadoop/hbase/master/TableNamespaceManager.java | 8 +- .../hbase/master/assignment/AssignmentManager.java | 5 +- .../master/procedure/DisableTableProcedure.java | 2 +- .../master/procedure/EnableTableProcedure.java | 2 +- .../hadoop/hbase/regionserver/HRegionServer.java | 12 +- .../apache/hadoop/hbase/TestMetaTableAccessor.java | 16 +++ .../TestMetaInitIfAllProceduresLost.java | 121 ++++++++++++++++++++ 11 files changed, 312 insertions(+), 33 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMetaInitIfAllProceduresLost.java diff --git a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java index dcf6626ae4..c15cfb2cc7 100644 --- a/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java +++ b/hbase-common/src/main/java/org/apache/hadoop/hbase/util/RetryCounterFactory.java @@ -28,6 +28,10 @@ import org.apache.yetus.audience.InterfaceAudience; public class RetryCounterFactory { private final RetryConfig retryConfig; + public RetryCounterFactory(int sleepIntervalMillis) { + this(Integer.MAX_VALUE, sleepIntervalMillis); + } + public RetryCounterFactory(int maxAttempts, int sleepIntervalMillis) { this(maxAttempts, sleepIntervalMillis, -1); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index f3fb989315..4a585a5db3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -55,6 +55,7 @@ import javax.servlet.ServletException; 
import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; + import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -81,10 +82,14 @@ import org.apache.hadoop.hbase.TableNotFoundException; import org.apache.hadoop.hbase.UnknownRegionException; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; +import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.MasterSwitchType; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.client.TableDescriptor; import org.apache.hadoop.hbase.client.TableDescriptorBuilder; import org.apache.hadoop.hbase.client.TableState; @@ -191,6 +196,8 @@ import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.IdLock; import org.apache.hadoop.hbase.util.ModifyRegionUtils; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.util.RetryCounter; +import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.VersionInfo; import org.apache.hadoop.hbase.zookeeper.LoadBalancerTracker; @@ -916,10 +923,14 @@ public class HMaster extends HRegionServer implements MasterServices { status.setStatus("Initializing master coprocessors"); this.cpHost = new MasterCoprocessorHost(this, this.conf); + // Checking if meta needs initializing. 
status.setStatus("Initializing meta table if this is a new deploy"); InitMetaProcedure initMetaProc = null; - if (assignmentManager.getRegionStates().getRegionState(RegionInfoBuilder.FIRST_META_REGIONINFO) - .isOffline()) { + // Print out state of hbase:meta on startup; helps debugging. + RegionState rs = this.assignmentManager.getRegionStates(). + getRegionState(RegionInfoBuilder.FIRST_META_REGIONINFO); + LOG.info("hbase:meta {}", rs); + if (rs.isOffline()) { Optional> optProc = procedureExecutor.getProcedures().stream() .filter(p -> p instanceof InitMetaProcedure).findAny(); if (optProc.isPresent()) { @@ -946,7 +957,6 @@ public class HMaster extends HRegionServer implements MasterServices { if (initMetaProc != null) { initMetaProc.await(); } - tableStateManager.start(); // Wake up this server to check in sleeper.skipSleepCycle(); @@ -963,7 +973,20 @@ public class HMaster extends HRegionServer implements MasterServices { return; } - //Initialize after meta as it scans meta + status.setStatus("Starting assignment manager"); + // FIRST HBASE:META READ!!!! + // The below cannot make progress w/o hbase:meta being online. + // This is the FIRST attempt at going to hbase:meta. Meta on-lining is going on in background + // as procedures run -- in particular SCPs for crashed servers... One should put up hbase:meta + // if it is down. It may take a while to come online. So, wait here until meta is for sure + // available. That's what isMeta does. + if (!isMeta()) { + return; + } + this.assignmentManager.joinCluster(); + // The below depends on hbase:meta being online. 
+ this.tableStateManager.start(); + // Initialize after meta is up as below scans meta if (favoredNodesManager != null) { SnapshotOfRegionAssignmentFromMeta snapshotOfRegionAssignment = new SnapshotOfRegionAssignmentFromMeta(getConnection()); @@ -971,10 +994,6 @@ public class HMaster extends HRegionServer implements MasterServices { favoredNodesManager.initialize(snapshotOfRegionAssignment); } - // Fix up assignment manager status - status.setStatus("Starting assignment manager"); - this.assignmentManager.joinCluster(); - // set cluster status again after user regions are assigned this.balancer.setClusterMetrics(getClusterMetricsWithoutCoprocessor()); @@ -989,6 +1008,13 @@ public class HMaster extends HRegionServer implements MasterServices { this.catalogJanitorChore = new CatalogJanitor(this); getChoreService().scheduleChore(catalogJanitorChore); + // NAMESPACE READ!!!! + // Here we expect hbase:namespace to be online. See inside initClusterSchemaService. + // TODO: Fix this. Namespace is a pain being a sort-of system table. Fold it in to hbase:meta. + // isNamespace does like isMeta and waits until namespace is onlined before allowing progress. + if (!isNamespace()) { + return; + } status.setStatus("Starting cluster schema service"); initClusterSchemaService(); @@ -1064,6 +1090,90 @@ public class HMaster extends HRegionServer implements MasterServices { } } + /** + * Check hbase:meta is up and ready for reading. For use during Master startup only. + * @return True if meta is UP and online and startup can progress. Otherwise, meta is not online + * and we will hold here until operator intervention. + */ + @VisibleForTesting + public boolean isMeta() throws InterruptedException { + return isRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO); + } + + /** + * @return True if region is online and scannable else false if an error or shutdown (Otherwise + * we just block in here holding up all forward-progress). 
+ */ + private boolean isRegionOnline(RegionInfo ri) throws InterruptedException { + RetryCounter rc = null; + while (!isStopped()) { + RegionState rs = this.assignmentManager.getRegionStates().getRegionState(ri); + if (rs.isOpened()) { + if (this.getServerManager().isServerOnline(rs.getServerName())) { + if (isScanRegion(getConnection(), ri)) { + LOG.info("Verified online: {}", rs); + return true; + } + // If this fails, then I should mark meta as OFFLINE. TODO. + } + } + // Region is not OPEN, its hosting server is not online, or the scan failed. + Optional> optProc = this.procedureExecutor.getProcedures(). + stream().filter(p -> p instanceof ServerCrashProcedure).findAny(); + // TODO: Add a page to refguide on how to do repair. Have this log message point to it. + // Page will talk about loss of edits, how to schedule at least the meta WAL recovery, and + // then how to assign including how to break region lock if one held. + LOG.warn("{} NOT online, state={}; ServerCrashProcedures={}. Master startup in " + + "holding-pattern until region onlined; operator intervention required? Schedule assign", + ri.getRegionNameAsString(), rs, optProc.isPresent()); + // Sleep before re-checking; base sleep interval passed to the factory is 1000ms. + if (rc == null) { + rc = new RetryCounterFactory(1000).create(); + } + Threads.sleep(rc.getBackoffTimeAndIncrementAttempts()); + } + return false; + } + + /** + * Check hbase:namespace table is assigned. If not, startup will hang looking for the ns table + * (TODO: Fix this! NS should not hold-up startup). + * @return True if namespace table is up/online. + */ + @VisibleForTesting + public boolean isNamespace() throws InterruptedException { + List ris = this.assignmentManager.getRegionStates(). + getRegionsOfTable(TableName.NAMESPACE_TABLE_NAME); + if (ris.isEmpty()) { + // If empty, means we've not assigned the namespace table yet... Just return true so startup + // continues and the namespace table gets created. + return true; + } + // Else there are namespace regions up in meta. Ensure they are assigned before we go on. 
+ for (RegionInfo ri: ris) { + isRegionOnline(ri); + } + return true; + } + + /** + * @return True if we can scan at least first-row in the passed-in region else false if we can't + * (We may be in here for some time as we run through all rpc retries). + */ + private static boolean isScanRegion(Connection connection, RegionInfo ri) { + try (Table t = connection.getTable(ri.getTable())) { + Scan s = new Scan(ri.getStartKey()); + s.setLimit(1); + try (ResultScanner rs = t.getScanner(s)) { + rs.next(); + } + return true; + } catch (IOException e) { + e.printStackTrace(); + } + return false; + } + /** * Adds the {@code MasterQuotasObserver} to the list of configured Master observers to * automatically remove quotas for a table when that table is deleted. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java index 4e41783607..598853f789 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java @@ -525,9 +525,8 @@ public class MasterRpcServices extends RSRpcServices RpcController controller, ReportRSFatalErrorRequest request) throws ServiceException { String errorText = request.getErrorMessage(); ServerName sn = ProtobufUtil.toServerName(request.getServer()); - String msg = "Region server " + sn - + " reported a fatal error:\n" + errorText; - LOG.error(msg); + String msg = sn + " reported a fatal error:\n" + errorText; + LOG.warn(msg); master.rsFatals.add(msg); return ReportRSFatalErrorResponse.newBuilder().build(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java index d716a11619..346052db55 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java +++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java @@ -144,18 +144,33 @@ public class MasterWalManager { } /** - * @return listing of ServerNames found in the filesystem under the WAL directory - * that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these are already - * being split -- they cannot be 'alive'. + * Get Servernames which are currently splitting; paths have a '-splitting' suffix. + * @return ServerName + * @throws IOException IOException + */ + public Set getSplittingServersFromWALDir() throws IOException { + return getServerNamesFromWALDirPath( + p -> p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); + } + + /** + * Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these + * are already being split -- they cannot be 'alive'. + * @return ServerName + * @throws IOException IOException */ public Set getLiveServersFromWALDir() throws IOException { - Path walDirPath = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME); - FileStatus[] walDirForLiveServers = FSUtils.listStatus(fs, walDirPath, - p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); - if (walDirForLiveServers == null) { - return Collections.emptySet(); - } - return Stream.of(walDirForLiveServers).map(s -> { + return getServerNamesFromWALDirPath( + p -> !p.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT)); + } + + /** + * @return listing of ServerNames found by parsing WAL directory paths in FS. 
+ * + */ + public Set getServerNamesFromWALDirPath(final PathFilter filter) throws IOException { + FileStatus[] walDirForServerNames = getWALDirPaths(filter); + return Stream.of(walDirForServerNames).map(s -> { ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(s.getPath()); if (serverName == null) { LOG.warn("Log folder {} doesn't look like its name includes a " + @@ -167,6 +182,15 @@ public class MasterWalManager { }).filter(s -> s != null).collect(Collectors.toSet()); } + /** + * @return List of all RegionServer WAL dirs; i.e. this.rootDir/HConstants.HREGION_LOGDIR_NAME. + */ + public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException { + Path walDirPath = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME); + FileStatus[] walDirForServerNames = FSUtils.listStatus(fs, walDirPath, filter); + return walDirForServerNames == null? new FileStatus[0]: walDirForServerNames; + } + /** * Inspect the log directory to find dead servers which need recovery work * @return A set of ServerNames which aren't running but still have WAL files left in file system diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/TableNamespaceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/TableNamespaceManager.java index 0b4e35bf19..aefeebe098 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/TableNamespaceManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/TableNamespaceManager.java @@ -383,12 +383,16 @@ public class TableNamespaceManager implements Stoppable { return; } try { - this.zkNamespaceManager.stop(); + if (this.zkNamespaceManager != null) { + this.zkNamespaceManager.stop(); + } } catch (IOException ioe) { LOG.warn("Failed NamespaceManager close", ioe); } try { - this.nsTable.close(); + if (this.nsTable != null) { + this.nsTable.close(); + } } catch (IOException ioe) { LOG.warn("Failed Namespace Table close", ioe); } diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index f0a723dae1..3d09d3a28c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1225,8 +1225,9 @@ public class AssignmentManager implements ServerListener { long startTime = System.nanoTime(); LOG.debug("Joining cluster..."); - // Scan hbase:meta to build list of existing regions, servers, and assignment - // hbase:meta is online when we get to here and TableStateManager has been started. + // Scan hbase:meta to build list of existing regions, servers, and assignment. + // hbase:meta is online now or will be. Inside loadMeta, we keep trying. Can't make progress + // w/o meta. loadMeta(); while (master.getServerManager().countOfRegionServers() < 1) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DisableTableProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DisableTableProcedure.java index 3a2a9521f4..dd1034ec0a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DisableTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DisableTableProcedure.java @@ -256,7 +256,7 @@ public class DisableTableProcedure TableStateManager tsm = env.getMasterServices().getTableStateManager(); TableState ts = tsm.getTableState(tableName); if (!ts.isEnabled()) { - LOG.info("Not ENABLED skipping {}", this); + LOG.info("Not ENABLED, state={}, skipping disable; {}", ts.getState(), this); setFailure("master-disable-table", new TableNotEnabledException(ts.toString())); canTableBeDisabled = false; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java index c46070cd58..a1f56c2333 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/EnableTableProcedure.java @@ -335,7 +335,7 @@ public class EnableTableProcedure TableStateManager tsm = env.getMasterServices().getTableStateManager(); TableState ts = tsm.getTableState(tableName); if(!ts.isDisabled()){ - LOG.info("Not DISABLED tableState=" + ts + "; skipping enable"); + LOG.info("Not DISABLED tableState={}; skipping enable; {}", ts.getState(), this); setFailure("master-enable-table", new TableNotDisabledException(ts.toString())); canTableBeEnabled = false; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 2146e03e71..1f84f9f7cf 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -711,8 +711,12 @@ public class HRegionServer extends HasThread implements "hbase.regionserver.kerberos.principal", host); } - protected void waitForMasterActive() { - } + + /** + * Wait for an active Master. + * See override in Master superclass for how it is used. + */ + protected void waitForMasterActive() {} protected String getProcessName() { return REGIONSERVER; @@ -873,10 +877,6 @@ public class HRegionServer extends HasThread implements } } - // In case colocated master, wait here till it's active. - // So backup masters won't start as regionservers. - // This is to avoid showing backup masters as regionservers - // in master web UI, or assigning any region to them. 
waitForMasterActive(); if (isStopped() || isAborted()) { return; // No need for further initialization diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java index f7865ee330..41e1b5536b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java @@ -46,6 +46,7 @@ import org.apache.hadoop.hbase.ipc.CallRunner; import org.apache.hadoop.hbase.ipc.DelegatingRpcScheduler; import org.apache.hadoop.hbase.ipc.PriorityFunction; import org.apache.hadoop.hbase.ipc.RpcScheduler; +import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.RSRpcServices; import org.apache.hadoop.hbase.regionserver.SimpleRpcSchedulerFactory; @@ -104,6 +105,21 @@ public class TestMetaTableAccessor { UTIL.shutdownMiniCluster(); } + @Test + public void testIsMetaWhenAllHealthy() throws InterruptedException { + HMaster m = UTIL.getMiniHBaseCluster().getMaster(); + assertTrue(m.isMeta()); + } + + @Test + public void testIsMetaWhenMetaGoesOffline() throws InterruptedException { + HMaster m = UTIL.getMiniHBaseCluster().getMaster(); + int index = UTIL.getMiniHBaseCluster().getServerWithMeta(); + HRegionServer rsWithMeta = UTIL.getMiniHBaseCluster().getRegionServer(index); + rsWithMeta.abort("TESTING"); + assertTrue(m.isMeta()); + } + /** * Does {@link MetaTableAccessor#getRegion(Connection, byte[])} and a write * against hbase:meta while its hosted server is restarted to prove our retrying diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMetaInitIfAllProceduresLost.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMetaInitIfAllProceduresLost.java new file mode 100644 index 0000000000..3154214e2d --- /dev/null +++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMetaInitIfAllProceduresLost.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.assignment; + + + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +import static 
org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore.MASTER_PROCEDURE_LOGDIR; + +@Category({MasterTests.class, MediumTests.class}) +public class TestMetaInitIfAllProceduresLost { + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestMetaInitIfAllProceduresLost.class); + + private static final Logger LOG = LoggerFactory + .getLogger(TestMetaInitIfAllProceduresLost.class); + + protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + @BeforeClass + public static void setupCluster() throws Exception { + UTIL.startMiniCluster(3); + } + + @AfterClass + public static void cleanupTest() throws Exception { + try { + UTIL.shutdownMiniCluster(); + } catch (Exception e) { + LOG.warn("failure shutting down cluster", e); + } + } + + @Test + public void test() throws Exception { + for (JVMClusterUtil.RegionServerThread rst : UTIL.getMiniHBaseCluster() + .getRegionServerThreads()) { + rst.getRegionServer().abort("killAll"); + } + //wait for a while, until all dirs are changed to '-splitting' + UTIL.waitFor(30000, () -> + UTIL.getMiniHBaseCluster().getMaster().getMasterWalManager() + .getLiveServersFromWALDir().size() == 0); + Thread.sleep(1000); + Path procedureWals = new Path( + UTIL.getMiniHBaseCluster().getMaster().getMasterFileSystem() + .getRootDir(), MASTER_PROCEDURE_LOGDIR); + //Kill the master + UTIL.getMiniHBaseCluster().killAll(); + //Delete all procedure logs to create an anomaly + for (FileStatus file : UTIL.getTestFileSystem().listStatus(procedureWals)) { + LOG.info("removing " + file); + UTIL.getTestFileSystem().delete(file.getPath()); + } + UTIL.getMiniHBaseCluster().startMaster(); + UTIL.getMiniHBaseCluster().startRegionServer(); + UTIL.getMiniHBaseCluster().startRegionServer(); + UTIL.getMiniHBaseCluster().startRegionServer(); + ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); + // Schedule an assign of meta after ten seconds. 
Then a few seconds later, do namespace assign. + // The meta table needs to be online before the namespace can succeed. + final HMaster master = UTIL.getHBaseCluster().getMaster(); + final AssignmentManager am = master.getAssignmentManager(); + final AssignProcedure ap = am.createAssignProcedure(RegionInfoBuilder.FIRST_META_REGIONINFO); + scheduler.schedule(() -> master.getMasterProcedureExecutor().submitProcedure(ap), 10, TimeUnit.SECONDS); + scheduler.schedule(() -> { + // hbase:meta should be online by the time this runs. That means we should have read the + // regions that make up the namespace table so below query should return results. + List ris = am.getRegionStates().getRegionsOfTable(TableName.NAMESPACE_TABLE_NAME); + if (ris.isEmpty()) { + throw new RuntimeException("No namespace regions found!"); + } + for (RegionInfo ri: ris) { + AssignProcedure riap = am.createAssignProcedure(ri); + master.getMasterProcedureExecutor().submitProcedure(riap); + } + }, 20 /*Must run AFTER meta is online*/, TimeUnit.SECONDS); + // Master should be able to finish init even if all procedures are lost + UTIL.waitFor(180000, () -> UTIL.getMiniHBaseCluster().getMaster() != null && UTIL + .getMiniHBaseCluster().getMaster().isInitialized()); + } +} -- 2.16.3