From 1ea1d4a249cd8841ac3abbb3706d603f460f632b Mon Sep 17 00:00:00 2001 From: Bahram Chehrazy Date: Mon, 4 Feb 2019 13:07:40 -0800 Subject: [PATCH] [HBASE-21844] Handling incorrect Meta state on Zookeeper when the reported server is not actually online --- .../apache/hadoop/hbase/master/HMaster.java | 18 ++++++++--- .../hadoop/hbase/TestMetaTableAccessor.java | 32 +++++++++++++++++-- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 9d2a743dd2..8a16e0d133 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1216,31 +1216,39 @@ public class HMaster extends HRegionServer implements MasterServices { */ @VisibleForTesting public boolean waitForMetaOnline() throws InterruptedException { - return isRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO); + return waitForRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO); } /** * @return True if region is online and scannable else false if an error or shutdown (Otherwise * we just block in here holding up all forward-progess). */ - private boolean isRegionOnline(RegionInfo ri) throws InterruptedException { + private boolean waitForRegionOnline(RegionInfo ri) throws InterruptedException { RetryCounter rc = null; while (!isStopped()) { RegionState rs = this.assignmentManager.getRegionStates().getRegionState(ri); if (rs.isOpened()) { + LOG.info("{} is OPEN; state={}", ri.getRegionNameAsString(), rs); if (this.getServerManager().isServerOnline(rs.getServerName())) { return true; + } else if (this.getServerManager().isServerDead(rs.getServerName())) { + LOG.warn("{} state is OPEN, but the server {} is dead. 
Waiting for SCP to recover it.", + ri.getRegionNameAsString(), rs.getServerName()); + } else { + LOG.error("{} State is OPEN, but the server {} is not online and no SCP is scheduled. Expiring the server.", + ri.getRegionNameAsString(), rs.getServerName()); + this.getServerManager().expireServer(rs.getServerName()); } } // Region is not OPEN. - Optional> optProc = this.procedureExecutor.getProcedures(). + Optional> scpProc = this.procedureExecutor.getProcedures(). stream().filter(p -> p instanceof ServerCrashProcedure).findAny(); // TODO: Add a page to refguide on how to do repair. Have this log message point to it. // Page will talk about loss of edits, how to schedule at least the meta WAL recovery, and // then how to assign including how to break region lock if one held. LOG.warn("{} is NOT online; state={}; ServerCrashProcedures={}. Master startup cannot " + "progress, in holding-pattern until region onlined.", - ri.getRegionNameAsString(), rs, optProc.isPresent()); + ri.getRegionNameAsString(), rs, scpProc.isPresent()); // Check once-a-minute. if (rc == null) { rc = new RetryCounterFactory(1000).create(); @@ -1274,7 +1282,7 @@ public class HMaster extends HRegionServer implements MasterServices { } // Else there are namespace regions up in meta. Ensure they are assigned before we go on. 
for (RegionInfo ri : ris) { - isRegionOnline(ri); + waitForRegionOnline(ri); } return true; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java index 5582178805..e722b0a4ec 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java @@ -46,7 +46,9 @@ import org.apache.hadoop.hbase.ipc.CallRunner; import org.apache.hadoop.hbase.ipc.DelegatingRpcScheduler; import org.apache.hadoop.hbase.ipc.PriorityFunction; import org.apache.hadoop.hbase.ipc.RpcScheduler; +import org.apache.hadoop.hbase.master.DeadServer; import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.RSRpcServices; import org.apache.hadoop.hbase.regionserver.SimpleRpcSchedulerFactory; @@ -107,13 +109,13 @@ public class TestMetaTableAccessor { } @Test - public void testIsMetaWhenAllHealthy() throws InterruptedException { + public void testWaitForMetaWhenAllHealthy() throws InterruptedException { HMaster m = UTIL.getMiniHBaseCluster().getMaster(); assertTrue(m.waitForMetaOnline()); } @Test - public void testIsMetaWhenMetaGoesOffline() throws InterruptedException { + public void testWaitForMetaWhenMetaGoesOffline() throws InterruptedException { HMaster m = UTIL.getMiniHBaseCluster().getMaster(); int index = UTIL.getMiniHBaseCluster().getServerWithMeta(); HRegionServer rsWithMeta = UTIL.getMiniHBaseCluster().getRegionServer(index); @@ -121,7 +123,31 @@ public class TestMetaTableAccessor { assertTrue(m.waitForMetaOnline()); } - /** + @Test + public void testWaitForMetaWhenMetaIsOpenButMetaServerIsUnavailable() throws InterruptedException { + MiniHBaseCluster cluster = UTIL.getMiniHBaseCluster(); + HMaster master = cluster.getMaster(); 
+ int index = cluster.getServerWithMeta(); + HRegionServer rsWithMeta = cluster.getRegionServer(index); + ServerName metaServerName = rsWithMeta.getServerName(); + + // Simulate a dead meta server that the master doesn't know about. + ServerManager serverManager = cluster.getMaster().getServerManager(); + serverManager.moveFromOnlineToDeadServers(metaServerName); + DeadServer deadServers = serverManager.getDeadServers(); + deadServers.finish(metaServerName); + deadServers.removeDeadServer(metaServerName); + + // Verify that the old meta server is not listed in either online or dead. + assertFalse(serverManager.isServerDead(metaServerName)); + assertFalse(serverManager.isServerOnline(metaServerName)); + + // Verify that the master can recover the meta. + assertTrue(master.waitForMetaOnline()); + } + + + /** * Does {@link MetaTableAccessor#getRegion(Connection, byte[])} and a write * against hbase:meta while its hosted server is restarted to prove our retrying * works. -- 2.20.1.windows.1