diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 288b33ffea..47a342afa3 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -51,6 +51,8 @@ import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Function;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
 import javax.servlet.ServletException;
 import javax.servlet.http.HttpServlet;
 import javax.servlet.http.HttpServletRequest;
@@ -208,6 +210,7 @@ import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
 
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.ServerCrashState;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.QuotaProtos.Quotas;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.QuotaProtos.SpaceViolationPolicy;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
@@ -1088,7 +1091,41 @@ public class HMaster extends HRegionServer implements MasterServices {
    */
   @VisibleForTesting
   public boolean waitUntilMetaOnline() throws InterruptedException {
-    return isRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO);
+    RetryCounter rc = null;
+    while (!isStopped()) {
+      RegionState rs = this.assignmentManager.getRegionStates()
+          .getRegionState(RegionInfoBuilder.FIRST_META_REGIONINFO);
+      if (rs.isOpened()) {
+        if (this.getServerManager().isServerOnline(rs.getServerName())) {
+          return true;
+        }
+        // Meta is OPENED but its server is not online: if that server crashed there may be a
+        // ServerCrashProcedure for it that will transition meta.
+        boolean metaScpPresent = this.procedureExecutor.getProcedures().stream()
+            .filter(p -> p instanceof ServerCrashProcedure)
+            .map(p -> (ServerCrashProcedure) p)
+            .anyMatch(scp -> scp.hasMetaTableRegion()
+                && scp.getServerName().equals(rs.getServerName()));
+        LOG.warn(
+          "{} is NOT online; state={}; ServerCrashProcedures={}. Master startup cannot "
+              + "progress, in holding-pattern until region onlined.",
+          RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString(), rs,
+          metaScpPresent);
+        // No such SCP was found and the server is still not online, so expire it ourselves --
+        // but only if some live server is available.
+        if (!metaScpPresent && !this.getServerManager().isServerOnline(rs.getServerName())
+            && !this.getServerManager().getOnlineServers().isEmpty()) {
+          getAssignmentManager().submitServerCrash(rs.getServerName(), true);
+        }
+      }
+      // Sleep before the next check. The counter is built with 1000 (presumably milliseconds)
+      // and supplies a backoff time, so this is not a fixed once-a-minute poll.
+      if (rc == null) {
+        rc = new RetryCounterFactory(1000).create();
+      }
+      Threads.sleep(rc.getBackoffTimeAndIncrementAttempts());
+    }
+    return false;
   }
 
   /**
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
index 72bc96841b..be2fad378b 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java
@@ -473,8 +473,8 @@ public class MasterRpcServices extends RSRpcServices
         master.metricsMaster.incrementRequests(sl.getTotalNumberOfRequests()
             - (oldLoad != null ? oldLoad.getRequestCount() : 0));
       }
-    } catch (IOException ioe) {
-      throw new ServiceException(ioe);
+    } catch (IOException | KeeperException ike) {
+      throw new ServiceException(ike);
     }
     return RegionServerReportResponse.newBuilder().build();
   }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 04a25c7748..cce42e8658 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -928,9 +928,10 @@ public class AssignmentManager implements ServerListener {
    * this method will check the the online regions against the in-memory state of the AM,
    * if there is a mismatch we will try to fence out the RS with the assumption
    * that something went wrong on the RS side.
+   * @throws KeeperException if the meta location znode cannot be updated in ZooKeeper
    */
   public void reportOnlineRegions(final ServerName serverName, final Set<byte[]> regionNames)
-      throws YouAreDeadException {
+      throws YouAreDeadException, KeeperException {
     if (!isRunning()) return;
     if (LOG.isTraceEnabled()) {
       LOG.trace("ReportOnlineRegions " + serverName + " regionCount=" + regionNames.size() +
@@ -963,7 +964,8 @@ public class AssignmentManager implements ServerListener {
     wakeServerReportEvent(serverNode);
   }
 
-  void checkOnlineRegionsReportForMeta(final ServerName serverName, final Set<byte[]> regionNames) {
+  void checkOnlineRegionsReportForMeta(final ServerName serverName, final Set<byte[]> regionNames)
+      throws KeeperException {
     try {
       for (byte[] regionName: regionNames) {
         final RegionInfo hri = getMetaRegionFromName(regionName);
@@ -980,6 +982,10 @@ public class AssignmentManager implements ServerListener {
         if (!reportTransition(regionNode, serverName, TransitionCode.OPENED, 0)) {
           LOG.warn("META REPORTED but no procedure found (complete?); set location={}", serverName);
           regionNode.setRegionLocation(serverName);
+
+          // Keep the meta location znode in step with the in-memory location set above.
+          MetaTableLocator.setMetaLocation(master.getZooKeeper(), serverName,
+            regionNode.getState());
         } else if (LOG.isTraceEnabled()) {
           LOG.trace("META REPORTED: " + regionNode);
         }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java
index b5f5bac0d8..c772320ebb 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestMetaTableAccessor.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hbase;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
@@ -47,6 +48,9 @@ import org.apache.hadoop.hbase.ipc.DelegatingRpcScheduler;
 import org.apache.hadoop.hbase.ipc.PriorityFunction;
 import org.apache.hadoop.hbase.ipc.RpcScheduler;
 import org.apache.hadoop.hbase.master.HMaster;
+import org.apache.hadoop.hbase.master.RegionState;
+import org.apache.hadoop.hbase.master.assignment.RegionStates;
+import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
 import org.apache.hadoop.hbase.regionserver.HRegionServer;
 import org.apache.hadoop.hbase.regionserver.RSRpcServices;
 import org.apache.hadoop.hbase.regionserver.SimpleRpcSchedulerFactory;
@@ -120,6 +124,21 @@ public class TestMetaTableAccessor {
     assertTrue(m.waitUntilMetaOnline());
   }
 
+  /**
+   * The master should recover when meta is recorded as OPEN on a server that is long gone and
+   * has no ServerCrashProcedure. Simulated here by pointing the master's in-memory meta
+   * location at a dummy server.
+   */
+  @Test
+  public void testMetaZnodeWithNonExistentServer() throws Exception {
+    HMaster m = UTIL.getMiniHBaseCluster().getMaster();
+    RegionStateNode metaRegionStateNode = m.getAssignmentManager().getRegionStates()
+        .getRegionStateNode(RegionInfoBuilder.FIRST_META_REGIONINFO);
+    assertEquals("Wrong state for meta!", RegionState.State.OPEN, metaRegionStateNode.getState());
+    metaRegionStateNode.setRegionLocation(ServerName.valueOf("dummy", 1, 1));
+    assertTrue(m.waitUntilMetaOnline());
+  }
+
   /**
    * Does {@link MetaTableAccessor#getRegion(Connection, byte[])} and a write
    * against hbase:meta while its hosted server is restarted to prove our retrying
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
index ec6b82ef77..ab0188eebd 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/MockMasterServices.java
@@ -130,7 +130,7 @@ public class MockMasterServices extends MockNoopMasterServices {
         try {
           getAssignmentManager().reportOnlineRegions(serverName,
             regions == null? new HashSet(): regions);
-        } catch (YouAreDeadException e) {
+        } catch (YouAreDeadException | KeeperException e) {
           throw new RuntimeException(e);
         }
         return super.waitServerReportEvent(serverName, proc);