From bbdc005ce5e63a75b3086a264a4d9158e6da74cd Mon Sep 17 00:00:00 2001 From: Ben Lau Date: Thu, 4 Oct 2018 16:51:44 -0700 Subject: [PATCH] HBASE-15911 NPE in AssignmentManager.onRegionTransition after Master restart --- .../org/apache/hadoop/hbase/master/HMaster.java | 12 +- .../master/procedure/DeleteNamespaceProcedure.java | 2 +- .../hbase/master/procedure/ProcedureSyncWait.java | 8 +- .../hadoop/hbase/quotas/MasterQuotaManager.java | 15 +- .../TestSplitTransactionOnCluster.java | 190 ++++++++++++++++++++- 5 files changed, 215 insertions(+), 12 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index c2f99f462d..05321d2b0e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -779,6 +779,10 @@ public class HMaster extends HRegionServer implements MasterServices, Server { // initialize master side coprocessors before we start handling requests status.setStatus("Initializing master coprocessors"); this.cpHost = new MasterCoprocessorHost(this, this.conf); + + // create the quota manager but don't start it until master is initialized later + quotaManager = new MasterQuotaManager(this); + this.assignmentManager.setRegionStateListener((RegionStateListener) quotaManager); // start up all service threads. status.setStatus("Initializing master service threads"); @@ -905,7 +909,7 @@ public class HMaster extends HRegionServer implements MasterServices, Server { assignmentManager.checkIfShouldMoveSystemRegionAsync(); status.setStatus("Starting quota manager"); - initQuotaManager(); + quotaManager.start(); // assign the meta replicas Set EMPTY_SET = new HashSet(); @@ -949,12 +953,6 @@ public class HMaster extends HRegionServer implements MasterServices, Server { } } - private void initQuotaManager() throws IOException { - quotaManager = new MasterQuotaManager(this); - this.assignmentManager.setRegionStateListener((RegionStateListener) quotaManager); - quotaManager.start(); - } - /** * Create a {@link ServerManager} instance. * @param master diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteNamespaceProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteNamespaceProcedure.java index a7ebc3063e..e2fb5f3c69 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteNamespaceProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/DeleteNamespaceProcedure.java @@ -368,7 +368,7 @@ public class DeleteNamespaceProcedure protected static void removeNamespaceQuota( final MasterProcedureEnv env, final String namespaceName) throws IOException { - env.getMasterServices().getMasterQuotaManager().removeNamespaceQuota(namespaceName); + ProcedureSyncWait.getMasterQuotaManager(env).removeNamespaceQuota(namespaceName); } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureSyncWait.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureSyncWait.java index 1846cef6b9..938758ed1b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureSyncWait.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ProcedureSyncWait.java @@ -193,7 +193,13 @@ public final class ProcedureSyncWait { new ProcedureSyncWait.Predicate() { @Override public MasterQuotaManager evaluate() throws IOException { - return env.getMasterServices().getMasterQuotaManager(); + MasterQuotaManager masterQuotaManager = env.getMasterServices().getMasterQuotaManager(); + if (masterQuotaManager != null) { + if (masterQuotaManager.isQuotaDisabled() || masterQuotaManager.isQuotaInitialized()) { + return masterQuotaManager; + } + } + return null; } }); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/quotas/MasterQuotaManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/quotas/MasterQuotaManager.java index f1b7ff9e36..f9ee899a44 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/quotas/MasterQuotaManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/quotas/MasterQuotaManager.java @@ -20,6 +20,7 @@ import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.NamespaceDescriptor; +import org.apache.hadoop.hbase.PleaseHoldException; import org.apache.hadoop.hbase.RegionStateListener; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.classification.InterfaceAudience; @@ -51,7 +52,7 @@ public class MasterQuotaManager implements RegionStateListener { private NamedLock namespaceLocks; private NamedLock tableLocks; private NamedLock userLocks; - private boolean initialized = false; + private volatile boolean initialized = false; private NamespaceAuditor namespaceQuotaManager; public MasterQuotaManager(final MasterServices masterServices) { @@ -81,7 +82,11 @@ public class MasterQuotaManager implements RegionStateListener { namespaceQuotaManager.start(); initialized = true; } - + + public boolean isQuotaDisabled() { + return !QuotaUtil.isQuotaEnabled(masterServices.getConfiguration()); + } + public void stop() { } @@ -348,6 +353,8 @@ public class MasterQuotaManager implements RegionStateListener { public void onRegionMerged(HRegionInfo hri) throws IOException { if (initialized) { namespaceQuotaManager.updateQuotaForRegionMerge(hri); + } else if (QuotaUtil.isQuotaEnabled(masterServices.getConfiguration())){ + throw new PleaseHoldException("Master quota manager not ready to process region merges yet"); } } @@ -355,6 +362,8 @@ public class MasterQuotaManager implements RegionStateListener { public void onRegionSplit(HRegionInfo hri) throws IOException { if (initialized) { namespaceQuotaManager.checkQuotaToSplitRegion(hri); + } else if (QuotaUtil.isQuotaEnabled(masterServices.getConfiguration())){ + throw new PleaseHoldException("Master quota manager not ready to process region merges yet"); } } @@ -523,6 +532,8 @@ public class MasterQuotaManager implements RegionStateListener { public void onRegionSplitReverted(HRegionInfo hri) throws IOException { if (initialized) { this.namespaceQuotaManager.removeRegionFromNamespaceUsage(hri); + } else if (QuotaUtil.isQuotaEnabled(masterServices.getConfiguration())){ + throw new PleaseHoldException("Master quota manager not ready to process region merges yet"); } } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java index 32700af1f8..8cc029cf07 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java @@ -33,6 +33,7 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -51,6 +52,7 @@ import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MasterNotRunningException; import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.MiniHBaseCluster; +import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer; import org.apache.hadoop.hbase.RegionTransition; import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; @@ -68,6 +70,7 @@ import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Mutation; import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.RegionLocator; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.Scan; @@ -77,18 +80,26 @@ import org.apache.hadoop.hbase.coordination.ZKSplitTransactionCoordination; import org.apache.hadoop.hbase.coordination.ZkCloseRegionCoordination; import org.apache.hadoop.hbase.coordination.ZkCoordinatedStateManager; import org.apache.hadoop.hbase.coordination.ZkOpenRegionCoordination; +import org.apache.hadoop.hbase.coprocessor.BaseMasterObserver; import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver; +import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; +import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment; import org.apache.hadoop.hbase.coprocessor.ObserverContext; import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; import org.apache.hadoop.hbase.exceptions.DeserializationException; import org.apache.hadoop.hbase.executor.EventType; import org.apache.hadoop.hbase.master.AssignmentManager; import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterRpcServices; import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.master.RegionStates; import org.apache.hadoop.hbase.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState; +import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; +import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse; +import org.apache.hadoop.hbase.quotas.QuotaUtil; +import org.apache.hadoop.hbase.regionserver.TestSplitTransactionOnCluster.MyMaster.MyMasterRpcServices; import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext; import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController; import org.apache.hadoop.hbase.testclassification.LargeTests; @@ -114,6 +125,7 @@ import org.junit.BeforeClass; import org.junit.Test; import org.junit.experimental.categories.Category; +import com.google.protobuf.RpcController; import com.google.protobuf.ServiceException; /** @@ -140,10 +152,12 @@ public class TestSplitTransactionOnCluster { new HBaseTestingUtility(); static void setupOnce() throws Exception { + TESTING_UTIL.getConfiguration().set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY, + DelayedInitializationObserver.class.getName()); TESTING_UTIL.getConfiguration().setInt(HConstants.HBASE_BALANCER_PERIOD, 60000); useZKForAssignment = TESTING_UTIL.getConfiguration().getBoolean( "hbase.assignment.usezk", true); - TESTING_UTIL.startMiniCluster(NB_SERVERS); + TESTING_UTIL.startMiniCluster(1, NB_SERVERS, null, MyMaster.class, MiniHBaseClusterRegionServer.class); } @BeforeClass @@ -1396,7 +1410,181 @@ public class TestSplitTransactionOnCluster { TESTING_UTIL.deleteTable(table); } } + + /** + * Verifies HBASE-15911. When master is starting, region splits should not cause the master to + * fail eg NPE due to uninitialized state in the AssignmentManager or elsewhere. + * @throws Exception + */ + @Test(timeout = 120000) + public void testSplitBeforeMasterInitialization() throws Exception { + if (useZKForAssignment) { + LOG.info("Skipping this test because it targets a bug that happens for READY_TO_SPLIT when " + + "quotas are enabled and READY_TO_SPLIT is a transition specific to ZKLess assignment"); + return; + } + + final TableName tableName = TableName.valueOf("testSplitBeforeMasterInitialization"); + HTable t = createTableAndWait(tableName, HConstants.CATALOG_FAMILY); + List regions = cluster.getRegions(tableName); + HRegionInfo hri = getAndCheckSingleTableRegion(regions); + + LOG.info("Waiting for master to be fully shutdown"); + ServerName masterName = TESTING_UTIL.getMiniHBaseCluster().getMaster().getServerName(); + TESTING_UTIL.getMiniHBaseCluster().stopMaster(masterName); + TESTING_UTIL.getMiniHBaseCluster().waitForMasterToStop(masterName, 15000); + + try { + LOG.info("Starting master with delayed initialization and quota support"); + DelayedInitializationObserver.enable(); + TESTING_UTIL.getMiniHBaseCluster().getConfiguration().setBoolean(QuotaUtil.QUOTA_CONF_KEY, true); + TESTING_UTIL.getMiniHBaseCluster().startMaster(); + + RegionLocator regionLocator = t.getRegionLocator(); + + int uncaughtTransitionExceptionsBefore = getUncaughtTransitionExceptions(); + + // Split a region, will not succeed, but should not bring down master either + LOG.info("Splitting region: " + hri.getEncodedName() + " ON " + + regionLocator.getRegionLocation(hri.getStartKey())); + TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY, false); + admin.splitRegion(hri.getEncodedNameAsBytes()); + + // Long enough for the split to be reported back to an uninitialized master. + Thread.sleep(5000); + + LOG.info("Resuming master initialization"); + DelayedInitializationObserver.startSignal.countDown(); + int uncaughtTransitionExceptionsAfter = getUncaughtTransitionExceptions(); + + // Wait some time and ensure that master didn't die and there is only 1 region + Thread.sleep(10000); + LOG.info("Performing checks now"); + assertEquals(1, regionLocator.getAllRegionLocations().size()); + assertTrue(TESTING_UTIL.getMiniHBaseCluster().getMaster().isInitialized()); + assertEquals(1, TESTING_UTIL.getMiniHBaseCluster().getLiveMasterThreads().size()); + assertEquals(uncaughtTransitionExceptionsBefore, uncaughtTransitionExceptionsAfter); + } finally { + DelayedInitializationObserver.reset(); + TESTING_UTIL.getMiniHBaseCluster().getConfiguration().setBoolean(QuotaUtil.QUOTA_CONF_KEY, false); + } + + LOG.info("Waiting for master to be fully shutdown"); + masterName = TESTING_UTIL.getMiniHBaseCluster().getMaster().getServerName(); + TESTING_UTIL.getMiniHBaseCluster().stopMaster(masterName); + TESTING_UTIL.getMiniHBaseCluster().waitForMasterToStop(masterName, 15000); + + /* Now test without quotas, region split should succeed even though master is not initialized + * completely and master should not die. + */ + + try { + LOG.info("Starting master with delayed initialization"); + DelayedInitializationObserver.enable(); + TESTING_UTIL.getMiniHBaseCluster().startMaster(); + + RegionLocator regionLocator = t.getRegionLocator(); + + int uncaughtTransitionExceptionsBefore = getUncaughtTransitionExceptions(); + + // Split a region, should succeed + LOG.info("Splitting region: " + hri.getEncodedName() + " ON " + + regionLocator.getRegionLocation(hri.getStartKey())); + TESTING_UTIL.loadTable(t, HConstants.CATALOG_FAMILY, false); + admin.splitRegion(hri.getEncodedNameAsBytes()); + + // Long enough for the split to be reported back to an uninitialized master. + Thread.sleep(5000); + + LOG.info("Resuming master initialization"); + DelayedInitializationObserver.startSignal.countDown(); + + int uncaughtTransitionExceptionsAfter = getUncaughtTransitionExceptions(); + + // Wait some time and ensure that master didn't die and there are 2 regions + Thread.sleep(10000); + LOG.info("Performing checks now"); + assertEquals(2, regionLocator.getAllRegionLocations().size()); + assertTrue(TESTING_UTIL.getMiniHBaseCluster().getMaster().isInitialized()); + assertEquals(1, TESTING_UTIL.getMiniHBaseCluster().getLiveMasterThreads().size()); + assertEquals(uncaughtTransitionExceptionsBefore, uncaughtTransitionExceptionsAfter); + } finally { + DelayedInitializationObserver.reset(); + } + } + + public static class DelayedInitializationObserver extends BaseMasterObserver { + private static volatile boolean enabled = false; + private static volatile CountDownLatch startSignal = new CountDownLatch(1); + + @Override + public void preMasterInitialization(ObserverContext ctx) + throws IOException { + if (!enabled) { + return; + } + LOG.info("Delaying master initialization in coproc"); + try { + startSignal.await(); + } catch (InterruptedException e) { + LOG.error("Unexpected interrupt", e); + } + LOG.info("Resuming master initialization in coproc"); + } + + public static void enable() { + enabled = true; + } + + public static void reset() { + enabled = false; + startSignal = new CountDownLatch(1); + } + } + + private int getUncaughtTransitionExceptions() { + MyMaster master = (MyMaster) TESTING_UTIL.getMiniHBaseCluster().getMaster(); + if (master == null) return 0; + MyMasterRpcServices rpcServices = ((MyMasterRpcServices) master.getMasterRpcServices()); + if (rpcServices == null) return 0; + return rpcServices.uncaughtTransitionExceptions.get(); + } + + public static class MyMaster extends HMaster { + public MyMaster(Configuration conf, CoordinatedStateManager cp) + throws IOException, KeeperException, + InterruptedException { + super(conf, cp); + } + + @Override + protected RSRpcServices createRpcServices() throws IOException { + return new MyMasterRpcServices(this); + } + + public static class MyMasterRpcServices extends MasterRpcServices { + final AtomicInteger uncaughtTransitionExceptions = new AtomicInteger(); + + public MyMasterRpcServices(HMaster m) throws IOException { + super(m); + } + + @Override + public ReportRegionStateTransitionResponse reportRegionStateTransition(RpcController c, + ReportRegionStateTransitionRequest req) throws ServiceException { + try { + return super.reportRegionStateTransition(c, req); + } catch (ServiceException e1) { + throw e1; + } catch (Exception e2) { + uncaughtTransitionExceptions.addAndGet(1); + throw e2; + } + } + } + } + public static class MockedCoordinatedStateManager extends ZkCoordinatedStateManager { public void initialize(Server server, HRegion region) { -- 2.14.3 (Apple Git-98)