From 83b76d119890d0c0d86f7187206b1d38045d4e6d Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Mon, 22 Jan 2018 14:44:16 -0800 Subject: [PATCH] HBASE-19838 Can not shutdown backup master cleanly when it has already tried to become the active master On Master@shutdown, close the shared Master connection to kill any ongoing RPCs by hosted clients. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Call close ont the Master shared clusterconnection to kill any ongoing rpcs. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Remove guts of close; we were closing the Masters connection....not our responsibility. Added unit test written by Duo Zhang which demonstrates the case where Master will not go down. --- .../org/apache/hadoop/hbase/master/HMaster.java | 21 +++- .../apache/hadoop/hbase/master/ServerManager.java | 16 +-- .../hadoop/hbase/regionserver/HRegionServer.java | 3 + .../hbase/master/TestShutdownBackupMaster.java | 108 +++++++++++++++++++++ 4 files changed, 132 insertions(+), 16 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 09b18bcaa5..88d85966a7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -2605,7 +2605,7 @@ public class HMaster extends HRegionServer implements MasterServices { } @Override - public void abort(final String msg, final Throwable t) { + public void abort(String reason, Throwable cause) { if (isAborted() || isStopped()) { return; } @@ -2614,8 +2614,9 @@ public class HMaster extends HRegionServer implements MasterServices { LOG.error(HBaseMarkers.FATAL, "Master server abort: loaded coprocessors are: " + getLoadedCoprocessors()); } - if (t != null) { - LOG.error(HBaseMarkers.FATAL, msg, t); + String msg = "***** ABORTING master " + this + ": " + reason + " *****"; + if (cause != null) { + LOG.error(HBaseMarkers.FATAL, msg, cause); } else { LOG.error(HBaseMarkers.FATAL, msg); } @@ -2666,14 +2667,19 @@ public class HMaster extends HRegionServer implements MasterServices { return rsFatals; } + /** + * Shutdown the cluster. + * Master runs a coordinated stop of all RegionServers and then itself. + */ public void shutdown() throws IOException { if (cpHost != null) { cpHost.preShutdown(); } - + // Tell the servermanager cluster is down. if (this.serverManager != null) { this.serverManager.shutdownCluster(); } + // Set the cluster down flag; broadcast across the cluster. if (this.clusterStatusTracker != null){ try { this.clusterStatusTracker.setClusterDown(); @@ -2681,6 +2687,13 @@ public class HMaster extends HRegionServer implements MasterServices { LOG.error("ZooKeeper exception trying to set cluster as down in ZK", e); } } + // Shutdown our cluster connection. This will kill any hosted RPCs that might be going on; + // this is what we want especially if the Master is in startup phase doing call outs to + // hbase:meta, etc. when cluster is down. Without ths connection close, we'd have to wait on + // the rpc to timeout. + if (this.clusterConnection != null) { + this.clusterConnection.close(); + } } public void stopMaster() throws IOException { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 3db60335d8..3c84bfba5c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -200,10 +200,8 @@ public class ServerManager { Configuration c = master.getConfiguration(); maxSkew = c.getLong("hbase.master.maxclockskew", 30000); warningSkew = c.getLong("hbase.master.warningclockskew", 10000); - this.connection = connect ? master.getClusterConnection() : null; - this.rpcControllerFactory = this.connection == null - ? null - : connection.getRpcControllerFactory(); + this.connection = connect? master.getClusterConnection(): null; + this.rpcControllerFactory = this.connection == null? null: connection.getRpcControllerFactory(); } /** @@ -968,16 +966,10 @@ public class ServerManager { } /** - * Stop the ServerManager. Currently closes the connection to the master. + * Stop the ServerManager. */ public void stop() { - if (connection != null) { - try { - connection.close(); - } catch (IOException e) { - LOG.error("Attempt to close connection to master failed", e); - } - } + // Nothing to do. } /** diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 516a77ea87..537579dd5d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -251,6 +251,9 @@ public class HRegionServer extends HasThread implements * Cluster connection to be shared by services. * Initialized at server startup and closed when server shuts down. * Clients must never close it explicitly. + * Clients hosted by this Server should make use of this clusterConnection rather than create + * their own; if they create their own, there is no way for the hosting server to shutdown + * ongoing client RPCs. */ protected ClusterConnection clusterConnection; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java new file mode 100644 index 0000000000..02d7f2f8aa --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestShutdownBackupMaster.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master; + +import static org.junit.Assert.assertNotNull; + +import java.io.IOException; +import java.util.concurrent.CountDownLatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.CategoryBasedTimeout; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.MiniHBaseCluster; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; +import org.apache.zookeeper.KeeperException; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.TestRule; + +/** + * Test to confirm that we will not hang when stop a backup master which is trying to become the + * active master. See HBASE-19838 + */ +@Category({ MasterTests.class, MediumTests.class }) +public class TestShutdownBackupMaster { + @Rule public final TestRule timeout = CategoryBasedTimeout.builder().withTimeout(this.getClass()). + withLookingForStuckThread(true).build(); + + private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + private static volatile CountDownLatch ARRIVE; + + private static volatile CountDownLatch CONTINUE; + + public static final class MockHMaster extends HMaster { + + public MockHMaster(Configuration conf) throws IOException, KeeperException { + super(conf); + } + + @Override + void initClusterSchemaService() throws IOException, InterruptedException { + if (ARRIVE != null) { + ARRIVE.countDown(); + CONTINUE.await(); + } + super.initClusterSchemaService(); + } + } + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + UTIL.getConfiguration().setClass(HConstants.MASTER_IMPL, MockHMaster.class, HMaster.class); + UTIL.startMiniCluster(2, 2); + UTIL.waitUntilAllSystemRegionsAssigned(); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + // make sure that we can stop the cluster cleanly + UTIL.shutdownMiniCluster(); + } + + @Test + public void testShutdownWhileBecomingActive() throws InterruptedException { + MiniHBaseCluster cluster = UTIL.getHBaseCluster(); + HMaster activeMaster = null; + HMaster backupMaster = null; + for (MasterThread t : cluster.getMasterThreads()) { + if (t.getMaster().isActiveMaster()) { + activeMaster = t.getMaster(); + } else { + backupMaster = t.getMaster(); + } + } + assertNotNull(activeMaster); + assertNotNull(backupMaster); + ARRIVE = new CountDownLatch(1); + CONTINUE = new CountDownLatch(1); + activeMaster.abort("Aborting active master for test"); + // wait until we arrive the initClusterSchemaService + ARRIVE.await(); + // killall RSes + cluster.getRegionServerThreads().stream().map(t -> t.getRegionServer()) + .forEachOrdered(rs -> rs.abort("Aborting RS for test")); + CONTINUE.countDown(); + } +} -- 2.11.0 (Apple Git-81)