From 6ed809758908e1d0815bc04b21aefbf55f6cdc68 Mon Sep 17 00:00:00 2001
From: Guanghao Zhang
Date: Fri, 19 Oct 2018 19:34:04 +0800
Subject: [PATCH] HBASE-21325 Force to terminate regionserver when abort hang in somewhere

---
 .../hadoop/hbase/regionserver/HRegionServer.java   | 20 ++++++
 .../hbase/replication/SyncReplicationTestBase.java |  1 +
 .../TestSyncReplicationActiveKillRS.java           | 72 ++++++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSyncReplicationActiveKillRS.java

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 7adf58e..ee15fb8 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -38,6 +38,8 @@ import java.util.Map.Entry;
 import java.util.Objects;
 import java.util.Set;
 import java.util.SortedMap;
+import java.util.Timer;
+import java.util.TimerTask;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
@@ -316,6 +318,9 @@ public class HRegionServer extends HasThread implements
   // Go down hard. Used if file system becomes unavailable and also in
   // debugging and unit tests.
   private volatile boolean abortRequested;
+  public static final String ABORT_TIMEOUT = "hbase.regionserver.abort.timeout";
+  // Default for ABORT_TIMEOUT in milliseconds: 600 seconds, deliberately generous for safety.
+  private static final long DEFAULT_ABORT_TIMEOUT = 600000;
 
   ConcurrentMap rowlocks = new ConcurrentHashMap<>();
 
@@ -1026,6 +1031,21 @@ public class HRegionServer extends HasThread implements
         abort(prefix + t.getMessage(), t);
       }
     }
+
+    if (abortRequested) {
+      // Abort requested: arm a daemon timer that force-terminates the JVM if shutdown hangs.
+      // NOTE(review): the timer is not cancelled in this hunk; confirm it cannot fire late.
+      Timer abortMonitor = new Timer("Abort regionserver monitor", true);
+      abortMonitor.schedule(new TimerTask() {
+        @Override
+        public void run() {
+          LOG.warn("Aborting region server timed out, terminate forcibly...");
+          // halt() skips shutdown hooks, which could block; non-zero status flags the failure.
+          Runtime.getRuntime().halt(1);
+        }
+      }, conf.getLong(ABORT_TIMEOUT, DEFAULT_ABORT_TIMEOUT));
+    }
+
     if (this.leases != null) {
       this.leases.closeAfterLeasesExpire();
     }
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/SyncReplicationTestBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/SyncReplicationTestBase.java
index 1b52354..62b5b3d 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/SyncReplicationTestBase.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/SyncReplicationTestBase.java
@@ -96,6 +96,7 @@ public class SyncReplicationTestBase {
     conf.setInt("replication.source.maxretriesmultiplier", 10);
     conf.setFloat("replication.source.ratio", 1.0f);
     conf.setBoolean("replication.source.eof.autorecovery", true);
+    conf.setLong(HRegionServer.ABORT_TIMEOUT, 60000);
   }
 
   @BeforeClass
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSyncReplicationActiveKillRS.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSyncReplicationActiveKillRS.java
new file mode 100644
index 0000000..3206c9d
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestSyncReplicationActiveKillRS.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.replication;
+
+import static org.junit.Assert.fail;
+
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.ReplicationTests;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Checks that a region server of the local (ACTIVE) cluster stops within the timeout when
+ * aborted after the remote cluster has been transited to DOWNGRADE_ACTIVE.
+ */
+@Category({ ReplicationTests.class, LargeTests.class })
+public class TestSyncReplicationActiveKillRS extends SyncReplicationTestBase {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestSyncReplicationActiveKillRS.class);
+
+  private static final int TIMEOUT_IN_SECONDS = 120;
+
+  @Test
+  public void testActiveKillRS() throws Exception {
+    UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
+      SyncReplicationState.STANDBY);
+    UTIL1.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
+      SyncReplicationState.ACTIVE);
+
+    writeAndVerifyReplication(UTIL1, UTIL2, 0, 100);
+    UTIL1.getMiniHBaseCluster().getRegionServer(0).stop("Stop RS for test");
+    writeAndVerifyReplication(UTIL1, UTIL2, 100, 200);
+
+    // disable async replication
+    UTIL1.getAdmin().disableReplicationPeer(PEER_ID);
+    UTIL2.getAdmin().disableReplicationPeer(PEER_ID);
+
+    // transit remote cluster to DA and kill one rs in local cluster
+    UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
+      SyncReplicationState.DOWNGRADE_ACTIVE);
+    UTIL1.getMiniHBaseCluster().getRegionServer(1).abort("Abort RS for test");
+
+    // Poll once per second until only one live region server thread remains.
+    long startTime = System.currentTimeMillis();
+    while (System.currentTimeMillis() - startTime < TIMEOUT_IN_SECONDS * 1000) {
+      if (UTIL1.getMiniHBaseCluster().getLiveRegionServerThreads().size() == 1) {
+        return;
+      }
+      Thread.sleep(1000);
+    }
+    fail("Failed to abort a region server in " + TIMEOUT_IN_SECONDS + " seconds");
+  }
+}
-- 
2.7.4