From 484f019e46b23c3a9a95b9109da1bdc3905e83a6 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Sat, 30 Jun 2018 21:58:44 -0400 Subject: [PATCH] Test --- .../org/apache/hadoop/hbase/master/HMaster.java | 15 ++ .../hadoop/hbase/master/MasterMetaBootstrap.java | 13 +- .../apache/hadoop/hbase/master/ServerManager.java | 5 +- .../hbase/master/assignment/AssignProcedure.java | 2 +- .../master/procedure/ServerCrashProcedure.java | 4 +- .../hadoop/hbase/regionserver/HRegionServer.java | 14 +- .../assignment/TestLateFailedRPCPostSCP.java | 195 +++++++++++++++++++++ 7 files changed, 227 insertions(+), 21 deletions(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestLateFailedRPCPostSCP.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 7ad77652db..a0a33891fc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -2845,6 +2845,9 @@ public class HMaster extends HRegionServer implements MasterServices { return serverCrashProcessingEnabled.isReady(); } + /** + * @see #enableCrashedServerProcessing() + */ @VisibleForTesting public void setServerCrashProcessingEnabled(final boolean b) { procedureExecutor.getEnvironment().setEventReady(serverCrashProcessingEnabled, b); @@ -2854,6 +2857,18 @@ public class HMaster extends HRegionServer implements MasterServices { return serverCrashProcessingEnabled; } + @VisibleForTesting + public void enableCrashedServerProcessing() throws InterruptedException { + // If crashed server processing is disabled, we enable it and expire those dead but not expired + // servers. This is required so that if meta is assigning to a server which dies after + // assignMeta starts assignment, ServerCrashProcedure can re-assign it. 
Otherwise, we will be + // stuck here waiting forever if waitForMeta is specified. + if (!isServerCrashProcessingEnabled()) { + setServerCrashProcessingEnabled(true); + getServerManager().processQueuedDeadServers(); + } + } + /** * Compute the average load across all region servers. * Currently, this uses a very naive computation - just uses the number of diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java index f1408271d9..0df3ac822d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java @@ -54,7 +54,7 @@ public class MasterMetaBootstrap { // Now we can start the TableStateManager. It is backed by hbase:meta. master.getTableStateManager().start(); // Enable server crash procedure handling - enableCrashedServerProcessing(); + master.enableCrashedServerProcessing(); } public void processDeadServers() { @@ -137,15 +137,4 @@ public class MasterMetaBootstrap { LOG.warn("Ignoring exception " + ex); } } - - private void enableCrashedServerProcessing() throws InterruptedException { - // If crashed server processing is disabled, we enable it and expire those dead but not expired - // servers. This is required so that if meta is assigning to a server which dies after - // assignMeta starts assignment, ServerCrashProcedure can re-assign it. Otherwise, we will be - // stuck here waiting forever if waitForMeta is specified. 
- if (!master.isServerCrashProcessingEnabled()) { - master.setServerCrashProcessingEnabled(true); - master.getServerManager().processQueuedDeadServers(); - } - } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index b6bc1a15f6..59b91adeff 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -574,14 +574,13 @@ public class ServerManager { } // No SCP handling during startup. if (!master.isServerCrashProcessingEnabled()) { - LOG.info("Master doesn't enable ServerShutdownHandler during initialization, " - + "delay expiring server " + serverName); + LOG.info("ServerShutdownHandler disabled; queuing processing of {}", serverName); // Even though we delay expire of this server, we still need to handle Meta's RIT // that are against the crashed server; since when we do RecoverMetaProcedure, // the SCP is not enabled yet and Meta's RIT may be suspend forever. See HBase-19287 master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName); this.queuedDeadServers.add(serverName); - // Return true because though on SCP queued, there will be one queued later. + // Return true because though no SCP queued, there will be one queued later. 
return true; } if (this.deadservers.isDeadServer(serverName)) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index 86f0a3ff59..0b57ffd0d1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -210,7 +210,7 @@ public class AssignProcedure extends RegionTransitionProcedure { regionNode.setRegionLocation(lastRegionLocation); } else if (regionNode.getLastHost() != null) { retain = true; - LOG.info("Setting lastHost as the region location " + regionNode.getLastHost()); + LOG.info("Setting null lastHost as {}", regionNode.getLastHost()); regionNode.setRegionLocation(regionNode.getLastHost()); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 5a4c10fabc..4fb64bf9ef 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -402,8 +402,8 @@ public class ServerCrashProcedure continue; } if (!rtpServerName.equals(this.serverName)) continue; - LOG.info("pid=" + getProcId() + " found RIT " + rtp + "; " + - rtp.getRegionState(env).toShortString()); + LOG.info("pid={} found RIT against crashed server {}; interrupting {}", getProcId(), + rtp, rtp.getRegionState(env).toShortString()); // Notify RIT on server crash. 
if (sce == null) { sce = new ServerCrashException(getProcId(), getServerName()); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 759b51ec87..f11b7ed568 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -1564,7 +1564,14 @@ public class HRegionServer extends HasThread implements } private void deleteMyEphemeralNode() throws KeeperException { - ZKUtil.deleteNode(this.zooKeeper, getMyEphemeralNodePath()); + Configuration c = getConfiguration(); + // For testing, you may want to kill server but not delete the ephemeral znode on the way out; + // set the configuration hbase.delete.ephemeral.znode to false if this is the case. + if (c == null || c.getBoolean("hbase.delete.ephemeral.znode", true)) { + ZKUtil.deleteNode(this.zooKeeper, getMyEphemeralNodePath()); + } else { + LOG.info("Not deleting ephemeral znode {}", getMyEphemeralNodePath()); + } } @Override @@ -2361,9 +2368,10 @@ public class HRegionServer extends HasThread implements } /* - * Simulate a kill -9 of this server. Exits w/o closing regions or cleaninup + * Simulate a kill -9 of this server. Exits w/o closing regions or cleaning-up logs but it does close socket in case want to bring up server on old - hostname+port immediately. + hostname+port immediately. Also removes znode on the way out UNLESS you set + hbase.delete.ephemeral.znode=false in the regionserver Configuration. 
*/ @VisibleForTesting protected void kill() { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestLateFailedRPCPostSCP.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestLateFailedRPCPostSCP.java new file mode 100644 index 0000000000..991726afaa --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestLateFailedRPCPostSCP.java @@ -0,0 +1,195 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.master.assignment; + +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseIOException; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.MiniHBaseCluster; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.AsyncAdmin; +import org.apache.hadoop.hbase.client.AsyncConnection; +import org.apache.hadoop.hbase.client.ConnectionFactory; +import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; +import org.apache.hadoop.hbase.coprocessor.ObserverContext; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; +import org.apache.hadoop.hbase.coprocessor.RegionObserver; +import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.apache.hadoop.hbase.util.Threads; +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicBoolean; + +import static junit.framework.TestCase.fail; + +/** + * Test case where SCP has cleaned up a RS but then a hung RPC against the crashed server times out + * and tries to do failure recovery on its 
Assign (where SCP has already done the cleanup). Ongoing + * RPCs are not interrupted. TODO. See + * HBASE-20796 STUCK RIT though region successfully assigned + */ +@Category({MasterTests.class, LargeTests.class}) +public class TestLateFailedRPCPostSCP { + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestLateFailedRPCPostSCP.class); + + private static final Logger LOG = + LoggerFactory.getLogger(TestLateFailedRPCPostSCP.class); + + @Rule + public TestName name = new TestName(); + + public static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); + + + @BeforeClass + public static void beforeClass() { + } + + @Before + public void before() throws Exception { + TEST_UTIL.getConfiguration().set(CoprocessorHost.REGION_COPROCESSOR_CONF_KEY, + HaltOpenRegionCoprocessor.class.getName()); + int timeout = 15 * 1000; + TEST_UTIL.getConfiguration().setInt(HConstants.ZK_SESSION_TIMEOUT, timeout); + TEST_UTIL.getConfiguration().setInt(RemoteProcedureDispatcher.DISPATCH_DELAY_CONF_KEY, + timeout * 3); + // Have more than two servers so when we kill one, there is more than one left; when just one + // server left, we skip assign logic and won't try to do retain; i.e. RPC the server we have + // on hold. + TEST_UTIL.startMiniCluster(1, 3); + } + + @After + public void after() throws Exception { + TEST_UTIL.shutdownMiniCluster(); + } + + @Test + public void test() throws Exception { + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + + // Create a table with regions + final TableName tableName = TableName.valueOf(name.getMethodName()); + byte[] family = Bytes.toBytes("family"); + TEST_UTIL.createMultiRegionTable(tableName, family, 10); + TEST_UTIL.getAdmin().disableTable(tableName); + HaltOpenRegionCoprocessor.HALT_OPEN.set(true); + HRegionServer hrs = getNotMetaRegionServer(cluster); + // Use async to start enable so we get control back immediately rather than block until done. 
+ try (AsyncConnection connection = + ConnectionFactory.createAsyncConnection(TEST_UTIL.getConfiguration()).get()) { + AsyncAdmin aa = connection.getAdmin(); + aa.enableTable(tableName); + Threads.sleep(1000); + AssignProcedure stuckAssignProcedure = null; + for (long pid: cluster.getMaster().getMasterProcedureExecutor().getActiveProcIds()) { + Procedure p = cluster.getMaster().getMasterProcedureExecutor().getProcedure(pid); + if (p instanceof AssignProcedure) { + AssignProcedure ap = (AssignProcedure)p; + ServerName sn = + ap.getServer(cluster.getMaster().getMasterProcedureExecutor().getEnvironment()); + if (hrs.getServerName().equals(sn) && + cluster.getMaster().getAssignmentManager().getRegionStates(). + isRegionInState(ap.getRegionInfo(), RegionState.State.OPENING)) { + stuckAssignProcedure = ap; + break; + } + } + } + if (stuckAssignProcedure == null) { + fail(); + } + // Expire server but leave it up so RPC stays stuck against it. + cluster.killRegionServer(hrs.getServerName()); + Threads.sleep(1000); + HaltOpenRegionCoprocessor.HALT_OPEN.set(false); + LOG.info("Send remoteCallFailed {}", stuckAssignProcedure); + stuckAssignProcedure.remoteCallFailed(cluster.getMaster().getMasterProcedureExecutor().getEnvironment(), hrs.getServerName(), new HBaseIOException("Test")); + for (int i = 0; i < 100000; i++) { + Threads.sleep(100); + } + int x = 0; + } + } + + private static HRegionServer getNotMetaRegionServer(MiniHBaseCluster cluster) { + ServerName masterServerName = cluster.getMaster().getServerName(); + ServerName regionServerName = null; + int index = cluster.getServerWithMeta(); + ServerName metaRegionServerName = cluster.getRegionServer(index).getServerName(); + for (JVMClusterUtil.RegionServerThread rst: cluster.getLiveRegionServerThreads()) { + ServerName sn = rst.getRegionServer().getServerName(); + if (sn.equals(masterServerName)) { + continue; + } + if (sn.equals(metaRegionServerName)) { + continue; + } + regionServerName = sn; + break; + } + if 
(regionServerName == null) { fail(); } return cluster.getRegionServer(regionServerName); } /** * Coprocessor that will halt all region opens after we throw the static HALT boolean switch. * Good for hanging Master RPC'ing to a RegionServer. */ public static class HaltOpenRegionCoprocessor implements RegionCoprocessor, RegionObserver { public static AtomicBoolean HALT_OPEN = new AtomicBoolean(false); @Override public Optional getRegionObserver() { return Optional.of(new RegionObserver() { @Override public void preOpen(ObserverContext c) throws IOException { while (HALT_OPEN.get()) { Threads.sleep(100); } } }); } } } -- 2.16.3