From 6870dc59b9771bc95f0e51ef4d727e9e99626fb5 Mon Sep 17 00:00:00 2001 From: Wellington Chevreuil Date: Tue, 20 Aug 2019 11:46:59 -0300 Subject: [PATCH] HBASE-22631 assign failed may make gced parent region appear again --- .../master/assignment/AssignProcedure.java | 2 +- .../TestAssignProcedureFailures.java | 205 ++++++++++++++++++ 2 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignProcedureFailures.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java index f2b967afa7..ae62d0678e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignProcedure.java @@ -362,10 +362,10 @@ public class AssignProcedure extends RegionTransitionProcedure { } this.forceNewPlan = true; this.targetServer = null; - regionNode.offline(); // We were moved to OPENING state before dispatch. Undo. It is safe to call // this method because it checks for OPENING first. env.getAssignmentManager().undoRegionAsOpening(regionNode); + regionNode.offline(); setTransitionState(RegionTransitionState.REGION_TRANSITION_QUEUE); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignProcedureFailures.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignProcedureFailures.java new file mode 100644 index 0000000000..8e36e69cba --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignProcedureFailures.java @@ -0,0 +1,205 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.assignment; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.client.ConnectionFactory; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.coprocessor.ObserverContext; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; +import org.apache.hadoop.hbase.coprocessor.RegionObserver; +import org.apache.hadoop.hbase.master.CatalogJanitor; +import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Optional; + +/** + * Integration test for condition described on HBASE-22631. It forces an assignment failure, + * so that AssingmentManager.RegionStates.serverMap two RS entries will both have references for + * the given region. Then, sub-sequent actions are: + * 1) Succeed assignment for the region; + * 2) Kill RS other than the one where the region is online; + * 3) Split the region; + * 4) Force a CatalogJanitor run to submit a GCRegionProcedure; + * 5) Restart previously killed RS, wait for it to come online; + * 6) Wait for the killed RS SCP to complete success + * 7) Check that AssignmentManager.RegionStates.serverMap does not have any entry for the killed RS, + * since the SCP completed success + * 8) Check the region is either removed from AssignmentManager.RegionStates.regionsMap, or is in + * SPLIT state. + * + */ +@Category({ MasterTests.class, MediumTests.class }) +public class TestAssignProcedureFailures { + + @ClassRule public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestAssignProcedureFailures.class); + + @Rule + public TestName name = new TestName(); + + private static final Logger LOG = LoggerFactory.getLogger(TestAssignProcedureFailures.class); + + protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + private static AssignFailureTestObserver TEST_OBSERVER = new AssignFailureTestObserver(); + + @Test + public void testFailAssignThenSplit() throws Exception { + //setting our mocked co-processor which allows to simulate assignment failures + UTIL.getConfiguration().set(RegionCoprocessorHost.USER_REGION_COPROCESSOR_CONF_KEY, + AssignFailureTestCoprocessor.class.getName()); + UTIL.startMiniCluster(2); + TableName tableName = TableName.valueOf(name.getMethodName()); + byte[] family = Bytes.toBytes("f"); + Table table = UTIL.createTable(tableName, family); + final HRegion region = UTIL.getHBaseCluster().getRegions(tableName).get(0); + LOG.info("Starting tests for condition detected on HBASE-22631. Our target region: {}", + region.getRegionInfo().getEncodedName()); + UTIL.unassignRegion(region.getRegionInfo().getEncodedName()); + UTIL.waitUntilNoRegionsInTransition(); + //makes sure the region got closed, before proceeding to open it + RegionStates regionStates = + UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates(); + assertEquals(RegionState.State.CLOSED, + regionStates.getRegionState(region.getRegionInfo()).getState()); + //the CP flag that will force assignments to fail + TEST_OBSERVER.FAIL_OPENING = true; + //reducing timeouts and number of retries to expedite test execution + Configuration clientConfig = UTIL.getConfiguration(); + clientConfig.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 500); + clientConfig.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 5); + Admin admin = ConnectionFactory.createConnection(clientConfig).getAdmin(); + try { + admin.assign(region.getRegionInfo().getEncodedNameAsBytes()); + } catch (Exception e) { + System.out.println(e.getMessage()); + } + regionStates = + UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates(); + + LOG.info("Passed point of failed assign attempts. Region State: {}", + regionStates.getRegionState(region.getRegionInfo()).getState().name()); + //now we gave up on previous assignment attempts, + //change flag to allow assignments flow naturally + TEST_OBSERVER.FAIL_OPENING = false; + clientConfig.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 60000); + clientConfig.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 15); + admin = ConnectionFactory.createConnection(clientConfig).getAdmin(); + admin.assign(region.getRegionInfo().getEncodedNameAsBytes()); + UTIL.waitUntilAllRegionsAssigned(tableName); + //region should had got assigned, now assert the region is really OPEN + regionStates = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates(); + RegionState regionState = regionStates.getRegionState(region.getRegionInfo()); + assertEquals(RegionState.State.OPEN, regionState.getState()); + //Save the name for the RS holding the region + ServerName serverNameRegionOnline = regionState.getServerName(); + //split the original region + UTIL.getHBaseCluster().getMaster() + .splitRegion(region.getRegionInfo(), Bytes.toBytes("bbb"), HConstants.NO_NONCE, + HConstants.NO_NONCE); + //wait a little to give a chance for the split completion + Thread.sleep(400); + //get the RS reference + HRegionServer rs = UTIL.getHBaseCluster().getRegionServer(serverNameRegionOnline); + //we are interested on the other RS, who didn't get the region assigned successfully. + //If RegionStates.serverMap still has the region associated to this RS, it should not cause + //region to become online again. + HRegionServer otherRs = UTIL.getOtherRegionServer(rs); + //Next step is to force and wait for CatalogJanitor to GC the split region + long testTime = System.currentTimeMillis(); + UTIL.getHBaseCluster().getMaster().getCatalogJanitor().triggerNow(); + CatalogJanitor.Report report = + UTIL.getHBaseCluster().getMaster().getCatalogJanitor().getLastReport(); + while (report.getCreateTime() < testTime) { + Thread.sleep(100); + report = UTIL.getHBaseCluster().getMaster().getCatalogJanitor().getLastReport(); + } + LOG.info("continuing after CatalogJanitor run has completed..."); + //at this point, split region should had been collected. Restart the RS. + //Killing the other RS + UTIL.getHBaseCluster().killRegionServer(otherRs.getServerName()); + Thread.sleep(500); + //lets wait for this RS SCP to get completed + Optional> optionalProcedure = UTIL.getHBaseCluster().getMaster().getProcedures() + .stream().filter( p -> p.getProcName().indexOf("ServerCrashProcedure")>=0).findFirst(); + if(optionalProcedure.isPresent()){ + Procedure proc = optionalProcedure.get(); + while(!proc.getState().equals(ProcedureProtos.ProcedureState.SUCCESS)){ + Thread.sleep(100); + } + } + UTIL.getHBaseCluster() + .startRegionServer(otherRs.getServerName().getHostname(), otherRs.getServerName().getPort()); + UTIL.getHBaseCluster().getRegionServer(otherRs.getServerName()).waitForServerOnline(); + //collect a "fresh" RegionStates to go over expected regions states + regionStates = UTIL.getMiniHBaseCluster().getMaster().getAssignmentManager().getRegionStates(); + //Once the SCP has succeeded, there shouldn't be any entry for otherRS + assertNull(regionStates.getServerNode(otherRs.getServerName())); + //If the region is still on RegionStates.regionsMap, it should be in SPLIT state + assertTrue(regionStates.getRegionState(region.getRegionInfo()) == null || + regionStates.getRegionState(region.getRegionInfo()) + .getState().equals(RegionState.State.SPLIT)); + } + + public static class AssignFailureTestCoprocessor implements RegionCoprocessor { + + @Override public Optional getRegionObserver() { + return Optional.of(TEST_OBSERVER); + } + + } + + public static class AssignFailureTestObserver implements RegionObserver { + + public static boolean FAIL_OPENING = false; + + @Override public void preOpen(ObserverContext c) + throws IOException { + if (FAIL_OPENING) { + throw new IOException( + "Missing table descriptor for" + c.getEnvironment().getRegionInfo().getEncodedName()); + } + } + } + +} -- 2.17.2 (Apple Git-113)