From 8c52684319ef9812f1f6d0a6a690f989ccc010f3 Mon Sep 17 00:00:00 2001
From: Wellington Chevreuil
Date: Mon, 4 Feb 2019 10:47:08 +0000
Subject: [PATCH] HBASE-21843 - Fix scenario where region is not assigned because no SCP is submitted for RS assigned to the region in META.

Change-Id: I724ae70ac5fa50d362034ac9855e062ba1fd11b5
---
 .../master/assignment/AssignmentManager.java  | 18 ++++++
 .../assignment/TestAssignmentManager.java     | 58 +++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
index 85fffc340b..f814a64a47 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
@@ -1291,6 +1291,24 @@ public class AssignmentManager {
         localState = State.OFFLINE;
       }
 
+      // The server recorded in meta for this region may no longer be online (it crashed or was
+      // restarted). If it also has no WAL dir, treat the region as OFFLINE; per HBASE-21843 no
+      // SCP gets submitted for such a server, so nothing else would reassign the region.
+      boolean isServerOfflineOrRestarted =
+        !AssignmentManager.this.getMaster().getServerManager().isServerOnline(regionLocation);
+      if (isServerOfflineOrRestarted) {
+        try {
+          boolean isServerRecoverable = AssignmentManager.this.getMaster()
+            .getMasterWalManager().getLiveServersFromWALDir().contains(regionLocation);
+          if (!isServerRecoverable) {
+            localState = State.OFFLINE;
+          }
+        } catch (IOException e) {
+          // Keep the state read from meta, but log the cause so the failure can be diagnosed.
+          LOG.warn("Error when trying to load list of RSes WAL dirs to check for " +
+            "recoverable dead servers.", e);
+        }
+      }
       RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionInfo);
       // Do not need to lock on regionNode, as we can make sure that before we finish loading
       // meta, all the related procedures can not be executed. The only exception is for meta
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
index 5ec7cc64e4..b5f510e72c 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java
@@ -18,20 +18,29 @@
 package org.apache.hadoop.hbase.master.assignment;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
+
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.RegionInfoBuilder;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.Table;
 import org.apache.hadoop.hbase.master.RegionState.State;
 import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
 import org.apache.hadoop.hbase.procedure2.util.StringUtils;
 import org.apache.hadoop.hbase.testclassification.LargeTests;
 import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.util.Bytes;
 import org.junit.ClassRule;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -214,4 +223,53 @@ public class TestAssignmentManager extends TestAssignmentManagerBase {
     // set it back as default, see setUpMeta()
     am.wakeMetaLoadedEvent();
   }
+
+  /**
+   * Kills the cluster, deletes the WAL dir of the server returned by
+   * getRSForFirstRegionInTable, restarts, and expects the start code read from meta to change.
+   */
+  @Test
+  public void testAssignRegionOpenOldRegionServerStartCode() throws Exception {
+    this.util.startMiniCluster();
+    try {
+      TableName tableName = TableName.valueOf("test");
+      this.util.createTable(tableName, "cf");
+      long startCodeBeforeCrash = getStartCodeForRsAssignedInMeta();
+      ServerName rsName = this.util.getRSForFirstRegionInTable(tableName).getServerName();
+      this.util.killMiniHBaseCluster();
+      // Brief pause after the kill (presumably to let the killed processes go down).
+      Thread.sleep(300);
+      // Manually delete RS WAL dir to simulate this catastrophe. Resolve it under the root
+      // dir the mini cluster is configured with instead of a hard-coded "/hbase".
+      Path walRootPath = new Path(this.util.getConfiguration().get("hbase.rootdir"), "WALs");
+      Path regionServerWalPath = new Path(walRootPath, rsName.toString());
+      assertTrue("Could not delete RS WAL dir " + regionServerWalPath,
+        this.util.getDFSCluster().getFileSystem().delete(regionServerWalPath, true));
+      this.util.restartHBaseCluster(1);
+      // Poll meta for the new start code rather than hoping 300 ms is long enough.
+      this.util.waitFor(60000,
+        () -> getStartCodeForRsAssignedInMeta() != startCodeBeforeCrash);
+      long startCodeAfterCrash = getStartCodeForRsAssignedInMeta();
+      assertNotEquals("start codes should be different",
+        startCodeBeforeCrash, startCodeAfterCrash);
+    } finally {
+      this.util.shutdownMiniCluster();
+    }
+  }
+
+  /**
+   * Reads the {@code info:serverstartcode} value of the first row returned by a scan of
+   * {@code hbase:meta}.
+   */
+  private long getStartCodeForRsAssignedInMeta() throws Exception {
+    Scan scan = new Scan();
+    scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("serverstartcode"));
+    // try-with-resources closes the scanner as well as the table, even when the scan fails.
+    try (Table meta = this.util.getConnection().getTable(TableName.valueOf("hbase:meta"));
+        ResultScanner resultScanner = meta.getScanner(scan)) {
+      Result result = resultScanner.next();
+      assertTrue("No row with a server start code found in hbase:meta", result != null);
+      return Bytes.toLong(result.value());
+    }
+  }
 }
-- 
2.17.2 (Apple Git-113)