Clean up split daughters in the master when a region split fails and is rolled back.

AssignmentManager records the daughters of an in-flight split in splitRegions when it
moves the parent to SPLITTING. When the parent's znode is deleted while the parent is
still SPLITTING (the rollback case), the daughters are offlined and the entry is dropped;
on a successful split the entry is dropped once the daughters are online. All accesses
to splitRegions happen while holding the regionStates monitor.

Review notes:
  * The rollback cleanup is guarded with splitRegions.containsKey(regionInfo): the map is
    in-memory only, so a SPLITTING parent may have no recorded daughters, and an unguarded
    get(...).getFirst() would throw NullPointerException.
  * Log messages: added the missing space before the region name / before "but".
  * NOTE(review): the java.util.Map.Entry import added below is not used by any line in
    this patch - confirm and drop it.

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index 29c824d..5e62e79 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -29,6 +29,7 @@ import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.NavigableMap;
 import java.util.Set;
 import java.util.TreeMap;
@@ -165,6 +166,9 @@ public class AssignmentManager extends ZooKeeperListener {
   private final Map<String, PairOfSameType<HRegionInfo>> mergingRegions
     = new HashMap<String, PairOfSameType<HRegionInfo>>();
 
+  private final Map<HRegionInfo, PairOfSameType<HRegionInfo>> splitRegions
+      = new HashMap<HRegionInfo, PairOfSameType<HRegionInfo>>();
+
   /**
    * The sleep time for which the assignment will wait before retrying in case of hbase:meta assignment
    * failure due to lack of availability of region plan or bad region plan
@@ -1321,14 +1325,30 @@ public class AssignmentManager extends ZooKeeperListener {
 
             ServerName serverName = rs.getServerName();
             if (serverManager.isServerOnline(serverName)) {
-              if (rs.isOnServer(serverName)
-                  && (rs.isOpened() || rs.isSplitting())) {
-                regionOnline(regionInfo, serverName);
-                if (disabled) {
-                  // if server is offline, no hurt to unassign again
-                  LOG.info("Opened " + regionNameStr
-                    + "but this table is disabled, triggering close of region");
-                  unassign(regionInfo);
+              if (rs.isOnServer(serverName) && (rs.isOpened() || rs.isSplitting())) {
+                synchronized (regionStates) {
+                  regionOnline(regionInfo, serverName);
+                  if (rs.isSplitting() && splitRegions.containsKey(regionInfo)) {
+                    // Check if the daughter regions are still present; if so, offline them,
+                    // as this is the case of a split rollback.
+                    HRegionInfo hri_a = splitRegions.get(regionInfo).getFirst();
+                    HRegionInfo hri_b = splitRegions.get(regionInfo).getSecond();
+                    if (!regionStates.isRegionInTransition(hri_a.getEncodedName())) {
+                      LOG.warn("Split daughter region not in transition " + hri_a);
+                    }
+                    if (!regionStates.isRegionInTransition(hri_b.getEncodedName())) {
+                      LOG.warn("Split daughter region not in transition " + hri_b);
+                    }
+                    regionOffline(hri_a);
+                    regionOffline(hri_b);
+                    splitRegions.remove(regionInfo);
+                  }
+                  if (disabled) {
+                    // if server is offline, no hurt to unassign again
+                    LOG.info("Opened " + regionNameStr
+                        + " but this table is disabled, triggering close of region");
+                    unassign(regionInfo);
+                  }
                 }
               } else if (rs.isMergingNew()) {
                 synchronized (regionStates) {
@@ -3798,6 +3818,7 @@ public class AssignmentManager extends ZooKeeperListener {
     }
 
     synchronized (regionStates) {
+      splitRegions.put(p, new PairOfSameType<HRegionInfo>(hri_a, hri_b));
       regionStates.updateRegionState(hri_a, State.SPLITTING_NEW, sn);
       regionStates.updateRegionState(hri_b, State.SPLITTING_NEW, sn);
       regionStates.updateRegionState(rt, State.SPLITTING);
@@ -3813,6 +3834,7 @@ public class AssignmentManager extends ZooKeeperListener {
         regionOffline(p, State.SPLIT);
         regionOnline(hri_a, sn);
         regionOnline(hri_b, sn);
+        splitRegions.remove(p);
       }
     }
 
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
index ee9cd40..99de513 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
@@ -58,6 +58,8 @@ import org.apache.hadoop.hbase.Waiter;
 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
 import org.apache.hadoop.hbase.MetaTableAccessor;
 import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.Connection;
+import org.apache.hadoop.hbase.client.ConnectionFactory;
 import org.apache.hadoop.hbase.client.Delete;
 import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.client.HTable;
@@ -1161,6 +1163,42 @@ public class TestSplitTransactionOnCluster {
       TESTING_UTIL.deleteTable(tableName);
     }
   }
+
+  @Test
+  public void testFailedSplit() throws Exception {
+    TableName tableName = TableName.valueOf("testFailedSplit");
+    byte[] colFamily = Bytes.toBytes("info");
+    TESTING_UTIL.createTable(tableName, colFamily);
+    Connection connection = ConnectionFactory.createConnection(TESTING_UTIL.getConfiguration());
+    HTable table = (HTable) connection.getTable(tableName);
+    try {
+      TESTING_UTIL.loadTable(table, colFamily);
+      List<HRegionInfo> regions = TESTING_UTIL.getHBaseAdmin().getTableRegions(tableName);
+      assertTrue(regions.size() == 1);
+      final HRegion actualRegion = cluster.getRegions(tableName).get(0);
+      actualRegion.getCoprocessorHost().load(FailingSplitRegionObserver.class,
+        Coprocessor.PRIORITY_USER, actualRegion.getBaseConf());
+
+      // The following split would fail.
+      admin.split(tableName);
+      FailingSplitRegionObserver.latch.await();
+      LOG.info("Waiting for region to come out of RIT");
+      TESTING_UTIL.waitFor(60000, 1000, new Waiter.Predicate<Exception>() {
+        @Override
+        public boolean evaluate() throws Exception {
+          RegionStates regionStates = cluster.getMaster().getAssignmentManager().getRegionStates();
+          Map<String, RegionState> rit = regionStates.getRegionsInTransition();
+          return !rit.containsKey(actualRegion.getRegionInfo().getEncodedName());
+        }
+      });
+      regions = TESTING_UTIL.getHBaseAdmin().getTableRegions(tableName);
+      assertTrue(regions.size() == 1);
+    } finally {
+      table.close();
+      connection.close();
+      TESTING_UTIL.deleteTable(tableName);
+    }
+  }
 
   public static class MockedCoordinatedStateManager extends ZkCoordinatedStateManager {
 