diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index 052a777..241a35b 100644
--- hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -146,7 +146,7 @@ public class AssignmentManager extends ZooKeeperListener {
    * See below in {@link #assign()} and {@link #unassign()}.
    */
   private final int maximumAttempts;
-  
+
   /**
    * The sleep time for which the assignment will wait before retrying in case of hbase:meta assignment
    * failure due to lack of availability of region plan
@@ -1324,7 +1324,15 @@ public class AssignmentManager extends ZooKeeperListener {
                 + "but this table is disabled, triggering close of region");
               unassign(regionInfo);
             }
+          } else if (rs.isSplitting()) {
+            LOG.debug("Ephemeral node deleted. Found in SPLITTING state. " + "Removing from RIT "
+                + rs.getRegion());
+            // it can be either SPLIT fail, or RS dead.
+            regionStates.regionOnline(rs.getRegion(), rs.getServerName());
           }
+          // RS does not delete the znode in case SPLIT, it only means RS died which
+          // will be handled by SSH
+          // in region merge we do not put merging regions to MERGING state
         } finally {
           lock.unlock();
         }
diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
index e6aac9c..271d815 100644
--- hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java
@@ -35,7 +35,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.Abortable;
-import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.Coprocessor;
 import org.apache.hadoop.hbase.HBaseIOException;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
 import org.apache.hadoop.hbase.HColumnDescriptor;
@@ -48,7 +48,9 @@ import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.RegionTransition;
 import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.UnknownRegionException;
+import org.apache.hadoop.hbase.Waiter;
 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
 import org.apache.hadoop.hbase.catalog.MetaReader;
 import org.apache.hadoop.hbase.client.Delete;
@@ -183,6 +185,7 @@ public class TestSplitTransactionOnCluster {
     assertTrue("not able to find a splittable region", region != null);
 
     new Thread() {
+      @Override
       public void run() {
         SplitTransaction st = null;
         st = new MockedSplitTransaction(region, Bytes.toBytes("row2"));
@@ -241,6 +244,65 @@ public class TestSplitTransactionOnCluster {
     }
   }
 
+  @Test(timeout = 60000)
+  public void testRITStateForRollback() throws Exception {
+    final TableName tableName =
+        TableName.valueOf("testRITStateForRollback");
+    try {
+      // Create table then get the single region for our new table.
+      HTable t = createTableAndWait(tableName.getName(), Bytes.toBytes("cf"));
+      final List<HRegion> regions = cluster.getRegions(tableName);
+      final HRegionInfo hri = getAndCheckSingleTableRegion(regions);
+      int regionServerIndex = cluster.getServerWith(regions.get(0).getRegionName());
+      final HRegionServer regionServer = cluster.getRegionServer(regionServerIndex);
+      insertData(tableName.getName(), admin, t);
+      t.close();
+
+      // Turn off balancer so it doesn't cut in and mess up our placements.
+      this.admin.setBalancerRunning(false, true);
+      // Turn off the meta scanner so it don't remove parent on us.
+      cluster.getMaster().setCatalogJanitorEnabled(false);
+
+      // find a splittable region
+      final HRegion region = findSplittableRegion(regions);
+      assertTrue("not able to find a splittable region", region != null);
+
+      // install region co-processor to fail splits
+      region.getCoprocessorHost().load(FailingSplitRegionObserver.class,
+        Coprocessor.PRIORITY_USER, region.getBaseConf());
+
+      // split async
+      this.admin.split(region.getRegionName(), new byte[] {42});
+
+      // we have to wait until the SPLITTING state is seen by the master
+      FailingSplitRegionObserver.latch.await();
+
+      LOG.info("Waiting for region to come out of RIT");
+      TESTING_UTIL.waitFor(60000, 1000, new Waiter.Predicate<Exception>() {
+        @Override
+        public boolean evaluate() throws Exception {
+          RegionStates regionStates = cluster.getMaster().getAssignmentManager().getRegionStates();
+          Map<String, RegionState> rit = regionStates.getRegionsInTransition();
+          return !rit.containsKey(hri.getEncodedName());
+        }
+      });
+    } finally {
+      admin.setBalancerRunning(true, false);
+      cluster.getMaster().setCatalogJanitorEnabled(true);
+      TESTING_UTIL.deleteTable(tableName);
+    }
+  }
+
+  public static class FailingSplitRegionObserver extends BaseRegionObserver {
+    static volatile CountDownLatch latch = new CountDownLatch(1);
+    @Override
+    public void preSplitBeforePONR(ObserverContext<RegionCoprocessorEnvironment> ctx,
+        byte[] splitKey, List<Mutation> metaEntries) throws IOException {
+      latch.countDown();
+      throw new IOException("Causing rollback of region split");
+    }
+  }
+
   /**
    * A test that intentionally has master fail the processing of the split message.
    * Tests that the regionserver split ephemeral node gets cleaned up if it
@@ -1107,6 +1169,7 @@ public class TestSplitTransactionOnCluster {
       super(conf);
     }
 
+    @Override
     protected void startCatalogJanitorChore() {
       LOG.debug("Customised master executed.");
     }