commit 32f0b806e3a559ea9aa8e1a5ad2f1e6a855e7df3 Author: Todd Lipcon Date: Mon Sep 12 18:18:06 2011 -0700 HBASE-4287. FAILED_OPEN state in ZK diff --git src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java index 4e2d92f..90c91ae 100644 --- src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java +++ src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java @@ -109,6 +109,7 @@ public abstract class EventHandler implements Runnable, Comparable { RS_ZK_REGION_OPENED (4), // RS has finished opening a region RS_ZK_REGION_SPLITTING (5), // RS has started a region split RS_ZK_REGION_SPLIT (6), // RS split has completed. + RS_ZK_REGION_FAILED_OPEN (7), // RS failed to open a region // Messages originating from Master to RS M_RS_OPEN_REGION (20), // Master asking RS to open a region diff --git src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java index b3cb746..3be34a3 100644 --- src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java +++ src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java @@ -118,6 +118,7 @@ public class ExecutorService { // Master executor services case RS_ZK_REGION_CLOSED: + case RS_ZK_REGION_FAILED_OPEN: return ExecutorType.MASTER_CLOSE_REGION; case RS_ZK_REGION_OPENED: diff --git src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index c0170b4..cd9e5cb 100644 --- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -465,6 +465,7 @@ public class AssignmentManager extends ZooKeeperListener { break; case RS_ZK_REGION_CLOSED: + case RS_ZK_REGION_FAILED_OPEN: // Region is closed, insert into RIT and handle it addToRITandCallClose(regionInfo, RegionState.State.CLOSED, data); break; @@ -705,6 +706,21 @@ public class AssignmentManager extends ZooKeeperListener { this.executorService.submit(new ClosedRegionHandler(master, this, regionState.getRegion())); break; + + case RS_ZK_REGION_FAILED_OPEN: + if (regionState == null || + (!regionState.isPendingOpen() && !regionState.isOpening())) { + LOG.warn("Received FAILED_OPEN for region " + prettyPrintedRegionName + + " from server " + data.getOrigin() + " but region was in " + + " the state " + regionState + " and not in PENDING_OPEN or OPENING"); + return; + } + // Handle this the same as if it were opened and then closed. + regionState.update(RegionState.State.CLOSED, + data.getStamp(), data.getOrigin()); + this.executorService.submit(new ClosedRegionHandler(master, + this, regionState.getRegion())); + break; case RS_ZK_REGION_OPENING: // Should see OPENING after we have asked it to OPEN or additional diff --git src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java index 8f96baa..7cdd686 100644 --- src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java +++ src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java @@ -106,7 +106,11 @@ public class OpenRegionHandler extends EventHandler { // Open region. After a successful open, failures in subsequent // processing needs to do a close as part of cleanup. region = openRegion(); - if (region == null) return; + if (region == null) { + tryTransitionToFailedOpen(regionInfo); + return; + } + boolean failed = true; if (tickleOpening("post_region_open")) { if (updateMeta(region)) failed = false; @@ -114,10 +118,17 @@ public class OpenRegionHandler extends EventHandler { if (failed || this.server.isStopped() || this.rsServices.isStopping()) { cleanupFailedOpen(region); + tryTransitionToFailedOpen(regionInfo); return; } if (!transitionToOpened(region)) { + // If we fail to transition to opened, it's because of one of two cases: + // (a) we lost our ZK lease + // OR (b) someone else opened the region before us + // In either case, we don't need to transition to FAILED_OPEN state. + // In case (a), the Master will process us as a dead server. In case + // (b) the region is already being handled elsewhere anyway. cleanupFailedOpen(region); return; } @@ -137,7 +148,7 @@ public class OpenRegionHandler extends EventHandler { * state meantime so master doesn't timeout our region-in-transition. * Caller must cleanup region if this fails. */ - private boolean updateMeta(final HRegion r) { + boolean updateMeta(final HRegion r) { if (this.server.isStopped() || this.rsServices.isStopping()) { return false; } @@ -270,33 +281,34 @@ public class OpenRegionHandler extends EventHandler { } return result; } - + /** - * @return Instance of HRegion if successful open else null. + * @param Region we're working on. + * This is not guaranteed to succeed, we just do our best. + * @return Transition znode to CLOSED state. */ - HRegion openRegion(Path tableDir) { - HRegion region = null; + private boolean tryTransitionToFailedOpen(final HRegionInfo hri) { + boolean result = false; + final String name = hri.getRegionNameAsString(); try { - // Instantiate the region. This also periodically tickles our zk OPENING - // state so master doesn't timeout this region in transition. - region = HRegion.openHRegion(tableDir, this.regionInfo, this.htd, - this.rsServices.getWAL(), this.server.getConfiguration(), - this.rsServices, - new CancelableProgressable() { - public boolean progress() { - // We may lose the znode ownership during the open. Currently its - // too hard interrupting ongoing region open. Just let it complete - // and check we still have the znode after region open. - return tickleOpening("open_region_progress"); - } - }); - } catch (IOException e) { - // We failed open. Let our znode expire in regions-in-transition and - // Master will assign elsewhere. Presumes nothing to close. - LOG.error("Failed open of region=" + - this.regionInfo.getRegionNameAsString(), e); + LOG.info("Opening of region " + hri + " failed, marking as FAILED_OPEN in ZK"); + if (ZKAssign.transitionNode( + this.server.getZooKeeper(), hri, + this.server.getServerName(), + EventType.RS_ZK_REGION_OPENING, + EventType.RS_ZK_REGION_FAILED_OPEN, + this.version) == -1) { + LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " + + "It's likely that the master already timed out this open " + + "attempt, and thus another RS already has the region."); + } else { + result = true; + } + } catch (KeeperException e) { + LOG.error("Failed transitioning node " + name + + " from OPENING to FAILED_OPEN", e); } - return region; + return result; } /** @@ -318,11 +330,12 @@ public class OpenRegionHandler extends EventHandler { return tickleOpening("open_region_progress"); } }); - } catch (IOException e) { - // We failed open. Let our znode expire in regions-in-transition and - // Master will assign elsewhere. Presumes nothing to close. + } catch (Throwable t) { + // We failed open. Our caller will see the 'null' return value + // and transition the node back to FAILED_OPEN. If that fails, + // we rely on the Timeout Monitor in the master to reassign. LOG.error("Failed open of region=" + - this.regionInfo.getRegionNameAsString(), e); + this.regionInfo.getRegionNameAsString(), t); } return region; } @@ -390,4 +403,4 @@ public class OpenRegionHandler extends EventHandler { private boolean isGoodVersion() { return this.version != -1; } -} \ No newline at end of file +} diff --git src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java index cefa8aa..02e3d16 100644 --- src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java +++ src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java @@ -25,11 +25,16 @@ import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.Server; +import org.apache.hadoop.hbase.ZooKeeperConnectionException; +import org.apache.hadoop.hbase.executor.RegionTransitionData; +import org.apache.hadoop.hbase.executor.EventHandler.EventType; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.RegionServerServices; import org.apache.hadoop.hbase.zookeeper.ZKAssign; @@ -40,6 +45,9 @@ import org.apache.zookeeper.KeeperException.NodeExistsException; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; /** * Test of the {@link OpenRegionHandler}. @@ -47,6 +55,11 @@ import org.junit.Test; public class TestOpenRegionHandler { static final Log LOG = LogFactory.getLog(TestOpenRegionHandler.class); private final static HBaseTestingUtility HTU = new HBaseTestingUtility(); + private static final HTableDescriptor TEST_HTD = + new HTableDescriptor("TestOpenRegionHandler.java"); + private static final HRegionInfo TEST_HRI = + new HRegionInfo(TEST_HTD.getName(), HConstants.EMPTY_END_ROW, + HConstants.EMPTY_END_ROW); @BeforeClass public static void before() throws Exception { HTU.startMiniZKCluster(); @@ -69,10 +82,8 @@ public class TestOpenRegionHandler { final Server server = new MockServer(HTU); final RegionServerServices rss = new MockRegionServerServices(); - HTableDescriptor htd = new HTableDescriptor("TestOpenRegionHandler.java"); - final HRegionInfo hri = - new HRegionInfo(htd.getName(), HConstants.EMPTY_END_ROW, - HConstants.EMPTY_END_ROW); + HTableDescriptor htd = TEST_HTD; + final HRegionInfo hri = TEST_HRI; HRegion region = HRegion.createHRegion(hri, HBaseTestingUtility.getTestDir(), HTU .getConfiguration(), htd); @@ -80,9 +91,8 @@ public class TestOpenRegionHandler { OpenRegionHandler handler = new OpenRegionHandler(server, rss, hri, htd) { HRegion openRegion() { // Open region first, then remove znode as though it'd been hijacked. - //HRegion region = super.openRegion(); - HRegion region = super.openRegion(HBaseTestingUtility.getTestDir()); - + HRegion region = super.openRegion(); + // Don't actually open region BUT remove the znode as though it'd // been hijacked on us. ZooKeeperWatcher zkw = this.server.getZooKeeper(); @@ -103,4 +113,55 @@ public class TestOpenRegionHandler { // post OPENING; again will expect it to come back w/o NPE or exception. handler.process(); } + + @Test + public void testFailedOpenRegion() throws Exception { + Server server = new MockServer(HTU); + RegionServerServices rsServices = Mockito.mock(RegionServerServices.class); + + // Create it OFFLINE, which is what it expects + ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server.getServerName()); + + // Create the handler + OpenRegionHandler handler = + new OpenRegionHandler(server, rsServices, TEST_HRI, TEST_HTD) { + @Override + HRegion openRegion() { + // Fake failure of opening a region due to an IOE, which is caught + return null; + } + }; + handler.process(); + + // Handler should have transitioned it to FAILED_OPEN + RegionTransitionData data = + ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName()); + assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType()); + } + + @Test + public void testFailedUpdateMeta() throws Exception { + Server server = new MockServer(HTU); + RegionServerServices rsServices = Mockito.mock(RegionServerServices.class); + + // Create it OFFLINE, which is what it expects + ZKAssign.createNodeOffline(server.getZooKeeper(), TEST_HRI, server.getServerName()); + + // Create the handler + OpenRegionHandler handler = + new OpenRegionHandler(server, rsServices, TEST_HRI, TEST_HTD) { + @Override + boolean updateMeta(final HRegion r) { + // Fake failure of updating META + return false; + } + }; + handler.process(); + + // Handler should have transitioned it to FAILED_OPEN + RegionTransitionData data = + ZKAssign.getData(server.getZooKeeper(), TEST_HRI.getEncodedName()); + assertEquals(EventType.RS_ZK_REGION_FAILED_OPEN, data.getEventType()); + } + } \ No newline at end of file