Index: hbase-server/src/test/java/org/apache/hadoop/hbase/TestFullLogReconstruction.java
===================================================================
--- hbase-server/src/test/java/org/apache/hadoop/hbase/TestFullLogReconstruction.java (revision 1519102)
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/TestFullLogReconstruction.java (working copy)
@@ -55,7 +55,7 @@
     c.setInt("ipc.client.connect.max.retries", 1);
     c.setInt("dfs.client.block.recovery.retries", 1);
     c.setInt(HConstants.ZK_SESSION_TIMEOUT, 1000);
-    TEST_UTIL.startMiniCluster(2);
+    TEST_UTIL.startMiniCluster(3);
   }
 
   /**
Index: hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java
===================================================================
--- hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java (revision 1519102)
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/handler/TestOpenRegionHandler.java (working copy)
@@ -129,7 +129,67 @@
     }
   }
 
+  /**
+   * Tests that the region server aborts when OpenRegionHandler perceives the transition to
+   * OPENED as failed (as under intermittent ZooKeeper malfunction) after the znode is removed.
+   * @see <a href="https://issues.apache.org/jira/browse/HBASE-9387">HBASE-9387</a>
+   * @throws IOException propagated uncaught from the region and handler calls below
+   * @throws NodeExistsException presumably from ZKAssign.createNodeOffline (TODO confirm)
+   * @throws KeeperException presumably from ZKAssign.createNodeOffline (TODO confirm)
+   */
   @Test
+  public void testRegionServerAbortionDueToFailureTransitioningToOpened()
+      throws IOException, NodeExistsException, KeeperException {
+    final Server server = new MockServer(HTU);
+    final RegionServerServices rss = HTU.createMockRegionServerService();
+
+    HTableDescriptor htd = TEST_HTD;
+    final HRegionInfo hri = TEST_HRI;
+    HRegion region =
+        HRegion.createHRegion(hri, HTU.getDataTestDir(), HTU
+            .getConfiguration(), htd);
+    assertNotNull(region);
+    try {
+      OpenRegionHandler handler = new OpenRegionHandler(server, rss, hri, htd) {
+        boolean transitionToOpened(final HRegion r) throws IOException {
+          // First attempt the real transition to OPENED via the parent implementation.
+          boolean b = super.transitionToOpened(r);
+
+          // Then delete the region's znode, mimicking the master removing it after a successful
+          // open, i.e. before the intermittent ZooKeeper issue kicks in.
+          ZooKeeperWatcher zkw = this.server.getZooKeeper();
+          String node = ZKAssign.getNodeName(zkw, hri.getEncodedName());
+          try {
+            ZKUtil.deleteNodeFailSilent(zkw, node);
+          } catch (KeeperException e) {
+            throw new RuntimeException("Ugh failed delete of " + node, e);
+          }
+          // Returning false simulates ZooKeeper being in a transient bad state, so that
+          // OpenRegionHandler perceives the transition to OPENED as failed.
+          return false;
+        }
+      };
+      rss.getRegionsInTransitionInRS().put(
+        hri.getEncodedNameAsBytes(), Boolean.TRUE);
+      // First run: process() without an OFFLINE znode in ZooKeeper; a quiet return is
+      // expected rather than an exception.
+      handler.process();
+      rss.getRegionsInTransitionInRS().put(
+        hri.getEncodedNameAsBytes(), Boolean.TRUE);
+      ZKAssign.createNodeOffline(server.getZooKeeper(), hri, server.getServerName());
+      // Second run: the OFFLINE znode now exists and the override above yanks it out from
+      // under the handler; process() is expected to return without NPE or exception.
+      handler.process();
+    } finally {
+      HRegion.closeHRegion(region);
+    }
+    // The region server is expected to abort because OpenRegionHandler perceived the
+    // transition to OPENED as failed.
+    // That abort is triggered by the second handler.process() call above.
+    assertTrue("region server should have aborted", rss.isAborted());
+  }
+
+  @Test
   public void testFailedOpenRegion() throws Exception {
     Server server = new MockServer(HTU);
     RegionServerServices rsServices = HTU.createMockRegionServerService();
Index: hbase-server/src/test/java/org/apache/hadoop/hbase/MockRegionServerServices.java
===================================================================
--- hbase-server/src/test/java/org/apache/hadoop/hbase/MockRegionServerServices.java (revision 1519102)
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/MockRegionServerServices.java (working copy)
@@ -55,6 +55,7 @@
   private ZooKeeperWatcher zkw = null;
   private ServerName serverName = null;
   private RpcServerInterface rpcServer = null;
+  private volatile boolean abortRequested;
 
   MockRegionServerServices(ZooKeeperWatcher zkw) {
     this.zkw = zkw;
@@ -154,7 +155,8 @@
 
   @Override
   public void abort(String why, Throwable e) {
-    //no-op
+    this.abortRequested = true;
+    stop(why);
   }
 
   @Override
@@ -169,7 +171,7 @@
 
   @Override
   public boolean isAborted() {
-    return false;
+    return this.abortRequested;
   }
 
   @Override
Index: hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java
===================================================================
--- hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java (revision 1519102)
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/OpenRegionHandler.java (working copy)
@@ -35,6 +35,7 @@
 import org.apache.hadoop.hbase.regionserver.RegionServerServices;
 import org.apache.hadoop.hbase.util.CancelableProgressable;
 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.zookeeper.KeeperException;
 
 /**
@@ -359,7 +360,7 @@
    * @return whether znode is successfully transitioned to OPENED state.
    * @throws IOException
    */
-  private boolean transitionToOpened(final HRegion r) throws IOException {
+  boolean transitionToOpened(final HRegion r) throws IOException {
     boolean result = false;
     HRegionInfo hri = r.getRegionInfo();
     final String name = hri.getRegionNameAsString();
@@ -401,9 +402,19 @@
           EventType.RS_ZK_REGION_OPENING,
           EventType.RS_ZK_REGION_FAILED_OPEN,
           this.version) == -1) {
-        LOG.warn("Unable to mark region " + hri + " as FAILED_OPEN. " +
+        String warnMsg = "Unable to mark region " + hri + " as FAILED_OPEN. " +
             "It's likely that the master already timed out this open " +
-            "attempt, and thus another RS already has the region.");
+            "attempt, and thus another RS already has the region.";
+        try {
+          String node = ZKAssign.getNodeName(this.server.getZooKeeper(), hri.getEncodedName());
+          if (ZKUtil.checkExists(this.server.getZooKeeper(), node) < 0) {
+            rsServices.abort(warnMsg, null);
+          } else {
+            LOG.warn(warnMsg);
+          }
+        } catch (KeeperException ke) {
+          rsServices.abort(warnMsg, ke);
+        }
       } else {
         result = true;
       }