Index: src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java (revision 1332922) +++ src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java (working copy) @@ -32,6 +32,7 @@ import org.apache.hadoop.hbase.Abortable; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException; +import org.apache.hadoop.hbase.NotServingRegionException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.HConnection; import org.apache.hadoop.hbase.client.HConnectionManager; @@ -183,7 +184,7 @@ this(zk, conf, HConnectionManager.getConnection(conf), abortable, defaultTimeout); } - CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf, + public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf, HConnection connection, Abortable abortable, final int defaultTimeout) throws IOException { this.connection = connection; @@ -410,8 +411,12 @@ HRegionInterface current = getCachedConnection(this.metaLocation); // If we are to refresh, verify we have a good connection by making // an invocation on it. 
- if (verifyRegionLocation(current, this.metaLocation, META_REGION_NAME)) { - return current; + try { + if (verifyRegionLocation(current, this.metaLocation, META_REGION_NAME)) { + return current; + } + } catch (NotServingRegionException nsre) { + // Cached server no longer serves .META.: fall through and reset the location } resetMetaLocation(); } @@ -425,13 +430,20 @@ if (newLocation == null) return null; HRegionInterface newConnection = getCachedConnection(newLocation); - if (verifyRegionLocation(newConnection, newLocation, META_REGION_NAME)) { - setMetaLocation(newLocation); - return newConnection; - } else { + try { + if (verifyRegionLocation(newConnection, newLocation, META_REGION_NAME)) { + setMetaLocation(newLocation); + return newConnection; + } else { + if (LOG.isTraceEnabled()) { + LOG.trace("New .META. server: " + newLocation + " isn't valid." + + " Cached .META. server: " + this.metaLocation); + } + } + } catch (NotServingRegionException nsre) { if (LOG.isTraceEnabled()) { - LOG.trace("New .META. server: " + newLocation + " isn't valid." + - " Cached .META. server: " + this.metaLocation); + LOG.trace("New .META. server: " + newLocation + " isn't valid." + + " Cached .META. 
server: " + this.metaLocation); } } return null; @@ -629,7 +641,11 @@ } catch (RemoteException e) { IOException ioe = e.unwrapRemoteException(); t = ioe; + } catch (NotServingRegionException nsre) { + LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) + + " at address=" + address + "; " + nsre); + throw nsre; } catch (IOException e) { Throwable cause = e.getCause(); if (cause != null && cause instanceof EOFException) { t = cause; Index: src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (revision 1332922) +++ src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (working copy) @@ -28,6 +28,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.NotServingRegionException; import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.catalog.CatalogTracker; @@ -100,7 +101,13 @@ throws InterruptedException, IOException, KeeperException { long timeout = this.server.getConfiguration(). 
getLong("hbase.catalog.verification.timeout", 1000); - if (!this.server.getCatalogTracker().verifyRootRegionLocation(timeout)) { + boolean rootRegionLocation = false; + try { + rootRegionLocation = this.server.getCatalogTracker().verifyRootRegionLocation(timeout); + } catch (NotServingRegionException nsre) { + // Server is not serving ROOT: leave the flag false so ROOT is assigned below + } + if (!rootRegionLocation) { this.services.getAssignmentManager().assignRoot(); } } Index: src/main/java/org/apache/hadoop/hbase/master/HMaster.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/master/HMaster.java (revision 1332922) +++ src/main/java/org/apache/hadoop/hbase/master/HMaster.java (working copy) @@ -52,6 +52,7 @@ import org.apache.hadoop.hbase.HServerLoad; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MasterNotRunningException; +import org.apache.hadoop.hbase.NotServingRegionException; import org.apache.hadoop.hbase.PleaseHoldException; import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.ServerName; @@ -595,7 +596,17 @@ boolean rit = this.assignmentManager. processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.ROOT_REGIONINFO); ServerName currentRootServer = null; - if (!catalogTracker.verifyRootRegionLocation(timeout)) { + boolean rootRegionLocation = false; + try { + rootRegionLocation = catalogTracker.verifyRootRegionLocation(timeout); + } catch (NotServingRegionException nsre) { + if (rit) { + // the root region location is available. + // See HBASE-5875 for details. 
+ rootRegionLocation = true; + } + } + if (!rootRegionLocation) { currentRootServer = this.catalogTracker.getRootLocation(); splitLogAndExpireIfOnline(currentRootServer); this.assignmentManager.assignRoot(); @@ -1608,6 +1619,15 @@ public boolean isServerShutdownHandlerEnabled() { return this.serverShutdownHandlerEnabled; } + /** + * Sets the flag reported by {@link #isServerShutdownHandlerEnabled()}. + * For testing only. + * @param setServerShutDownEnabled the new value of the flag + */ + public void setServerShutdownHandlerEnabled(boolean setServerShutDownEnabled) { + this.serverShutdownHandlerEnabled = setServerShutDownEnabled; + } + @Override @Deprecated Index: src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java (revision 1332922) +++ src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java (working copy) @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.catalog; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import java.io.IOException; import java.net.ConnectException; @@ -290,8 +291,7 @@ Mockito.when(implementation.getRegionInfo((byte [])Mockito.any())). thenThrow(connectException); Mockito.when(connection.getHRegionConnection(Mockito.anyString(), - Mockito.anyInt(), Mockito.anyBoolean())). 
- thenReturn(implementation); + Mockito.anyInt())).thenReturn(implementation); final CatalogTracker ct = constructAndStartCatalogTracker(connection); try { RootLocationEditor.setRootLocation(this.watcher, @@ -303,6 +303,32 @@ RootLocationEditor.deleteRootLocation(this.watcher); } } + + @Test + public void testVerifyRootRegionLocationFailsByNSRE() throws IOException, InterruptedException, + KeeperException { + HConnection connection = Mockito.mock(HConnection.class); + NotServingRegionException nsre = new NotServingRegionException("Root region not available."); + final HRegionInterface implementation = Mockito.mock(HRegionInterface.class); + Mockito.when(implementation.getRegionInfo((byte[]) Mockito.any())).thenThrow(nsre); + Mockito + .when( + connection.getHRegionConnection(Mockito.anyString(), Mockito.anyInt())).thenReturn(implementation); + final CatalogTracker ct = constructAndStartCatalogTracker(connection); + try { + RootLocationEditor.setRootLocation(this.watcher, + new ServerName("example.com", 1234, System.currentTimeMillis())); + try { + ct.verifyRootRegionLocation(100); + fail("NotServingRegionException must be thrown."); + } catch (NotServingRegionException expected) { // the NSRE must reach the caller + } + } finally { + // Clean out root location or later tests will be confused... they presume + // start fresh in zk. 
+ RootLocationEditor.deleteRootLocation(this.watcher); + } + } @Test (expected = NotAllMetaRegionsOnlineException.class) public void testTimeoutWaitForRoot() Index: src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java (revision 1332922) +++ src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManager.java (working copy) @@ -600,7 +600,7 @@ /** * An {@link AssignmentManager} with some extra facility used testing */ - class AssignmentManagerWithExtrasForTesting extends AssignmentManager { + public static class AssignmentManagerWithExtrasForTesting extends AssignmentManager { // Keep a reference so can give it out below in {@link #getExecutorService} private final ExecutorService es; // Ditto for ct @@ -634,6 +634,11 @@ while (this.gate.get()) Threads.sleep(1); super.processRegionsInTransition(data, regionInfo, deadServers, expectedVersion); } + + boolean processRegionInTransitionAndBlockUntilAssigned(HRegionInfo hri) + throws InterruptedException, KeeperException, IOException { + return true; + } /** * @return ExecutorService used by this instance. 
Index: src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java (revision 1332922) +++ src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java (working copy) @@ -26,9 +26,12 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -36,11 +39,22 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.*; +import org.apache.hadoop.hbase.catalog.CatalogTracker; +import org.apache.hadoop.hbase.catalog.RootLocationEditor; +import org.apache.hadoop.hbase.client.HConnection; import org.apache.hadoop.hbase.executor.EventHandler.EventType; +import org.apache.hadoop.hbase.executor.ExecutorService.ExecutorType; +import org.apache.hadoop.hbase.executor.ExecutorService; import org.apache.hadoop.hbase.executor.RegionTransitionData; +import org.apache.hadoop.hbase.ipc.HRegionInterface; import org.apache.hadoop.hbase.master.AssignmentManager.RegionState; +import org.apache.hadoop.hbase.master.TestAssignmentManager.AssignmentManagerWithExtrasForTesting; +import org.apache.hadoop.hbase.master.metrics.MasterMetrics; +import org.apache.hadoop.hbase.monitoring.TaskMonitor; import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.regionserver.RegionScanner; +import org.apache.hadoop.hbase.tmpl.common.TaskMonitorTmpl; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.FSTableDescriptors; import org.apache.hadoop.hbase.util.JVMClusterUtil; @@ -50,13 +64,130 @@ import 
org.apache.hadoop.hbase.zookeeper.ZKAssign; import org.apache.hadoop.hbase.zookeeper.ZKTable; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; +import org.apache.zookeeper.KeeperException; import org.junit.Test; import org.junit.experimental.categories.Category; +import org.mockito.Mockito; +import org.mockito.internal.util.reflection.Whitebox; @Category(LargeTests.class) public class TestMasterFailover { private static final Log LOG = LogFactory.getLog(TestMasterFailover.class); + /** + * Test to see how the ROOT and META assignment happens on master failover + * when the ROOT node is found in RIT. Verifies HBASE-5875 + * @throws Exception + */ + @Test(timeout = 180000) + public void testAssignRootAndMetaOnMasterFailOver() throws Exception { + Configuration conf = HBaseConfiguration.create(); + HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf); + try { + LOG.info("Starting testAssignRootAndMetaOnMasterFailOver"); + final int NUM_MASTERS = 1; + final int NUM_RS = 1; + + conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000); + conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 8000); + + // Start the cluster + TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS); + + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + + HRegionServer regionServer = cluster.getRegionServer(0); + + cluster.waitForActiveAndReadyMaster(); + HMaster master = cluster.getMaster(); + + // do not enable the server ShutdownHandler + master.setServerShutdownHandlerEnabled(false); + + // Mock the AM and the CatalogTracker. 
+ AssignmentManagerWithExtrasForTesting am = setUpMockedAssignmentManager( + master, master.getServerManager(), regionServer); + Whitebox.setInternalState(master, "assignmentManager", am); + + try { + RootLocationEditor.setRootLocation(master.getZooKeeperWatcher(), + regionServer.getServerName()); + master.assignRootAndMeta(TaskMonitor.get().createStatus("assigning root")); + + Map<ServerName, List<HRegionInfo>> assignments = master.getAssignmentManager() + .getAssignments(); + + // Wait till the assignments are done for ROOT and META + while (assignments.get(regionServer.getServerName()).size() != 2) { + Thread.sleep(10); + } + } finally { + master.setServerShutdownHandlerEnabled(true); + Whitebox.setInternalState(master, "assignmentManager", master.assignmentManager); + Whitebox.setInternalState(master, "catalogTracker", master.getCatalogTracker()); + } + } finally { + TEST_UTIL.shutdownMiniCluster(); + } + + } + + /** + * Builds the AssignmentManager for the given master, backed by a CatalogTracker whose + * connection fails every getRegionInfo call with NotServingRegionException. The tracker + * is also set as the master's catalogTracker field. + * @param rs region server whose name the tracker reports as the .META. location + */ + private AssignmentManagerWithExtrasForTesting setUpMockedAssignmentManager( + HMaster server, ServerManager manager, final HRegionServer rs) throws IOException, + KeeperException { + // Make an RS Interface implementation whose getRegionInfo always throws + // NotServingRegionException, and a connection that hands that + // implementation out for any host and port. The CatalogTracker created + // below uses this connection, so verifying the root location through it + // ends in the NSRE path that HBASE-5875 is about. 
+ HRegionInterface ri = Mockito.mock(HRegionInterface.class); + HConnection connection = Mockito.mock(HConnection.class); + NotServingRegionException nsre = new NotServingRegionException("Root region not available."); + Mockito.when(connection.getHRegionConnection(Mockito.anyString(), Mockito.anyInt())) + .thenReturn(ri); + Mockito.when(ri.getRegionInfo((byte[]) Mockito.any())).thenThrow(nsre); + + // Reuse the master's executor. Used by AM handling zk callbacks. + ExecutorService executor = server.getExecutorService(); + LoadBalancer balancer = LoadBalancerFactory.getLoadBalancer(server.getConfiguration()); + // Create an abortable object + Abortable abortable = new Abortable() { + @Override + public void abort(String why, Throwable e) { + LOG.info(why, e); + } + + @Override + public boolean isAborted() { + return false; + } + }; + CatalogTracker ct = new CatalogTracker(server.getZooKeeperWatcher(), server.getConfiguration(), connection, + abortable, 0) { + @Override + public boolean verifyMetaRegionLocation(long timeout) throws InterruptedException, + IOException { + return true; + } + + @Override + public ServerName getMetaLocation() { + return rs.getServerName(); + } + }; + // set the internal state for the catalog tracker. + Whitebox.setInternalState(server, "catalogTracker", ct); + AssignmentManagerWithExtrasForTesting am = new AssignmentManagerWithExtrasForTesting(server, + manager, ct, balancer, executor); + return am; + } + @Test (timeout=180000) public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState() throws Exception {