Index: src/test/java/org/apache/hadoop/hbase/master/TestKillingServersFromMaster.java =================================================================== --- src/test/java/org/apache/hadoop/hbase/master/TestKillingServersFromMaster.java (revision 0) +++ src/test/java/org/apache/hadoop/hbase/master/TestKillingServersFromMaster.java (revision 0) @@ -0,0 +1,136 @@ +/** + * Copyright 2010 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.hadoop.hbase.master;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HServerAddress;
+import org.apache.hadoop.hbase.HServerInfo;
+import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.YouAreDeadException;
+import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer;
+import org.apache.hadoop.hbase.util.JVMClusterUtil;
+import org.apache.hadoop.hbase.util.Threads;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class TestKillingServersFromMaster {
+  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static MiniHBaseCluster CLUSTER;
+
+  @BeforeClass
+  public static void beforeAllTests() throws Exception {
+    TEST_UTIL.startMiniCluster(2);
+    CLUSTER = TEST_UTIL.getHBaseCluster();
+  }
+
+  @AfterClass
+  public static void afterAllTests() throws IOException {
+    TEST_UTIL.shutdownMiniCluster();
+  }
+
+  @Before
+  public void setup() throws IOException {
+    TEST_UTIL.ensureSomeRegionServersAvailable(2);
+  }
+
+  /**
+   * Test that when a region server reports with a new start code, the master
+   * adds the old server instance to its dead servers and does not let the
+   * new instance in. See HBASE-2613
+   * @throws Exception
+   */
+  @Test (timeout=180000)
+  public void testRsReportsWrongStartCode() throws Exception {
+    JVMClusterUtil.RegionServerThread rst =
+      CLUSTER.getLiveRegionServerThreads().get(0);
+    MiniHBaseClusterRegionServer rs =
+      (MiniHBaseClusterRegionServer)rst.getRegionServer();
+    HServerInfo hsi = rs.getServerInfo();
+    // This constructor creates a new startcode
+    HServerInfo newHsi = new HServerInfo(hsi.getServerAddress(),
+      hsi.getInfoPort(), hsi.getHostname());
+    rs.setHServerInfo(newHsi);
+    // The old server should have been added to deadservers; cycle till we see it
+    while (!CLUSTER.getMaster().getServerManager().getDeadServers().
+        contains(hsi.getServerName())) {
+      Threads.sleep(1);
+    }
+    // The processing of deadservers cannot succeed because lease is still held
+    // on the regionserver's WAL, so assert the new server has not been let in.
+    // NOTE(review): confirm getOnlineServers() is keyed by HServerInfo, not name.
+    assertFalse(CLUSTER.getMaster().getServerManager().getOnlineServers().
+      containsKey(newHsi));
+    // Now abort. Need to abort because server is stuck trying to get WAL lease.
+    rs.abort("Done with test");
+    // Make sure this server is down before we move to the next test.
+    rst.join();
+  }
+
+  /**
+   * Test that a region server that reports with the wrong address
+   * gets shut down
+   * See HBASE-2613
+   * @throws Exception
+   */
+  // HBASE-2613 'Remove the code around MSG_CALL_SERVER_STARTUP' made it so that
+  // if an RS w/ host+port+startcode hasn't come in via report-for-duty, then we
+  // tell it to stop. Well, that won't work anymore now that we have masters
+  // joining already running clusters; we can't expect regionserver to go back
+  // to report-for-duty. Now we just register anyone who checks in (if same
+  // host+port as a server we already know AND the new server's startcode is
+  // in advance of the startcode we already have for this host+port, then we'll
+  // tell new server to hold and trigger expire on the server we have). See
+  // over in ServerManager. So, leaving this test disabled/ignored.
+  @Ignore @Test (timeout=180000)
+  public void testRsReportsWrongAddress() throws Exception {
+    JVMClusterUtil.RegionServerThread rst =
+      CLUSTER.getLiveRegionServerThreads().get(0);
+    MiniHBaseClusterRegionServer rs =
+      (MiniHBaseClusterRegionServer)rst.getRegionServer();
+    assertFalse(rs.isStopped());
+    rs.getHServerInfo().setServerAddress(new HServerAddress("0.0.0.0", 60010));
+    rst.join();
+    assertEquals(1, CLUSTER.getLiveRegionServerThreads().size());
+  }
+
+  /**
+   * Send a YouAreDeadException to the region server and expect it to shut down
+   * See HBASE-2691
+   * @throws Exception
+   */
+  @Test (timeout=180000)
+  public void testSendYouAreDead() throws Exception {
+    JVMClusterUtil.RegionServerThread rst =
+      CLUSTER.getLiveRegionServerThreads().get(0);
+    MiniHBaseClusterRegionServer rs =
+      (MiniHBaseClusterRegionServer)rst.getRegionServer();
+    CLUSTER.addExceptionToSendRegionServer(rs, new YouAreDeadException("bam!"));
+    rst.join();
+    assertEquals(1, CLUSTER.getLiveRegionServerThreads().size());
+  }
+}
\ No newline at end of file
Index: src/test/java/org/apache/hadoop/hbase/master/BROKE_FIX_TestKillingServersFromMaster.java
===================================================================
--- src/test/java/org/apache/hadoop/hbase/master/BROKE_FIX_TestKillingServersFromMaster.java (revision 1027809)
+++ src/test/java/org/apache/hadoop/hbase/master/BROKE_FIX_TestKillingServersFromMaster.java (working copy)
@@ -1,103 +0,0 @@
-/**
- * Copyright 2010 The Apache Software Foundation
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.
You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hbase.master;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-
-import org.apache.hadoop.hbase.HBaseTestingUtility;
-import org.apache.hadoop.hbase.HServerAddress;
-import org.apache.hadoop.hbase.HServerInfo;
-import org.apache.hadoop.hbase.MiniHBaseCluster;
-import org.apache.hadoop.hbase.YouAreDeadException;
-import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer;
-import org.junit.AfterClass;
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.Ignore;
-import org.junit.Test;
-
-public class BROKE_FIX_TestKillingServersFromMaster {
-  private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
-  private static MiniHBaseCluster cluster;
-
-  @BeforeClass
-  public static void beforeAllTests() throws Exception {
-    TEST_UTIL.startMiniCluster(2);
-    cluster = TEST_UTIL.getHBaseCluster();
-  }
-
-  @AfterClass
-  public static void afterAllTests() throws IOException {
-    TEST_UTIL.shutdownMiniCluster();
-  }
-
-  @Before
-  public void setup() throws IOException {
-    TEST_UTIL.ensureSomeRegionServersAvailable(2);
-  }
-
-  /**
-   * Test that a region server that reports with the wrong start code
-   * gets shut down
-   * See HBASE-2613
-   * @throws Exception
-   */
-  @Ignore @Test (timeout=180000)
-  public void testRsReportsWrongStartCode() throws Exception {
-    MiniHBaseClusterRegionServer firstServer =
-      (MiniHBaseClusterRegionServer)cluster.getRegionServer(0);
-    HServerInfo hsi = firstServer.getServerInfo();
-    // This constructor creates a new startcode
-    firstServer.setHServerInfo(new HServerInfo(hsi.getServerAddress(),
-      hsi.getInfoPort(), hsi.getHostname()));
-    cluster.waitOnRegionServer(0);
-    assertEquals(1, cluster.getLiveRegionServerThreads().size());
-  }
-
-  /**
-   * Test that a region server that reports with the wrong address
-   * gets shut down
-   * See HBASE-2613
-   * @throws Exception
-   */
-  @Ignore @Test (timeout=180000)
-  public void testRsReportsWrongAddress() throws Exception {
-    MiniHBaseClusterRegionServer firstServer =
-      (MiniHBaseClusterRegionServer)cluster.getRegionServer(0);
-    firstServer.getHServerInfo().setServerAddress(
-      new HServerAddress("0.0.0.0", 60010));
-    cluster.waitOnRegionServer(0);
-    assertEquals(1, cluster.getLiveRegionServerThreads().size());
-  }
-
-  /**
-   * Send a YouAreDeadException to the region server and expect it to shut down
-   * See HBASE-2691
-   * @throws Exception
-   */
-  @Ignore @Test (timeout=180000)
-  public void testSendYouAreDead() throws Exception {
-    cluster.addExceptionToSendRegionServer(0, new YouAreDeadException("bam!"));
-    cluster.waitOnRegionServer(0);
-    assertEquals(1, cluster.getLiveRegionServerThreads().size());
-  }
-}
\ No newline at end of file
Index: src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (revision 1027809)
+++ src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (working copy)
@@ -120,6 +120,7 @@
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.net.DNS;
 import org.apache.zookeeper.KeeperException;
 
@@ -521,8 +522,10 @@
             calledCloseUserRegions = true;
           }
         }
-        // Try to get the root region location from zookeeper.
-        this.catalogTracker.waitForRoot();
+        // Try to get the root region location from zookeeper. If not available,
+        // continue -- we may have been shutdown.
+        if (!this.catalogTracker.isRootAvailable(1000)) continue;
+
         long now = System.currentTimeMillis();
         // Drop into the send loop if msgInterval has elapsed or if something
         // to send. If we fail talking to the master, then we'll sleep below
@@ -658,10 +661,16 @@
             outboundMessages.toArray(HMsg.EMPTY_HMSG_ARRAY),
             getMostLoadedRegions());
         break;
-      } catch (IOException ioe) {
-        // Couldn't connect to the master, get location from zk and reconnect
-        // Method blocks until new master is found or we are stopped
-        getMaster();
+      } catch (RemoteException e) {
+        // Server-side failure; unwrap to recover the original exception type.
+        IOException ioe = e.unwrapRemoteException();
+        if (ioe instanceof YouAreDeadException) {
+          // Let it out. Caller knows how to deal.
+          throw (YouAreDeadException) ioe;
+        }
+        handleReportFailure(ioe);
+      } catch (IOException e) {
+        handleReportFailure(e);
       }
     }
     updateOutboundMsgs(outboundMessages);
@@ -679,6 +688,17 @@
     return outboundMessages;
   }
 
+  /**
+   * Handles a failed report to the master. Presumes we could not connect, so
+   * gets the master location from zk and reconnects. Blocks until a new master
+   * is found or we are stopped.
+   * @param ioe the failure that interrupted the report
+   */
+  private void handleReportFailure(final IOException ioe) {
+    LOG.debug("Failed report because of " + ioe.getMessage() + "; retrying");
+    getMaster();
+  }
+
   private HServerLoad buildServerLoad() {
     MemoryUsage memory = ManagementFactory.getMemoryMXBean().getHeapMemoryUsage();
     HServerLoad hsl = new HServerLoad(requestCount.get(),
Index: src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (revision 1027809)
+++ src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (working copy)
@@ -50,10 +50,8 @@
 import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
 import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
 import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException;
-import org.apache.hadoop.hbase.util.Pair;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.zookeeper.KeeperException;
 
 /**
  * The ServerManager class manages info about region servers - HServerInfo,
@@ -173,8 +171,9 @@
     HServerInfo existingServer =
       haveServerWithSameHostAndPortAlready(serverInfo.getHostnamePort());
     if (existingServer != null) {
-      String message = "Server start rejected; we already have " + hostAndPort +
-        " registered; existingServer=" + existingServer + ", newServer=" + serverInfo;
+      String message = "Server " + serverInfo.getServerName() + " start " +
+        "rejected; we already have " + hostAndPort +
+        " registered; existingServer=" + existingServer.getServerName();
       LOG.info(message);
       if (existingServer.getStartCode() < serverInfo.getStartCode()) {
         LOG.info("Triggering server recovery; existingServer " +
Index: src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (revision 1027809)
+++
src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (working copy)
@@ -637,6 +637,11 @@
         region.getRegionNameAsString());
       return;
     }
+    if (this.master.isStopped()) {
+      LOG.info("This server is stopped, skipping assign of " +
+        region.getRegionNameAsString());
+      return;
+    }
     RegionState state = addToRegionsInTransition(region);
     synchronized (state) {
       assign(state);
Index: src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (revision 1027809)
+++ src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (working copy)
@@ -89,6 +89,11 @@
   @Override
   public void process() throws IOException {
     final String serverName = this.hsi.getServerName();
+    if (this.server.isStopped()) {
+      LOG.info("Skipping out on server shutdown processing of " + serverName +
+        " because this server has been stopped");
+      return;
+    }
 
     LOG.info("Splitting logs for " + serverName);
     this.services.getMasterFileSystem().splitLog(serverName);
Index: src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java (revision 1027809)
+++ src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java (working copy)
@@ -197,6 +197,23 @@
   }
 
   /**
+   * Checks whether -ROOT- is available, waiting up to the specified timeout
+   * if it is not immediately available. A NotAllMetaRegionsOnlineException
+   * from the wait is reported as <code>false</code> rather than rethrown.
+   * @param timeout maximum time to wait for root availability, in milliseconds
+   * @return True if root available.
+   * @throws InterruptedException if interrupted while waiting
+   */
+  public boolean isRootAvailable(final long timeout)
+  throws InterruptedException {
+    try {
+      return waitForRoot(timeout) != null;
+    } catch(NotAllMetaRegionsOnlineException e) {
+      return false;
+    }
+  }
+
+  /**
    * Gets a connection to the server hosting root, as reported by ZooKeeper,
    * waiting up to the specified timeout for availability.
    * @see #waitForRoot(long) for additional information