Index: src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestMasterAbort.java
===================================================================
--- src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestMasterAbort.java (revision 0)
+++ src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestMasterAbort.java (revision 0)
@@ -0,0 +1,69 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * Aborts the master of the test cluster and then waits long enough for the
+ * region server to notice that it can no longer report in and stop itself.
+ * NOTE: the test only waits; it makes no assertion about the region
+ * server's final state.
+ */
+public class TestMasterAbort extends HBaseClusterTestCase {
+  private final Log LOG = LogFactory.getLog(this.getClass().getName());
+  private final int leasePeriod;
+
+  /** constructor */
+  public TestMasterAbort() {
+    super();
+    conf.setInt("ipc.client.timeout", 5000); // reduce client timeout
+    conf.setInt("ipc.client.connect.max.retries", 5); // and number of retries
+    conf.setInt("hbase.client.retries.number", 5); // reduce HBase retries
+    this.leasePeriod = conf.getInt("hbase.master.lease.period", 30 * 1000);
+  }
+
+  /**
+   * Aborts the master, then sleeps for one and a half lease periods.
+   * Any exception is logged and rethrown so that the test fails.
+   *
+   * @throws Exception
+   */
+  public void testMasterAbort() throws Exception {
+    try {
+      // When the META table can be opened, the region servers are running
+      @SuppressWarnings("unused")
+      HTable meta = new HTable(conf, HConstants.META_TABLE_NAME);
+      // Tell the master to stop talking to the region server
+      cluster.getMaster().abort();
+      // Now wait for the region server to stop itself
+      try {
+        Thread.sleep(this.leasePeriod + (this.leasePeriod / 2));
+      } catch (InterruptedException e) {
+        // continue
+      }
+    } catch (Exception e) {
+      LOG.error("unexpected exception", e);
+      throw e;
+    }
+  }
+}
Index: src/contrib/hbase/src/test/org/apache/hadoop/hbase/MiniHBaseCluster.java
===================================================================
--- src/contrib/hbase/src/test/org/apache/hadoop/hbase/MiniHBaseCluster.java (revision 581113)
+++ src/contrib/hbase/src/test/org/apache/hadoop/hbase/MiniHBaseCluster.java (working copy)
@@ -279,6 +279,13 @@
   }
 
   /**
+   * @return reference to master object
+   */
+  public HMaster getMaster() {
+    return this.masterThread.getMaster();
+  }
+
+  /**
    * @return Returns the rpc address actually used by the master server, because
    * the supplied port is not necessarily the actual port used.
    */
Index: src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerAbort.java
===================================================================
--- src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerAbort.java (revision 581113)
+++ src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerAbort.java (working copy)
@@ -25,8 +25,6 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.io.Text;
-import org.apache.log4j.Level;
-import org.apache.log4j.Logger;
 
 /**
  * Tests region server failover when a region server exits.
@@ -41,9 +39,6 @@
     conf.setInt("ipc.client.timeout", 5000); // reduce client timeout
     conf.setInt("ipc.client.connect.max.retries", 5); // and number of retries
     conf.setInt("hbase.client.retries.number", 5); // reduce HBase retries
-    Logger.getRootLogger().setLevel(Level.WARN);
-    Logger.getLogger(this.getClass().getPackage().getName()).
-      setLevel(Level.DEBUG);
   }
 
   /**
@@ -92,8 +87,10 @@
       }
       LOG.info("Success!");
     } finally {
-      LOG.info("Closing scanner " + scanner);
-      scanner.close();
+      if (scanner != null) {
+        LOG.info("Closing scanner " + scanner);
+        scanner.close();
+      }
     }
   }
 }
\ No newline at end of file
Index: src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestSplit.java
===================================================================
--- src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestSplit.java (revision 581113)
+++ src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestSplit.java (working copy)
@@ -153,21 +153,26 @@
   /**
    * Test that a region is cleaned up after its daughter splits release all
    * references.
-   * @throws IOException
+   * @throws Exception
    */
-  public void testSplitRegionIsDeleted() throws IOException {
-    // Start up a hbase cluster
-    MiniHBaseCluster cluster = new MiniHBaseCluster(conf, 1, true);
+  public void testSplitRegionIsDeleted() throws Exception {
     try {
-      // Create a table.
-      HBaseAdmin admin = new HBaseAdmin(this.conf);
-      admin.createTable(createTableDescriptor(getName()));
-      // This builds a multi-region table by splitting. It will assert
-      // the parent region gets cleaned-up.
-      MultiRegionTable.makeMultiRegionTable(conf, cluster,
-        this.localFs, getName(), COLFAMILY_NAME3);
-    } finally {
-      cluster.shutdown();
+      // Start up a hbase cluster
+      MiniHBaseCluster cluster = new MiniHBaseCluster(conf, 1, true);
+      try {
+        // Create a table.
+        HBaseAdmin admin = new HBaseAdmin(this.conf);
+        admin.createTable(createTableDescriptor(getName()));
+        // This builds a multi-region table by splitting. It will assert
+        // the parent region gets cleaned-up.
+        MultiRegionTable.makeMultiRegionTable(conf, cluster,
+          this.localFs, getName(), COLFAMILY_NAME3);
+      } finally {
+        cluster.shutdown();
+      }
+    } catch (Exception e) {
+      LOG.error("test failed", e);
+      throw e;
     }
   }
 
Index: src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java
===================================================================
--- src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java (revision 581113)
+++ src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java (working copy)
@@ -99,6 +99,7 @@
   int numRetries;
   protected final int threadWakeFrequency;
   private final int msgInterval;
+  private final int serverLeaseTimeout;
 
   // Remote HMaster
   private final HMasterRegionInterface hbaseMaster;
@@ -384,6 +385,8 @@
     this.numRetries = conf.getInt("hbase.client.retries.number", 2);
     this.threadWakeFrequency = conf.getInt(THREAD_WAKE_FREQUENCY, 10 * 1000);
     this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000);
+    this.serverLeaseTimeout =
+      conf.getInt("hbase.master.lease.period", 30 * 1000);
 
     // Cache flushing chore thread.
     this.cacheFlusherThread =
@@ -427,11 +430,20 @@
 
     try {
       init(reportForDuty());
+      long lastMsg = 0;
       while(!stopRequested.get()) {
-        long lastMsg = 0;
         // Now ask master what it wants us to do and tell it what we have done
         for (int tries = 0; !stopRequested.get();) {
-          if ((System.currentTimeMillis() - lastMsg) >= msgInterval) {
+          long now = System.currentTimeMillis();
+          if (lastMsg != 0 && (now - lastMsg) >= serverLeaseTimeout) {
+            // It has been way too long since we last reported to the master.
+            // Commit suicide.
+            LOG.fatal("unable to report to master for " + (now - lastMsg) +
+                " milliseconds - aborting server");
+            abort();
+            break;
+          }
+          if ((now - lastMsg) >= msgInterval) {
             HMsg outboundArray[] = null;
             synchronized(outboundMsgs) {
               outboundArray =
@@ -514,9 +526,10 @@
                 stop();
               }
             }
-          } // while (!stopRequested.get())
+          }
+
           this.sleeper.sleep(lastMsg);
-        }
+        } // while (!stopRequested.get())
       }
     } catch (Throwable t) {
       LOG.fatal("Unhandled exception. Aborting...", t);
@@ -743,12 +756,13 @@
       LOG.debug("Telling master we are up");
     }
     MapWritable result = null;
+    long lastMsg = 0;
     while(!stopRequested.get()) {
-      long lastMsg = 0;
       try {
         this.requestCount.set(0);
         this.serverInfo.setLoad(new HServerLoad(0, onlineRegions.size()));
         result = this.hbaseMaster.regionServerStartup(serverInfo);
+        lastMsg = System.currentTimeMillis();
         if (LOG.isDebugEnabled()) {
           LOG.debug("Done telling master we are up");
         }
Index: src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java
===================================================================
--- src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java (revision 581113)
+++ src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java (working copy)
@@ -89,7 +89,9 @@
   // plain boolean because want to pass a reference to supporting threads
   // started here in HMaster rather than have them have to know about the
   // hosting class
-  volatile AtomicBoolean closed = new AtomicBoolean(true);
+  AtomicBoolean closed = new AtomicBoolean(true);
+  private final int leaseTimeout;
+  volatile boolean abortRequested = false;
   volatile boolean fsOk;
   Path dir;
   Configuration conf;
@@ -862,9 +864,9 @@
       conf.getLong("hbase.hbasemaster.maxregionopen", 30 * 1000);
 
     this.msgQueue = new LinkedBlockingQueue();
-    
-    this.serverLeases = new Leases(
-        conf.getInt("hbase.master.lease.period", 30 * 1000),
+
+    this.leaseTimeout = conf.getInt("hbase.master.lease.period", 30 * 1000);
+    this.serverLeases = new Leases(leaseTimeout,
         conf.getInt("hbase.master.lease.thread.wakefrequency", 15 * 1000));
 
     this.server = RPC.getServer(this, address.getBindAddress(),
@@ -934,6 +936,12 @@
     LOG.info("HMaster initialized on " + this.address.toString());
   }
 
+  // For debugging purposes. Kill the master.
+
+  void abort() {
+    this.closed.set(true);
+    this.abortRequested = true;
+  }
   /**
    * Checks to see if the file system is still accessible.
    * If not, sets closed
@@ -1126,6 +1134,7 @@
    * HMasterRegionInterface
    */
 
+  /** {@inheritDoc} */
   @SuppressWarnings("unused")
   public MapWritable regionServerStartup(HServerInfo serverInfo)
   throws IOException {
@@ -1199,6 +1208,19 @@
   /** {@inheritDoc} */
   public HMsg[] regionServerReport(HServerInfo serverInfo, HMsg msgs[])
   throws IOException {
+
+    if (abortRequested) {
+      // We are going to shut down the master out from under the region servers
+      // as a test to verify that they shut themselves off.
+      //
+      // First we want to ignore the region server for a lease period, so it
+      // will shut itself down.
+      try {
+        Thread.sleep(leaseTimeout);
+      } catch (InterruptedException e) {
+      }
+      return new HMsg[0];
+    }
     String serverName = serverInfo.getServerAddress().toString().trim();
     long serverLabel = getServerLabel(serverName);
 
Index: src/contrib/hbase/src/java/org/apache/hadoop/hbase/util/Sleeper.java
===================================================================
--- src/contrib/hbase/src/java/org/apache/hadoop/hbase/util/Sleeper.java (revision 581113)
+++ src/contrib/hbase/src/java/org/apache/hadoop/hbase/util/Sleeper.java (working copy)
@@ -31,6 +31,10 @@
   private final int period;
   private AtomicBoolean stop;
 
+  /**
+   * @param sleep
+   * @param stop
+   */
   public Sleeper(final int sleep, final AtomicBoolean stop) {
     this.period = sleep;
     this.stop = stop;