Index: src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerAbort.java =================================================================== --- src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerAbort.java (revision 581345) +++ src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerAbort.java (working copy) @@ -25,8 +25,6 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Text; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; /** * Tests region server failover when a region server exits. @@ -41,9 +39,6 @@ conf.setInt("ipc.client.timeout", 5000); // reduce client timeout conf.setInt("ipc.client.connect.max.retries", 5); // and number of retries conf.setInt("hbase.client.retries.number", 5); // reduce HBase retries - Logger.getRootLogger().setLevel(Level.WARN); - Logger.getLogger(this.getClass().getPackage().getName()). - setLevel(Level.DEBUG); } /** @@ -92,8 +87,10 @@ } LOG.info("Success!"); } finally { - LOG.info("Closing scanner " + scanner); - scanner.close(); + if (scanner != null) { + LOG.info("Closing scanner " + scanner); + scanner.close(); + } } } } \ No newline at end of file Index: src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestSplit.java =================================================================== --- src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestSplit.java (revision 581345) +++ src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestSplit.java (working copy) @@ -153,21 +153,26 @@ /** * Test that a region is cleaned up after its daughter splits release all * references. - * @throws IOException + * @throws Exception */ - public void testSplitRegionIsDeleted() throws IOException { - // Start up a hbase cluster - MiniHBaseCluster cluster = new MiniHBaseCluster(conf, 1, true); + public void testSplitRegionIsDeleted() throws Exception { try { - // Create a table. - HBaseAdmin admin = new HBaseAdmin(this.conf); - admin.createTable(createTableDescriptor(getName())); - // This builds a multi-region table by splitting. It will assert - // the parent region gets cleaned-up. - MultiRegionTable.makeMultiRegionTable(conf, cluster, - this.localFs, getName(), COLFAMILY_NAME3); - } finally { - cluster.shutdown(); + // Start up a hbase cluster + MiniHBaseCluster cluster = new MiniHBaseCluster(conf, 1, true); + try { + // Create a table. + HBaseAdmin admin = new HBaseAdmin(this.conf); + admin.createTable(createTableDescriptor(getName())); + // This builds a multi-region table by splitting. It will assert + // the parent region gets cleaned-up. + MultiRegionTable.makeMultiRegionTable(conf, cluster, + this.localFs, getName(), COLFAMILY_NAME3); + } finally { + cluster.shutdown(); + } + } catch (Exception e) { + LOG.error("test failed", e); + throw e; } } Index: src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java =================================================================== --- src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java (revision 581345) +++ src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java (working copy) @@ -99,6 +99,7 @@ int numRetries; protected final int threadWakeFrequency; private final int msgInterval; + private final int serverLeaseTimeout; // Remote HMaster private final HMasterRegionInterface hbaseMaster; @@ -384,6 +385,8 @@ this.numRetries = conf.getInt("hbase.client.retries.number", 2); this.threadWakeFrequency = conf.getInt(THREAD_WAKE_FREQUENCY, 10 * 1000); this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000); + this.serverLeaseTimeout = + conf.getInt("hbase.master.lease.period", 30 * 1000); // Cache flushing chore thread. this.cacheFlusherThread = @@ -427,11 +430,20 @@ try { init(reportForDuty()); + long lastMsg = 0; while(!stopRequested.get()) { - long lastMsg = 0; // Now ask master what it wants us to do and tell it what we have done for (int tries = 0; !stopRequested.get();) { - if ((System.currentTimeMillis() - lastMsg) >= msgInterval) { + long now = System.currentTimeMillis(); + if (lastMsg != 0 && (now - lastMsg) >= serverLeaseTimeout) { + // It has been way too long since we last reported to the master. + // Commit suicide. + LOG.fatal("unable to report to master for " + (now - lastMsg) + + " milliseconds - aborting server"); + abort(); + break; + } + if ((now - lastMsg) >= msgInterval) { HMsg outboundArray[] = null; synchronized(outboundMsgs) { outboundArray = @@ -514,9 +526,10 @@ stop(); } } - } // while (!stopRequested.get()) + } + this.sleeper.sleep(lastMsg); - } + } // while (!stopRequested.get()) } } catch (Throwable t) { LOG.fatal("Unhandled exception. Aborting...", t); @@ -743,12 +756,13 @@ LOG.debug("Telling master we are up"); } MapWritable result = null; + long lastMsg = 0; while(!stopRequested.get()) { - long lastMsg = 0; try { this.requestCount.set(0); this.serverInfo.setLoad(new HServerLoad(0, onlineRegions.size())); result = this.hbaseMaster.regionServerStartup(serverInfo); + lastMsg = System.currentTimeMillis(); if (LOG.isDebugEnabled()) { LOG.debug("Done telling master we are up"); }