Index: src/test/org/apache/hadoop/hbase/MiniHBaseCluster.java =================================================================== --- src/test/org/apache/hadoop/hbase/MiniHBaseCluster.java (revision 940322) +++ src/test/org/apache/hadoop/hbase/MiniHBaseCluster.java (working copy) @@ -106,13 +106,29 @@ } } + /** + * Subclass so can get at the kill method. + */ + public static class MiniHBaseClusterRegionServer extends HRegionServer { + public MiniHBaseClusterRegionServer(HBaseConfiguration conf) + throws IOException { + super(conf); + } + + @Override + public void kill() { + super.kill(); + } + } + private void init(final int nRegionNodes) throws IOException { try { // start up a LocalHBaseCluster while (true) { try { hbaseCluster = new LocalHBaseCluster(conf, nRegionNodes, - MiniHBaseCluster.MiniHBaseClusterMaster.class); + MiniHBaseCluster.MiniHBaseClusterMaster.class, + MiniHBaseCluster.MiniHBaseClusterRegionServer.class); hbaseCluster.startup(); } catch (BindException e) { //this port is already in use. try to use another (for multiple testing) @@ -134,13 +150,13 @@ * Starts a region server thread running * * @throws IOException - * @return Name of regionserver started. + * @return New RegionServerThread */ - public String startRegionServer() throws IOException { + public JVMClusterUtil.RegionServerThread startRegionServer() throws IOException { JVMClusterUtil.RegionServerThread t = this.hbaseCluster.addRegionServer(); t.start(); t.waitForServerOnline(); - return t.getName(); + return t; } /** @@ -172,6 +188,21 @@ } /** + * Cause a region server to exit doing no clean on way out (other than close + * socket). + * @param serverNumber Used as index into a list. + */ + public String killRegionServer(int serverNumber) { + MiniHBaseClusterRegionServer server = + (MiniHBaseClusterRegionServer)getRegionServer(serverNumber); + // Don't run hdfs shutdown thread. + server.setHDFSShutdownThreadOnExit(null); + LOG.info("Killing " + server.toString()); + server.kill(); + return server.toString(); + } + + /** * Shut down the specified region server cleanly * * @param serverNumber Used as index into a list. @@ -290,7 +321,21 @@ public void addMessageToSendRegionServer(final int serverNumber, final HMsg msg) throws IOException { - HRegionServer hrs = getRegionServer(serverNumber); + MiniHBaseClusterRegionServer hrs = + (MiniHBaseClusterRegionServer)getRegionServer(serverNumber); + addMessageToSendRegionServer(hrs, msg); + } + + /** + * Add a message to include in the responses send a regionserver when it + * checks back in. + * @param hrs Which region server. + * @param msg The MESSAGE + * @throws IOException + */ + public void addMessageToSendRegionServer(final MiniHBaseClusterRegionServer hrs, + final HMsg msg) + throws IOException { ((MiniHBaseClusterMaster)getMaster()).addMessage(hrs.getHServerInfo(), msg); } } \ No newline at end of file Index: src/test/org/apache/hadoop/hbase/master/TestMasterTransistions.java =================================================================== --- src/test/org/apache/hadoop/hbase/master/TestMasterTransistions.java (revision 940322) +++ src/test/org/apache/hadoop/hbase/master/TestMasterTransistions.java (working copy) @@ -19,9 +19,10 @@ */ package org.apache.hadoop.hbase.master; -import static org.junit.Assert.*; +import static org.junit.Assert.assertTrue; import java.io.IOException; +import java.util.Collection; import java.util.Set; import java.util.concurrent.CopyOnWriteArraySet; @@ -30,19 +31,23 @@ import org.apache.hadoop.hbase.HMsg; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HServerAddress; +import org.apache.hadoop.hbase.HServerInfo; import org.apache.hadoop.hbase.MiniHBaseCluster; +import org.apache.hadoop.hbase.MiniHBaseCluster.MiniHBaseClusterRegionServer; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Threads; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; +import org.mortbay.log.Log; /** * Test transitions of state across the master. @@ -72,6 +77,114 @@ } /** + * HBase2482 is about outstanding region openings. If any are outstanding + * when a regionserver goes down, then they'll never deploy. They'll be + * stuck in the regions-in-transition list for ever. + */ + static class HBase2482Listener implements RegionServerOperationListener { + private final HRegionServer victim; + private boolean killSent = false; + private volatile boolean closed = false; + private final Collection copyOfOnlineRegions; + + public HBase2482Listener(final HRegionServer victim) { + this.victim = victim; + this.copyOfOnlineRegions = + this.victim.getCopyOfOnlineRegionsSortedBySize().values(); + } + + @Override + public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { + if (!victim.getServerInfo().equals(serverInfo) || + incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN) || + this.killSent) { + return true; + } + Log.info("KILLING " + this.victim); + ((MiniHBaseCluster.MiniHBaseClusterRegionServer)this.victim).kill(); + this.killSent = true; + return true; + } + + @Override + public boolean process(RegionServerOperation op) throws IOException { + return true; + } + + @Override + public void processed(RegionServerOperation op) { + if (!this.closed && op instanceof ProcessRegionClose) { + ProcessRegionClose close = (ProcessRegionClose)op; + for (HRegion r: this.copyOfOnlineRegions) { + if (r.getRegionInfo().equals(close.regionInfo)) { + // We've closed one of the regions that was on the victim server. + // Now we can start testing for when all regions are back online + // again + this.closed = true; + break; + } + } + } + } + } + + /** + * In 2482, a RS with an opening region on it dies. The region gets stuck + * in the master's regions-in-transition and never leaves it. + * @see HBASE-2482 + */ + @Test public void testKillRSWithOpeningRegion2482() throws Exception { + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + int countOfMetaRegions = countOfMetaRegions(); + // Add listener. + HMaster m = cluster.getMaster(); + MiniHBaseClusterRegionServer hrs = + (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer(); + hrs.setHDFSShutdownThreadOnExit(null); + HBase2482Listener listener = new HBase2482Listener(hrs); + m.getRegionServerOperationQueue(). + registerRegionServerOperationListener(listener); + try { + // Wait until some regions are open on the new server. + while (hrs.getOnlineRegions().size() < 2) Threads.sleep(100); + // Go close all non-catalog regions on new server + closeAlltNonCatalogRegions(cluster, hrs); + // After all the closes, add a blocking message before the opens start to + // come in. + cluster.addMessageToSendRegionServer(0, + new HMsg(HMsg.Type.TESTING_MSG_BLOCK_RS)); + // Wait till region closed before we start wait on all regions back online. + while (!listener.closed) Threads.sleep(100); + while(!listener.killSent) Threads.sleep(100); + // Now wait for regions to come back online. + waitUntilAllRegionsAssigned(countOfMetaRegions); + } finally { + m.getRegionServerOperationQueue(). + unregisterRegionServerOperationListener(listener); + } + } + + + /* + * @param cluster + * @param hrs + * @return Count of regions closed. + * @throws IOException + */ + private int closeAlltNonCatalogRegions(final MiniHBaseCluster cluster, + final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs) + throws IOException { + int countOfRegions = 0; + for (HRegion r: hrs.getOnlineRegions()) { + if (r.getRegionInfo().isMetaRegion()) continue; + cluster.addMessageToSendRegionServer(hrs, + new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo())); + countOfRegions++; + } + return countOfRegions; + } + + /** * Listener for regionserver events testing hbase-2428 (Infinite loop of * region closes if META region is offline). In particular, listen * for the close of the 'metaServer' and when it comes in, requeue it with a @@ -167,6 +280,11 @@ int getCloseCount() { return this.closeCount; } + + @Override + public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { + return true; + } } /** @@ -260,4 +378,26 @@ if (rows == countOfRegions) break; } } + + /* + * @return Count of regions in meta table. + * @throws IOException + */ + private static int countOfMetaRegions() + throws IOException { + HTable meta = new HTable(TEST_UTIL.getConfiguration(), + HConstants.META_TABLE_NAME); + int rows = 0; + Scan scan = new Scan(); + scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); + ResultScanner s = meta.getScanner(scan); + for (Result r = null; (r = s.next()) != null;) { + byte [] b = + r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); + if (b == null || b.length <= 0) break; + rows++; + } + s.close(); + return rows; + } } \ No newline at end of file Index: src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java =================================================================== --- src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (revision 940322) +++ src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (working copy) @@ -135,6 +135,8 @@ // debugging and unit tests. protected volatile boolean abortRequested; + private boolean killed = false; + // If false, the file system has become unavailable protected volatile boolean fsOk; @@ -630,7 +632,9 @@ hlogRoller.interruptIfNecessary(); this.majorCompactionChecker.interrupt(); - if (abortRequested) { + if (this.killed) { + // Just skip out without closing regions, logs, etc. + } else if (abortRequested) { if (this.fsOk) { // Only try to clean up if the file system is available try { @@ -682,10 +686,13 @@ HBaseRPC.stopProxy(this.hbaseMaster); this.hbaseMaster = null; } - - join(); - zooKeeperWrapper.close(); - + + if (this.killed) { + // Skip out with waiting or closing up zk. + } else { + join(); + zooKeeperWrapper.close(); + } if (shutdownHDFS.get()) { runThread(this.hdfsShutdownThread, this.conf.getLong("hbase.dfs.shutdown.wait", 30000)); @@ -1265,6 +1272,15 @@ } /** + * Simulate a kill -9 server. Exits without doing even the basics done on + * an {@link #abort()} -- except it does close socket. + */ + protected void kill() { + this.killed = true; + abort(); + } + + /** * Cause the server to exit without closing the regions it is serving, the * log it is using and without notifying the master. * Used unit testing and on catastrophic events such as HDFS is yanked out @@ -1499,6 +1515,11 @@ region.flushcache(); break; + case TESTING_MSG_BLOCK_RS: + while (!stopRequested.get()) { + Threads.sleep(1000); + LOG.info("Regionserver blocked"); + } default: throw new AssertionError( "Impossible state during msg processing. Instruction: " @@ -1537,7 +1558,7 @@ } } } - + void openRegion(final HRegionInfo regionInfo) { Integer mapKey = Bytes.mapKey(regionInfo.getRegionName()); HRegion region = this.onlineRegions.get(mapKey); @@ -1784,10 +1805,8 @@ // Count of Puts processed. int i = 0; checkOpen(); - boolean isMetaRegion = false; try { HRegion region = getRegion(regionName); - isMetaRegion = region.getRegionInfo().isMetaRegion(); if (!region.getRegionInfo().isMetaTable()) { this.cacheFlusher.reclaimMemStoreMemory(); @@ -2015,11 +2034,9 @@ // Count of Deletes processed. int i = 0; checkOpen(); - boolean isMetaRegion = false; try { boolean writeToWAL = true; HRegion region = getRegion(regionName); - isMetaRegion = region.getRegionInfo().isMetaRegion(); if (!region.getRegionInfo().isMetaTable()) { this.cacheFlusher.reclaimMemStoreMemory(); } @@ -2513,6 +2530,24 @@ } /** + * Utility for constructing an instance of the passed HRegionServer class. + * @param regionServerClass + * @param conf2 + * @return HRegionServer instance. + */ + public static HRegionServer constructRegionServer(Class regionServerClass, + final Configuration conf2) { + try { + Constructor c = + regionServerClass.getConstructor(HBaseConfiguration.class); + return c.newInstance(conf2); + } catch (Exception e) { + throw new RuntimeException("Failed construction of " + + "Master: " + regionServerClass.toString(), e); + } + } + + /** * Do class main. * @param args * @param regionServerClass HRegionServer to instantiate. @@ -2539,9 +2574,8 @@ if (runtime != null) { LOG.info("vmInputArguments=" + runtime.getInputArguments()); } - Constructor c = - regionServerClass.getConstructor(HBaseConfiguration.class); - startRegionServer(c.newInstance(conf)); + HRegionServer hrs = constructRegionServer(regionServerClass, conf); + startRegionServer(hrs); } } catch (Throwable t) { LOG.error( "Can not start region server because "+ Index: src/java/org/apache/hadoop/hbase/LocalHBaseCluster.java =================================================================== --- src/java/org/apache/hadoop/hbase/LocalHBaseCluster.java (revision 940322) +++ src/java/org/apache/hadoop/hbase/LocalHBaseCluster.java (working copy) @@ -82,12 +82,17 @@ * @param noRegionServers Count of regionservers to start. * @throws IOException */ - public LocalHBaseCluster(final HBaseConfiguration conf, - final int noRegionServers) + public LocalHBaseCluster(final HBaseConfiguration conf, final int noRegionServers) throws IOException { - this(conf, noRegionServers, HMaster.class); + this(conf, noRegionServers, HMaster.class, getRegionServerImplementation(conf)); } + @SuppressWarnings("unchecked") + private static Class getRegionServerImplementation(final HBaseConfiguration conf) { + return (Class)conf.getClass(HConstants.REGION_SERVER_IMPL, + HRegionServer.class); + } + /** * Constructor. * @param conf Configuration to use. Post construction has the master's @@ -98,7 +103,8 @@ */ @SuppressWarnings("unchecked") public LocalHBaseCluster(final HBaseConfiguration conf, - final int noRegionServers, final Class masterClass) + final int noRegionServers, final Class masterClass, + final Class regionServerClass) throws IOException { this.conf = conf; // Create the master @@ -111,7 +117,7 @@ new CopyOnWriteArrayList(); this.regionServerClass = (Class)conf.getClass(HConstants.REGION_SERVER_IMPL, - HRegionServer.class); + regionServerClass); for (int i = 0; i < noRegionServers; i++) { addRegionServer(i); } Index: src/java/org/apache/hadoop/hbase/master/RegionServerOperationQueue.java =================================================================== --- src/java/org/apache/hadoop/hbase/master/RegionServerOperationQueue.java (revision 940322) +++ src/java/org/apache/hadoop/hbase/master/RegionServerOperationQueue.java (working copy) @@ -12,6 +12,8 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HMsg; +import org.apache.hadoop.hbase.HServerInfo; import org.apache.hadoop.hbase.RemoteExceptionHandler; import org.apache.hadoop.hbase.util.Sleeper; import org.apache.hadoop.ipc.RemoteException; @@ -197,6 +199,24 @@ } } + /** + * Called for each message passed the master. Most of the messages that come + * in here will go on to become {@link #process(RegionServerOperation)}s but + * others like {@linke HMsg.Type#MSG_REPORT_PROCESS_OPEN} go no further; + * only in here can you see them come in. + * @param serverInfo Server we got the message from. + * @param incomingMsg The message received. + * @return True to continue processing, false to skip. + */ + boolean process(final HServerInfo serverInfo, + final HMsg incomingMsg) { + if (this.listeners.isEmpty()) return true; + for (RegionServerOperationListener listener: this.listeners) { + if (!listener.process(serverInfo, incomingMsg)) return false; + } + return true; + } + /* * Tell listeners that we processed a RegionServerOperation. * @param op Operation to tell the world about. Index: src/java/org/apache/hadoop/hbase/master/ServerManager.java =================================================================== --- src/java/org/apache/hadoop/hbase/master/ServerManager.java (revision 940322) +++ src/java/org/apache/hadoop/hbase/master/ServerManager.java (working copy) @@ -423,7 +423,7 @@ * @return */ private HMsg[] processMsgs(HServerInfo serverInfo, - HRegionInfo[] mostLoadedRegions, HMsg incomingMsgs[]) { + HRegionInfo[] mostLoadedRegions, HMsg incomingMsgs[]) { ArrayList returnMsgs = new ArrayList(); if (serverInfo.getServerAddress() == null) { throw new NullPointerException("Server address cannot be null; " + @@ -438,6 +438,10 @@ LOG.info("Processing " + incomingMsgs[i] + " from " + serverInfo.getServerName() + "; " + (i + 1) + " of " + incomingMsgs.length); + if (!this.master.getRegionServerOperationQueue(). + process(serverInfo, incomingMsgs[i])) { + continue; + } switch (incomingMsgs[i].getType()) { case MSG_REPORT_PROCESS_OPEN: openingCount++; Index: src/java/org/apache/hadoop/hbase/master/RegionServerOperationListener.java =================================================================== --- src/java/org/apache/hadoop/hbase/master/RegionServerOperationListener.java (revision 940322) +++ src/java/org/apache/hadoop/hbase/master/RegionServerOperationListener.java (working copy) @@ -21,6 +21,9 @@ import java.io.IOException; +import org.apache.hadoop.hbase.HMsg; +import org.apache.hadoop.hbase.HServerInfo; + /** * Listener for regionserver events in master. * @see HMaster#registerRegionServerOperationListener(RegionServerOperationListener) @@ -28,6 +31,18 @@ */ public interface RegionServerOperationListener { /** + * Called for each message passed the master. Most of the messages that come + * in here will go on to become {@link #process(RegionServerOperation)}s but + * others like {@linke HMsg.Type#MSG_REPORT_PROCESS_OPEN} go no further; + * only in here can you see them come in. + * @param serverInfo Server we got the message from. + * @param incomingMsg The message received. + * @return True to continue processing, false to skip. + */ + public boolean process(final HServerInfo serverInfo, + final HMsg incomingMsg); + + /** * Called before processing op * @param op * @return True if we are to proceed w/ processing. Index: src/java/org/apache/hadoop/hbase/master/HMaster.java =================================================================== --- src/java/org/apache/hadoop/hbase/master/HMaster.java (revision 940322) +++ src/java/org/apache/hadoop/hbase/master/HMaster.java (working copy) @@ -1141,7 +1141,8 @@ return c.newInstance(conf); } catch (Exception e) { throw new RuntimeException("Failed construction of " + - "Master: " + masterClass.toString(), e); + "Master: " + masterClass.toString() + + ((e.getCause() != null)? e.getCause().getMessage(): ""), e); } } Index: src/java/org/apache/hadoop/hbase/HMsg.java =================================================================== --- src/java/org/apache/hadoop/hbase/HMsg.java (revision 940322) +++ src/java/org/apache/hadoop/hbase/HMsg.java (working copy) @@ -1,5 +1,5 @@ /** - * Copyright 2007 The Apache Software Foundation + * Copyright 2010 The Apache Software Foundation * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -117,6 +117,13 @@ * rather than send them individually in MSG_REPORT_OPEN messages. */ MSG_REPORT_SPLIT_INCLUDES_DAUGHTERS, + + /** + * When RegionServer receives this message, it goes into a sleep that only + * an exit will cure. This message is sent by unit tests simulating + * pathological states. + */ + TESTING_MSG_BLOCK_RS, } private Type type = null;