commit 193882dfbc044eeb6f88a00e6032763c3a38a040 Author: Todd Lipcon Date: Mon Jun 14 15:42:16 2010 -0700 HBASE-2726. RegionServer should never abort without providing a reason diff --git src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index bca819e..fe9aa8a 100644 --- src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -377,7 +377,7 @@ public class HRegionServer implements HRegionInterface, if (restart) { restart(); } else { - abort(); + abort("ZooKeeper session expired"); } } else if (type == EventType.NodeDeleted) { watchMasterAddress(); @@ -397,8 +397,7 @@ public class HRegionServer implements HRegionInterface, } private void restart() { - LOG.info("Restarting Region Server"); - abort(); + abort("Restarting region server"); Threads.shutdown(regionServerThread); boolean done = false; while (!done) { @@ -568,8 +567,7 @@ public class HRegionServer implements HRegionInterface, } // for } catch (Throwable t) { if (!checkOOME(t)) { - LOG.fatal("Unhandled exception. Aborting...", t); - abort(); + abort("Unhandled exception", t); } } this.leases.closeAfterLeasesExpire(); @@ -836,8 +834,7 @@ public class HRegionServer implements HRegionInterface, (e.getCause() != null && e.getCause() instanceof OutOfMemoryError) || (e.getMessage() != null && e.getMessage().contains("java.lang.OutOfMemoryError"))) { - LOG.fatal("OutOfMemoryError, aborting.", e); - abort(); + abort("OutOfMemoryError, aborting", e); stop = true; } return stop; @@ -855,8 +852,7 @@ public class HRegionServer implements HRegionInterface, try { FSUtils.checkFileSystemAvailable(this.fs); } catch (IOException e) { - LOG.fatal("Shutting down HRegionServer: file system not available", e); - abort(); + abort("File System not available", e); this.fsOk = false; } } @@ -1008,8 +1004,7 @@ public class HRegionServer implements HRegionInterface, String n = Thread.currentThread().getName(); UncaughtExceptionHandler handler = new UncaughtExceptionHandler() { public void uncaughtException(Thread t, Throwable e) { - abort(); - LOG.fatal("Set stop flag in " + t.getName(), e); + abort("Uncaught exception in service thread " + t.getName(), e); } }; Threads.setDaemonThreadRunning(this.hlogRoller, n + ".logRoller", @@ -1132,8 +1127,15 @@ public class HRegionServer implements HRegionInterface, * log it is using and without notifying the master. * Used unit testing and on catastrophic events such as HDFS is yanked out * from under hbase or we OOME. + * @param reason the reason we are aborting + * @param cause the exception that caused the abort, or null */ - public void abort() { + public void abort(String reason, Throwable cause) { + if (cause != null) { + LOG.fatal("Aborting region server " + this + ": " + reason, cause); + } else { + LOG.fatal("Aborting region server " + this + ": " + reason); + } this.abortRequested = true; this.reservedSpace.clear(); if (this.metrics != null) { @@ -1141,6 +1143,13 @@ public class HRegionServer implements HRegionInterface, } stop(); } + + /** + * @see HRegionServer#abort(String, Throwable) + */ + public void abort(String reason) { + abort(reason, null); + } /* * Simulate a kill -9 of this server. @@ -1149,7 +1158,7 @@ public class HRegionServer implements HRegionInterface, */ protected void kill() { this.killed = true; - abort(); + abort("Simulated kill"); } /** diff --git src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java index fed59eb..7b8fa56 100644 --- src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java +++ src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java @@ -86,20 +86,21 @@ class LogRoller extends Thread implements LogRollListener { } catch (FailedLogCloseException e) { LOG.fatal("Forcing server shutdown", e); server.checkFileSystem(); - server.abort(); + server.abort("Failed log close in log roller", e); } catch (java.net.ConnectException e) { LOG.fatal("Forcing server shutdown", e); server.checkFileSystem(); - server.abort(); + server.abort("Failed connect in log roller", e); } catch (IOException ex) { LOG.fatal("Log rolling failed with ioe: ", RemoteExceptionHandler.checkIOException(ex)); server.checkFileSystem(); // Abort if we get here. We probably won't recover an IOE. HBASE-1132 - server.abort(); + server.abort("IOE in log roller", ex); } catch (Exception ex) { LOG.error("Log rolling failed", ex); server.checkFileSystem(); + server.abort("Log rolling failed", ex); } finally { rollLog.set(false); rollLock.unlock(); diff --git src/main/java/org/apache/hadoop/hbase/regionserver/MemStoreFlusher.java src/main/java/org/apache/hadoop/hbase/regionserver/MemStoreFlusher.java index 263384e..124fac8 100644 --- src/main/java/org/apache/hadoop/hbase/regionserver/MemStoreFlusher.java +++ src/main/java/org/apache/hadoop/hbase/regionserver/MemStoreFlusher.java @@ -261,8 +261,7 @@ class MemStoreFlusher extends Thread implements FlushRequester { // is required. Currently the only way to do this is a restart of // the server. Abort because hdfs is probably bad (HBASE-644 is a case // where hdfs was bad but passed the hdfs check). - LOG.fatal("Replay of hlog required. Forcing server shutdown", ex); - server.abort(); + server.abort("Replay of HLog required. Forcing server shutdown", ex); return false; } catch (IOException ex) { LOG.error("Cache flush failed" diff --git src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index 42acc73..68f3346 100644 --- src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -274,7 +274,7 @@ public class MiniHBaseCluster { public String abortRegionServer(int serverNumber) { HRegionServer server = getRegionServer(serverNumber); LOG.info("Aborting " + server.toString()); - server.abort(); + server.abort("Aborting for tests", new Exception("Trace info")); return server.toString(); } diff --git src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java index 112fd79..e59bdf7 100644 --- src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java +++ src/test/java/org/apache/hadoop/hbase/master/TestMasterTransitions.java @@ -337,10 +337,10 @@ public class TestMasterTransitions { if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true; // Save the region that is in transition so can test later it came back. this.regionToFind = incomingMsg.getRegionInfo(); - LOG.info("ABORTING " + this.victim + " because got a " + + String msg = "ABORTING " + this.victim + " because got a " + HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " + - incomingMsg.getRegionInfo().getRegionNameAsString()); - this.victim.abort(); + incomingMsg.getRegionInfo().getRegionNameAsString(); + this.victim.abort(msg); this.abortSent = true; return true; }