Index: src/java/org/apache/hadoop/hbase/HConstants.java
===================================================================
--- src/java/org/apache/hadoop/hbase/HConstants.java	(revision 674597)
+++ src/java/org/apache/hadoop/hbase/HConstants.java	(working copy)
@@ -104,7 +104,10 @@
   
   /** Default maximum file size */
   static final long DEFAULT_MAX_FILE_SIZE = 256 * 1024 * 1024;
-  
+
+  /** Default size, in bytes, of one reservation block (5 MB) */
+  static final int DEFAULT_SIZE_RESERVATION_BLOCK = 1024 * 1024 * 5;
+
   // Always store the location of the root table's HRegion.
   // This HRegion is never split.
   
Index: src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
===================================================================
--- src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java	(revision 674597)
+++ src/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java	(working copy)
@@ -31,6 +31,7 @@
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
@@ -157,6 +158,12 @@
   public static final String REGIONSERVER = "regionserver";
   
   /**
+   * Byte blocks reserved in the HRS constructor and released when aborting,
+   * freeing heap so the server can recover from an OOME. See HBASE-706.
+   */
+  private LinkedList<byte[]> reservedSpace;
+
+  /**
    * Thread to shutdown the region server in an orderly manner. This thread
    * is registered as a shutdown hook in the HRegionServer constructor and is
    * only called when the HRegionServer receives a kill signal.
@@ -257,7 +264,12 @@
     this.leases = new Leases(
       conf.getInt("hbase.regionserver.lease.period", 3 * 60 * 1000),
       this.threadWakeFrequency);
-    
+
+    reservedSpace = new LinkedList<byte[]>();
+    int nbBlocks = conf.getInt("hbase.regionserver.nbreservationblocks", 4);
+    for (int i = 0; i < nbBlocks; i++)
+      reservedSpace.add(new byte[DEFAULT_SIZE_RESERVATION_BLOCK]);
+
     // Register shutdown hook for HRegionServer, runs an orderly shutdown
     // when a kill signal is recieved
     Runtime.getRuntime().addShutdownHook(new ShutdownThread(this));
@@ -403,6 +415,9 @@
         housekeeping();
         sleeper.sleep(lastMsg);
       } // for
+    } catch (OutOfMemoryError error) {
+      abort();
+      LOG.fatal("Ran out of memory", error);
     } catch (Throwable t) {
       LOG.fatal("Unhandled exception. Aborting...", t);
       abort();
@@ -649,6 +664,7 @@
    * from under hbase or we OOME.
    */
   public void abort() {
+    reservedSpace.clear();
     this.abortRequested = true;
     stop();
   }
@@ -1133,10 +1149,14 @@
     try {
       cacheFlusher.reclaimMemcacheMemory();
       region.batchUpdate(b);
+    } catch (OutOfMemoryError error) {
+      abort();
+      LOG.fatal("Ran out of memory", error);
     } catch (IOException e) {
       checkFileSystem();
       throw e;
     }
+
   }
   
   //
Index: conf/hbase-default.xml
===================================================================
--- conf/hbase-default.xml	(revision 674597)
+++ conf/hbase-default.xml	(working copy)
@@ -251,6 +251,13 @@
     </description>
   </property>
   <property>
+    <name>hbase.regionserver.nbreservationblocks</name>
+    <value>4</value>
+    <description>The number of reservation blocks which are used to prevent
+    unstable region servers caused by an OOME.
+    </description>
+  </property>
+  <property>
     <name>hbase.io.index.interval</name>
     <value>32</value>
     <description>The interval at which we record offsets in hbase