### Eclipse Workspace Patch 1.0 #P apache-trunk Index: hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java =================================================================== --- hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java (revision 1430144) +++ hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java (working copy) @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.regionserver; import java.io.IOException; +import java.io.InterruptedIOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -154,6 +155,10 @@ private final Compactor compactor; + private static final int DEFAULT_FLUSH_RETRIES_NUMBER = 10; + private final int flush_retries_number; + private final int pauseTime; + /** * Constructor * @param basedir qualified path under which the region directory lives; @@ -221,6 +226,10 @@ this.compactor = new Compactor(conf); // Create a compaction manager. this.compactionPolicy = new CompactionPolicy(conf, this); + this.flush_retries_number = conf.getInt( + "hbase.hstore.flush.retries.number", DEFAULT_FLUSH_RETRIES_NUMBER); + this.pauseTime = conf.getInt(HConstants.HBASE_SERVER_PAUSE, + HConstants.DEFAULT_HBASE_SERVER_PAUSE); } /** @@ -693,8 +702,43 @@ // If an exception happens flushing, we let it out without clearing // the memstore snapshot. The old snapshot will be returned when we say // 'snapshot', the next time flush comes around. 
- return internalFlushCache( - snapshot, logCacheFlushId, snapshotTimeRangeTracker, flushedSize, status); + // Retry a failed flush, pausing between attempts, otherwise server will abort itself. + final int attempts = Math.max(1, flush_retries_number); // always flush at least once + IOException lastException = null; + for (int i = 0; i < attempts; i++) { + try { + Path pathName = internalFlushCache(snapshot, logCacheFlushId, + snapshotTimeRangeTracker, flushedSize, status); + try { + // Path name is null if there is no entry to flush + if (pathName != null) { + validateStoreFile(pathName); + } + return pathName; + } catch (Exception e) { + LOG.warn("Failed validating store file " + pathName + + ", retrying num=" + i, e); + if (e instanceof IOException) { + lastException = (IOException) e; + } else { + lastException = new IOException(e); + } + } + } catch (IOException e) { + LOG.warn("Failed flushing store file, retrying num=" + i, e); + lastException = e; + } + if (i < attempts - 1) { // no point pausing after the final attempt + try { + Thread.sleep(pauseTime); + } catch (InterruptedException e) { + IOException iie = new InterruptedIOException(); + iie.initCause(e); + throw iie; + } + } + } + throw lastException; } /* @@ -816,7 +860,6 @@ // Write-out finished successfully, move into the right spot String fileName = path.getName(); Path dstPath = new Path(homedir, fileName); - validateStoreFile(path); String msg = "Renaming flushed file at " + path + " to " + dstPath; LOG.debug(msg); status.setStatus("Flushing " + this + ": " + msg); @@ -1839,7 +1882,7 @@ public static final long FIXED_OVERHEAD = ClassSize.align((20 * ClassSize.REFERENCE) + (4 * Bytes.SIZEOF_LONG) - + (3 * Bytes.SIZEOF_INT) + Bytes.SIZEOF_BOOLEAN); + + (5 * Bytes.SIZEOF_INT) + Bytes.SIZEOF_BOOLEAN); public static final long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD + ClassSize.OBJECT + ClassSize.REENTRANT_LOCK Index: hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java =================================================================== --- 
hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java (revision 1430144) +++ hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java (working copy) @@ -511,6 +511,17 @@ public static long DEFAULT_HBASE_CLIENT_PAUSE = 1000; /** + * Parameter name for server pause value in milliseconds, used mostly as value to wait + * before running a retry of a failed operation. + */ + public static final String HBASE_SERVER_PAUSE = "hbase.server.pause"; + + /** + * Default value of {@link #HBASE_SERVER_PAUSE}. + */ + public static final int DEFAULT_HBASE_SERVER_PAUSE = 1000; + + /** * Parameter name for maximum retries, used as maximum for all retryable * operations such as fetching of the root region from root region server, * getting a cell's value, starting a row update, etc.