Index: hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
===================================================================
--- hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java	(revision 1508324)
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java	(working copy)
@@ -67,6 +67,7 @@
 import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
 import org.apache.hadoop.hbase.ClockOutOfSyncException;
 import org.apache.hadoop.hbase.DoNotRetryIOException;
+import org.apache.hadoop.hbase.DroppedSnapshotException;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HTableDescriptor;
 import org.apache.hadoop.hbase.HealthCheckChore;
@@ -3599,12 +3600,20 @@
     }
     FlushRegionResponse.Builder builder = FlushRegionResponse.newBuilder();
     if (shouldFlush) {
-      boolean result = region.flushcache();
-      if (result) {
-        this.compactSplitThread.requestSystemCompaction(region,
-          "Compaction through user triggered flush");
+      try {
+        boolean result = region.flushcache();
+        if (result) {
+          this.compactSplitThread.requestSystemCompaction(region,
+            "Compaction through user triggered flush");
+        }
+        builder.setFlushed(result);
+      } catch (DroppedSnapshotException ex) {
+        abort("Flush memstore failed. Hence aborting RS.", ex);
+        NotServingRegionException nsre = new NotServingRegionException(
+            "Aborting due to flush memstore failed");
+        nsre.initCause(ex);
+        throw nsre;
       }
-      builder.setFlushed(result);
     }
     builder.setLastFlushTime(region.getLastFlushTime());
     return builder.build();
@@ -3630,7 +3639,15 @@
     HRegion region = getRegion(request.getRegion());
     region.startRegionOperation(Operation.SPLIT_REGION);
     LOG.info("Splitting " + region.getRegionNameAsString());
-    region.flushcache();
+    try {
+      region.flushcache();
+    } catch (DroppedSnapshotException ex) {
+      abort("Flush memstore failed. Hence aborting RS.", ex);
+      NotServingRegionException nsre = new NotServingRegionException(
+          "Aborting due to flush memstore failed");
+      nsre.initCause(ex);
+      throw nsre;
+    }
     byte[] splitPoint = null;
     if (request.hasSplitPoint()) {
       splitPoint = request.getSplitPoint().toByteArray();
Index: hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
===================================================================
--- hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java	(revision 1508324)
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java	(working copy)
@@ -1347,7 +1347,7 @@
       dataInMemoryWithoutWAL.set(0);
     }
     synchronized (writestate) {
-      if (!writestate.flushing && writestate.writesEnabled) {
+      if (!writestate.flushing && writestate.writesEnabled && !this.closing.get()) {
         this.writestate.flushing = true;
       } else {
         if (LOG.isDebugEnabled()) {
@@ -1588,6 +1588,8 @@
           Bytes.toStringBinary(getRegionName()));
       dse.initCause(t);
       status.abort("Flush failed: " + StringUtils.stringifyException(t));
+      // we'll abort this server soon, let's set closing flag to avoid potential data lost issue
+      this.closing.set(true);
       throw dse;
     }
 
Index: hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java
===================================================================
--- hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java	(revision 1508324)
+++ hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestWALReplay.java	(working copy)
@@ -47,6 +47,7 @@
 import org.apache.hadoop.hbase.MasterNotRunningException;
 import org.apache.hadoop.hbase.MediumTests;
 import org.apache.hadoop.hbase.MiniHBaseCluster;
+import org.apache.hadoop.hbase.NotServingRegionException;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
 import org.apache.hadoop.hbase.client.Delete;
@@ -573,7 +574,7 @@
   /**
    * Test that we could recover the data correctly after aborting flush. In the
    * test, first we abort flush after writing some data, then writing more data
-   * and flush again, at last verify the data.
+   * and flush again, at last verify the flush result.
    * @throws IOException
    */
   @Test
@@ -627,39 +628,19 @@
       Mockito.doReturn(true).when(rsServices).isAborted();
     }
     // writing more data
-    int moreRow = 10;
-    for (int i = writtenRowCount; i < writtenRowCount + moreRow; i++) {
-      Put put = new Put(Bytes.toBytes(tableNameStr + Integer.toString(i)));
-      put.add(families.get(i % families.size()).getName(), Bytes.toBytes("q"),
-          Bytes.toBytes("val"));
-      region.put(put);
-    }
-    writtenRowCount += moreRow;
-    // call flush again
-    CustomStoreFlusher.throwExceptionWhenFlushing.set(false);
     try {
-      region.flushcache();
-    } catch (IOException t) {
-      LOG.info("Expected exception when flushing region because server is stopped,"
-          + t.getMessage());
+      int moreRow = 10;
+      for (int i = writtenRowCount; i < writtenRowCount + moreRow; i++) {
+        Put put = new Put(Bytes.toBytes(tableNameStr + Integer.toString(i)));
+        put.add(families.get(i % families.size()).getName(), Bytes.toBytes("q"),
+            Bytes.toBytes("val"));
+        region.put(put);
+        fail("No exception thrown.");
+      }
+    } catch (Exception ex) {
+      assertTrue(ex instanceof NotServingRegionException);
+      assertTrue(ex.getMessage().contains("is closing"));
     }
-
-    region.close(true);
-    wal.close();
-
-    // Let us try to split and recover
-    runWALSplit(this.conf);
-    HLog wal2 = createWAL(this.conf);
-    Mockito.doReturn(false).when(rsServices).isAborted();
-    HRegion region2 = new HRegion(basedir, wal2, this.fs, this.conf, hri, htd,
-        rsServices);
-    long seqid2 = region2.initialize();
-    // HRegionServer usually does this. It knows the largest seqid across all
-    // regions.
-    wal2.setSequenceNumber(seqid2);
-
-    scanner = region2.getScanner(new Scan());
-    assertEquals(writtenRowCount, getScannedCount(scanner));
   }
 
   private int getScannedCount(RegionScanner scanner) throws IOException {