diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
index c551a94..0ac078f 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
@@ -1737,11 +1737,22 @@ public class FSHLog implements WAL {
       try {
         FSWALEntry entry = truck.unloadFSWALEntryPayload();
         // If already an exception, do not try to append. Throw.
-        if (this.exception != null) throw this.exception;
+        if (this.exception != null) {
+          // We got an exception from an earlier attempt at append. Do not let this append
+          // go through. Fail it but stamp the sequenceid into the append because we
+          // need to do this to close the latch held down deep in WALKey... otherwise it will
+          // just hang out. The #append below does this too.
+          entry.stampRegionSequenceId();
+          // If append fails, presume any pending syncs will fail too; let all waiting handlers
+          // know of the exception. This is probably of no use but just in case.
+          cleanupOutstandingSyncsOnException(sequence, this.exception);
+          // Return to keep processing events coming off the ringbuffer
+          return;
+        }
         append(entry);
       } catch (Exception e) {
-        // Failed append. Record the exception. Throw it from here on out til new WAL in place
-        this.exception = new DamagedWALException(e);
+        // Failed append. Record the exception. Throw it from here on out til new WAL in place.
+        this.exception = e;
         // If append fails, presume any pending syncs will fail too; let all waiting handlers
         // know of the exception.
         cleanupOutstandingSyncsOnException(sequence, this.exception);
@@ -1883,9 +1894,11 @@ public class FSHLog implements WAL {
       // Update metrics.
       postAppend(entry, EnvironmentEdgeManager.currentTime() - start);
     } catch (Exception e) {
-      LOG.warn("Could not append. Requesting close of WAL", e);
+      String msg = "Failed appending sequenceId=" + regionSequenceId +
+        ", requesting roll of WAL";
+      LOG.warn(msg, e);
       requestLogRoll();
-      throw e;
+      throw new DamagedWALException(msg, e);
     }
     numEntries.incrementAndGet();
   }
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALKey.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALKey.java
index 9b3dede..74284e0 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALKey.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALKey.java
@@ -315,8 +315,12 @@ public class WALKey implements SequenceId, Comparable<WALKey> {
    */
   public long getSequenceId(final long maxWaitForSeqId) throws IOException {
     // TODO: This implementation waiting on a latch is problematic because if a higher level
-    // determines we should stop or abort, there is not global list of all these blocked WALKeys
-    // waiting on a sequence id; they can't be cancelled... interrupted. See getNextSequenceId
+    // determines we should stop or abort, there is no global list of all these blocked WALKeys
+    // waiting on a sequence id; they can't be cancelled... interrupted. See getNextSequenceId.
+    //
+    // UPDATE: I think we can remove the timeout now we are stamping all walkeys with sequenceid,
+    // even those that have failed (previously we were not... so they would just hang out...).
+    // St.Ack 20150910
     try {
       if (maxWaitForSeqId < 0) {
         this.seqNumAssignedLatch.await();
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java
index a620951..5cec7de 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java
@@ -139,10 +139,6 @@ public class TestWALLockup {
     protected void beforeWaitOnSafePoint() {
       if (throwException) {
         LOG.info("COUNTDOWN");
-        // Don't countdown latch until someone waiting on it.
-        while (this.latch.getCount() <= 0) {
-          Threads.sleep(10);
-        }
         this.latch.countDown();
       }
     }
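
For context on why the patch stamps the sequence id even when an append is skipped or fails: the handler that submitted the edit is blocked in WALKey#getSequenceId() on seqNumAssignedLatch until the latch is counted down, so skipping the stamp on failure is exactly what produced the lockup this change addresses. The stand-alone Java sketch below illustrates that pattern only; the class and method names (ToyWALKey, ToyAppender, stampSequenceId) are invented for illustration and are not the HBase API.

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicLong;

public class LatchStampSketch {

  /** Stand-in for WALKey: callers block until a sequence id has been assigned. */
  static class ToyWALKey {
    private final CountDownLatch seqNumAssignedLatch = new CountDownLatch(1);
    private volatile long sequenceId = -1;

    void stampSequenceId(long id) {
      this.sequenceId = id;
      // Counting the latch down is what releases blocked handlers, even on failure.
      this.seqNumAssignedLatch.countDown();
    }

    long getSequenceId() throws InterruptedException {
      this.seqNumAssignedLatch.await(); // hangs forever if the key is never stamped
      return this.sequenceId;
    }
  }

  /** Stand-in for the ring buffer consumer: stamps the key whether or not the append works. */
  static class ToyAppender {
    private final AtomicLong nextSeq = new AtomicLong(0);
    private volatile Exception exception;

    void onEvent(ToyWALKey key, boolean failAppend) {
      long seq = nextSeq.incrementAndGet();
      try {
        if (this.exception != null) {
          // An earlier append already failed: skip the append but still stamp the key
          // so anyone blocked in getSequenceId() is released instead of locking up.
          key.stampSequenceId(seq);
          return;
        }
        if (failAppend) {
          throw new Exception("simulated append failure");
        }
        key.stampSequenceId(seq);
        // ... write the entry to the WAL here ...
      } catch (Exception e) {
        this.exception = e;
        // The failed append still stamps the key for the same reason.
        key.stampSequenceId(seq);
      }
    }
  }

  public static void main(String[] args) throws Exception {
    ToyAppender appender = new ToyAppender();
    ToyWALKey key = new ToyWALKey();
    Thread handler = new Thread(() -> {
      try {
        System.out.println("handler got sequenceId=" + key.getSequenceId());
      } catch (InterruptedException ie) {
        Thread.currentThread().interrupt();
      }
    });
    handler.start();
    appender.onEvent(key, true); // simulate a failed append
    handler.join(1000);
    System.out.println("handler still blocked (would mean lockup): " + handler.isAlive());
  }
}

Running main with a simulated failed append should print the sequence id from the handler thread and report the handler as no longer blocked, which is the behavior the FSHLog change is after: a failed append must never leave a WALKey waiter hanging on its latch.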