diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java index a1c20306b3..d4549deca6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java @@ -96,6 +96,8 @@ public class SplitLogWorker implements Runnable { // TODO have to correctly figure out when log splitting has been // interrupted or has encountered a transient error and when it has // encountered a bad non-retry-able persistent error. + // Note: this can actually get the master stuck (HBASE-22289) so treat preempted as error. + // splitLogFile does return false for legitimate retriable errors. try { if (!WALSplitter.splitLogFile(walDir, fs.getFileStatus(new Path(walDir, filename)), fs, conf, p, sequenceIdChecker, diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java index 49ab574ec5..b5bd17212d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/WALSplitterHandler.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.regionserver.handler; import java.io.IOException; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.LongAdder; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -68,19 +69,20 @@ public class WALSplitterHandler extends EventHandler { Status status = null; try { status = this.splitTaskExecutor.exec(splitTaskDetails.getWALFile(), reporter); + LongAdder errCounter = null; switch (status) { case DONE: coordination.endTask(new SplitLogTask.Done(this.serverName), SplitLogCounters.tot_wkr_task_done, splitTaskDetails); break; case PREEMPTED: - SplitLogCounters.tot_wkr_preempt_task.increment(); - LOG.warn("task execution preempted " + splitTaskDetails.getWALFile()); - break; + errCounter = SplitLogCounters.tot_wkr_preempt_task; + LOG.warn("task execution preempted; treating as error " + splitTaskDetails.getWALFile()); + //$FALL-THROUGH$ case ERR: if (server != null && !server.isStopped()) { - coordination.endTask(new SplitLogTask.Err(this.serverName), - SplitLogCounters.tot_wkr_task_err, splitTaskDetails); + coordination.endTask(new SplitLogTask.Err(this.serverName), (errCounter == null + ? SplitLogCounters.tot_wkr_task_err : errCounter), splitTaskDetails); break; } // if the RS is exiting then there is probably a tons of stuff @@ -91,8 +93,8 @@ public class WALSplitterHandler extends EventHandler { LOG.info("task execution interrupted because worker is exiting " + splitTaskDetails.toString()); } - coordination.endTask(new SplitLogTask.Resigned(this.serverName), - SplitLogCounters.tot_wkr_task_resigned, splitTaskDetails); + coordination.endTask(new SplitLogTask.Resigned(this.serverName), (errCounter == null + ? SplitLogCounters.tot_wkr_task_resigned : errCounter) , splitTaskDetails); break; } } finally {