diff --git src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java index 7b7316f..fbf72c4 100644 --- src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java +++ src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java @@ -324,6 +324,7 @@ public class SplitLogManager extends ZooKeeperListener { LOG.warn("Error splitting " + path); } } + boolean safeToDeleteNode = true; Task task = tasks.get(path); if (task == null) { if (!ZKSplitLog.isRescanNode(watcher, path)) { @@ -341,6 +342,9 @@ public class SplitLogManager extends ZooKeeperListener { if (status == SUCCESS) { task.batch.done++; } else { + // Deleting the node asynchronously would cause a race + // against the split log retry. In this case, we leave the node there. + safeToDeleteNode = false; task.batch.error++; } task.batch.notify(); @@ -351,8 +355,12 @@ public class SplitLogManager extends ZooKeeperListener { // delete the task node in zk. Keep trying indefinitely - its an async // call and no one is blocked waiting for this node to be deleted. All // task names are unique (log.) there is no risk of deleting - // a future task. - deleteNode(path, Long.MAX_VALUE); + // a future task. This is true if the task status is SUCCESS; if not, + // the deletion will race against the split log retry. It is safer to leave the + // node there if the task has failed and is not an orphan. 
+ if (safeToDeleteNode) { + deleteNode(path, Long.MAX_VALUE); + } return; } diff --git src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java index 5c9d7dd..88c0c31 100644 --- src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java +++ src/test/java/org/apache/hadoop/hbase/master/TestSplitLogManager.java @@ -326,8 +326,8 @@ public class TestSplitLogManager { batch.wait(); } } - waitForCounter(tot_mgr_task_deleted, 0, 1, 1000); - assertTrue(ZKUtil.checkExists(zkw, tasknode) == -1); + waitForCounter(tot_mgr_log_split_err, 0, 1, 1000); + assertTrue(ZKUtil.checkExists(zkw, tasknode) != -1); conf.setInt("hbase.splitlog.max.resubmit", ZKSplitLog.DEFAULT_MAX_RESUBMIT); }