From 33e0a2ac20d2d3deb33a38bc82125ce4d761ab7e Mon Sep 17 00:00:00 2001
From: khemani
Date: Wed, 25 Apr 2012 18:52:20 -0700
Subject: [PATCH] [HBASE-5860] splitlogmanager should not unnecessarily resubmit tasks when zk unavailable

---
 .../hadoop/hbase/master/SplitLogManager.java |   11 +++++++++--
 1 files changed, 9 insertions(+), 2 deletions(-)

diff --git src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
index 919c65f..21d011d 100644
--- src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
+++ src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
@@ -924,7 +924,11 @@ public class SplitLogManager extends ZooKeeperListener {
         LOG.info("resubmitted " + resubmitted + " out of " + tot + " tasks");
       }
       // If there are pending tasks and all of them have been unassigned for
-      // some time then put up a RESCAN node to ping the workers.
+      // some time then put up a RESCAN node to ping the workers. (Skip this
+      // processing if znode create is pending. Creation of any znode has
+      // the same effect as creation of a RESCAN node. They both ping the
+      // workers to look for new tasks)
+      //
       // ZKSplitlog.DEFAULT_UNASSIGNED_TIMEOUT is of the order of minutes
       // because a. it is very unlikely that every worker had a
       // transient error when trying to grab the task b. if there are no
@@ -934,7 +938,7 @@ public class SplitLogManager extends ZooKeeperListener {
       // that there is always one worker in the system
       if (tot > 0 && !found_assigned_task &&
           ((EnvironmentEdgeManager.currentTimeMillis() - lastNodeCreateTime) >
-          unassignedTimeout)) {
+          unassignedTimeout) && !isAnyCreateZNodePending()) {
         for (Map.Entry<String, Task> e : tasks.entrySet()) {
           String path = e.getKey();
           Task task = e.getValue();
@@ -992,6 +996,9 @@ public class SplitLogManager extends ZooKeeperListener {
     }
   }
 
+  static boolean isAnyCreateZNodePending() {
+    return tot_mgr_node_create_queued.get() > tot_mgr_node_create_result.get();
+  }
   /**
    * Asynchronous handler for zk get-data-set-watch on node results.
    * Retries on failures.
-- 
1.7.7.2