From 9dcbb38f7c5557941562d7c6e91be82415e00a5e Mon Sep 17 00:00:00 2001 From: nspiegelberg Date: Fri, 27 Apr 2012 15:01:48 -0400 Subject: [PATCH] [jira] [HBASE-5890] SplitLog Rescan BusyWaits upon Zk.CONNECTIONLOSS Summary: We ran into a production issue yesterday where the SplitLogManager tried to create a Rescan node in ZK. The createAsync() generated a KeeperException.CONNECTIONLOSS that was immediately sent to processResult(), createRescan node with --retry_count was called, and this created a CPU busywait that also clogged up the logs. We should handle this better. Test Plan: - mvn test --- .../hadoop/hbase/master/SplitLogManager.java | 12 ++++++++++++ 1 files changed, 12 insertions(+), 0 deletions(-) diff --git a/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java index 919c65f..d5b986c 100644 --- a/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java +++ b/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java @@ -38,6 +38,7 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Chore; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.Stoppable; import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status; @@ -1093,6 +1094,17 @@ public class SplitLogManager extends ZooKeeperListener { LOG.error("ZK session expired. Master is expected to shut down. Abandoning retries."); return; } + if (rc == KeeperException.Code.CONNECTIONLOSS.intValue()) { + // maybe transient network failure, but happens immediately. 
+ // dampen our response so we don't flood + try { + Thread.sleep(HConstants.SOCKET_RETRY_WAIT_MS); + } catch (InterruptedException e) { + LOG.debug("Rescan Thread interrupted by user: ", e); + return; + } + + } Long retry_count = (Long)ctx; LOG.warn("rc=" + KeeperException.Code.get(rc) + " for "+ path + " remaining retries=" + retry_count); -- 1.6.5.2