From cd99ed7ab17268a0ff933836c9a3ef7f668ab64d Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Tue, 12 Nov 2019 01:04:12 +0530 Subject: [PATCH] HBASE-23261 : Processing ZK BadVersionException during node transition --- .../apache/hadoop/hbase/zookeeper/ZKAssign.java | 10 +++++++++- .../ZKSplitTransactionCoordination.java | 16 ++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java index 297e96e749..b2e1e1eeb1 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java @@ -868,7 +868,15 @@ public class ZKAssign { try { rt = RegionTransition.createRegionTransition( endState, region.getRegionName(), serverName, payload); - if(!ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion())) { + boolean isDataSet; + try { + isDataSet = ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion()); + } catch (KeeperException.BadVersionException e) { + isDataSet = false; + LOG.error("Received BadVersionException from ZK for " + encoded + + ", version: " + stat.getVersion()); + } + if (!isDataSet) { LOG.warn(zkw.prefix("Attempt to transition the " + "unassigned node for " + encoded + " from " + beginState + " to " + endState + " failed, " + diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java index f6e96fa307..8bfbe363c2 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitTransactionCoordination.java @@ -40,6 +40,9 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat private CoordinatedStateManager coordinationManager; private final ZooKeeperWatcher watcher; + // max wait for split transaction - 100 times in a loop with 100 ms of thread sleep each time + private static final int SPLIT_WAIT_TIMEOUT = 100; + private static final Log LOG = LogFactory.getLog(ZKSplitTransactionCoordination.class); public ZKSplitTransactionCoordination(CoordinatedStateManager coordinationProvider, @@ -163,6 +166,10 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat } Thread.sleep(100); spins++; + if (spins > SPLIT_WAIT_TIMEOUT) { + throw new IOException("Waiting time for Split Transaction exceeded for region: " + + parent.getRegionInfo().getRegionNameAsString()); + } byte[] data = ZKAssign.getDataNoWatch(watcher, node, stat); if (data == null) { throw new IOException("Data is null, splitting node " + node + " no longer exists"); @@ -222,9 +229,14 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat // Tell master about split by updating zk. If we fail, abort. if (coordinationManager.getServer() != null) { try { - zstd.setZnodeVersion(transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(), + int newNodeVersion = transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(), coordinationManager.getServer().getServerName(), zstd, - RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT)); + RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT); + if (newNodeVersion == -1) { + throw new IOException("Notifying master of RS split failed for region: " + + parent.getRegionInfo().getRegionNameAsString()); + } + zstd.setZnodeVersion(newNodeVersion); int spins = 0; // Now wait for the master to process the split. We know it's done -- 2.17.2 (Apple Git-113)