diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
index 7ceb637..158629b 100644
--- hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
+++ hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
@@ -124,7 +124,8 @@ public class ReplicationSource extends Thread
   // Indicates if this queue is recovered (and will be deleted when depleted)
   private boolean queueRecovered;
   // List of all the dead region servers that had this queue (if recovered)
-  private String[] deadRegionServers;
+  // package access for testing
+  Set<String> deadRegionServers = new HashSet<String>();
   // Maximum number of retries before taking bold actions
   private int maxRetriesMultiplier;
   // Socket timeouts require even bolder actions since we don't want to DDOS
@@ -201,16 +202,57 @@ public class ReplicationSource extends Thread
 
   // The passed znode will be either the id of the peer cluster or
   // the handling story of that queue in the form of id-servername-*
+  //
+  // A server name can itself contain "-" (e.g. "ip-10-46-221-101.ec2.internal"), so only a "-"
+  // seen after the two "," of a full server name is treated as a delimiter, as in:
+  // 2-ip-10-46-221-101.ec2.internal,52170,1364333181125-ip-10-46-221-101.ec2.internal,52171,
+  // 1364333181127-...
   private void checkIfQueueRecovered(String peerClusterZnode) {
-    String[] parts = peerClusterZnode.split("-");
+    String[] parts = peerClusterZnode.split("-", 2);
     this.queueRecovered = parts.length != 1;
     this.peerId = this.queueRecovered ?
       parts[0] : peerClusterZnode;
     this.peerClusterZnode = peerClusterZnode;
-    this.deadRegionServers = new String[parts.length-1];
-    // Extract all the places where we could find the hlogs
-    for (int i = 1; i < parts.length; i++) {
-      this.deadRegionServers[i-1] = parts[i];
+
+    // extract dead servers
+    if (parts.length > 1) {
+      String deadServerListStr = parts[1];
+      // a "-" only delimits server names once two "," have been seen since the last delimiter
+      int seenCommaCnt = 0;
+      int startIndex = 0;
+      int len = deadServerListStr.length();
+      for (int i = 0; i < len; i++) {
+        switch (deadServerListStr.charAt(i)) {
+        case ',':
+          seenCommaCnt += 1;
+          break;
+        case '-':
+          if (seenCommaCnt >= 2) {
+            if (i > startIndex) {
+              String serverName = deadServerListStr.substring(startIndex, i);
+              if (ServerName.isFullServerName(serverName)) {
+                this.deadRegionServers.add(serverName);
+              } else {
+                LOG.error("Found invalid server name:" + serverName);
+              }
+              startIndex = i + 1;
+            }
+            seenCommaCnt = 0;
+          }
+          break;
+        default:
+          break;
+        }
+      }
+      // add tail
+      if (startIndex < len - 1) {
+        String serverName = deadServerListStr.substring(startIndex, len);
+        if (ServerName.isFullServerName(serverName)) {
+          this.deadRegionServers.add(serverName);
+        } else {
+          LOG.error("Found invalid server name:" + serverName);
+        }
+      }
     }
   }
 
@@ -509,11 +551,10 @@ public class ReplicationSource extends Thread
           // We didn't find the log in the archive directory, look if it still
           // exists in the dead RS folder (there could be a chain of failures
           // to look at)
-          LOG.info("NB dead servers : " + deadRegionServers.length);
-          for (int i = this.deadRegionServers.length - 1; i >= 0; i--) {
-
+          LOG.info("NB dead servers : " + deadRegionServers.size());
+          for (String curDeadServerName : deadRegionServers) {
             Path deadRsDirectory =
-                new Path(manager.getLogDir().getParent(), this.deadRegionServers[i]);
+                new Path(manager.getLogDir().getParent(), curDeadServerName);
             Path[] locs = new Path[] {
                 new Path(deadRsDirectory, currentPath.getName()),
                 new Path(deadRsDirectory.suffix(HLog.SPLITTING_EXT),
@@ -525,6 +566,8 @@ public class ReplicationSource extends Thread
                 // We found the right new location
                 LOG.info("Log " + this.currentPath + " still exists at " +
                     possibleLogLocation);
+                // Sleep for a fixed interval to give log splitting time to finish
+                this.sleepForRetries("waiting for log splitting is done", 1);
                 // Breaking here will make us sleep since reader is null
                 return true;
               }