diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java index d1fffbe..745570e 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java @@ -125,14 +125,6 @@ public class RegionStates { new HashMap(); /** - * Map a host port pair string to the latest start code - * of a region server which is known to be dead. It is dead - * to us, but server manager may not know it yet. - */ - private final HashMap deadServers = - new HashMap(); - - /** * Map a dead servers to the time when log split is done. * Since log splitting is not ordered, we have to remember * all processed instances. The map is cleaned up based @@ -778,10 +770,6 @@ public class RegionStates { * If so, we should hold re-assign this region till SSH has split its wals. * Once logs are split, the last assignment of this region will be reset, * which means a null last assignment server is ok for re-assigning. - * - * A region server could be dead but we don't know it yet. We may - * think it's online falsely. Therefore if a server is online, we still - * need to confirm it reachable and having the expected start code. */ synchronized boolean wasRegionOnDeadServer(final String encodedName) { ServerName server = lastAssignments.get(encodedName); @@ -791,24 +779,9 @@ public class RegionStates { synchronized boolean isServerDeadAndNotProcessed(ServerName server) { if (server == null) return false; if (serverManager.isServerOnline(server)) { - String hostAndPort = server.getHostAndPort(); - long startCode = server.getStartcode(); - Long deadCode = deadServers.get(hostAndPort); - if (deadCode == null || startCode > deadCode.longValue()) { - if (serverManager.isServerReachable(server)) { - return false; - } - // The size of deadServers won't grow unbounded. - deadServers.put(hostAndPort, Long.valueOf(startCode)); + if (!serverManager.isServerDead(server)) { + return false; } - // Watch out! If the server is not dead, the region could - // remain unassigned. That's why ServerManager#isServerReachable - // should use some retry. - // - // We cache this info since it is very unlikely for that - // instance to come back up later on. We don't want to expire - // the server since we prefer to let it die naturally. - LOG.warn("Couldn't reach online server " + server); } // Now, we know it's dead. Check if it's processed return !processedServers.containsKey(server); diff --git hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 1ed2514..018cc8d 100644 --- hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -61,7 +61,6 @@ import org.apache.hadoop.hbase.protobuf.ResponseConverter; import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService; import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest; import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse; -import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo; import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionServerStartupRequest; import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionStoreSequenceIds; import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId; @@ -70,7 +69,6 @@ import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.RegionOpeningState; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Pair; -import org.apache.hadoop.hbase.util.RetryCounter; import org.apache.hadoop.hbase.util.RetryCounterFactory; import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; @@ -832,7 +830,7 @@ public class ServerManager { * Contacts a region server and waits up to timeout ms * to close the region. This bypasses the active hmaster. */ - public static void closeRegionSilentlyAndWait(ClusterConnection connection, + public static void closeRegionSilentlyAndWait(ClusterConnection connection, ServerName server, HRegionInfo region, long timeout) throws IOException, InterruptedException { AdminService.BlockingInterface rs = connection.getAdmin(server); try { @@ -887,34 +885,6 @@ public class ServerManager { ProtobufUtil.mergeRegions(admin, region_a, region_b, forcible); } - /** - * Check if a region server is reachable and has the expected start code - */ - public boolean isServerReachable(ServerName server) { - if (server == null) throw new NullPointerException("Passed server is null"); - - RetryCounter retryCounter = pingRetryCounterFactory.create(); - while (retryCounter.shouldRetry()) { - try { - AdminService.BlockingInterface admin = getRsAdmin(server); - if (admin != null) { - ServerInfo info = ProtobufUtil.getServerInfo(admin); - return info != null && info.hasServerName() - && server.getStartcode() == info.getServerName().getStartCode(); - } - } catch (IOException ioe) { - LOG.debug("Couldn't reach " + server + ", try=" + retryCounter.getAttemptTimes() - + " of " + retryCounter.getMaxAttempts(), ioe); - try { - retryCounter.sleepUntilNextRetry(); - } catch(InterruptedException ie) { - Thread.currentThread().interrupt(); - } - } - } - return false; - } - /** * @param sn * @return Admin interface for the remote regionserver named sn @@ -1074,9 +1044,19 @@ public class ServerManager { * master any more, for example, a very old previous instance). */ public synchronized boolean isServerDead(ServerName serverName) { - return serverName == null || deadservers.isDeadServer(serverName) - || queuedDeadServers.contains(serverName) - || requeuedDeadServers.containsKey(serverName); + if (serverName == null || deadservers.isDeadServer(serverName) + || queuedDeadServers.contains(serverName) + || requeuedDeadServers.containsKey(serverName)) { + return true; + } + + // we are not acquiring the lock + ServerName onlineServer = findServerWithSameHostnamePortWithLock(serverName); + if (onlineServer != null && serverName.getStartcode() < onlineServer.getStartcode()) { + return true; + } + + return false; } public void shutdownCluster() { diff --git hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java index eb72220..9cf221a 100644 --- hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java +++ hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java @@ -982,12 +982,6 @@ public class TestAssignmentManagerOnCluster { assertTrue(regionStates.isRegionOnline(hri)); assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri)); - // Try to unassign the dead region before SSH - am.unassign(hri); - // The region should be moved to offline since the server is dead - RegionState state = regionStates.getRegionState(hri); - assertTrue(state.isOffline()); - // Kill the hosting server, which doesn't have meta on it. cluster.killRegionServer(oldServerName); cluster.waitForRegionServerToStop(oldServerName, -1); @@ -1061,12 +1055,6 @@ public class TestAssignmentManagerOnCluster { assertTrue(regionStates.isRegionOnline(hri)); assertEquals(oldServerName, regionStates.getRegionServerOfRegion(hri)); - // Try to unassign the dead region before SSH - am.unassign(hri); - // The region should be moved to offline since the server is dead - RegionState state = regionStates.getRegionState(hri); - assertTrue(state.isOffline()); - // Disable the table now. master.disableTable(hri.getTable());