Index: src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java
===================================================================
--- src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java	(revision 1094796)
+++ src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java	(working copy)
@@ -46,13 +46,23 @@
     assertFalse(ds.areDeadServersInProgress());
     final String hostname12345 = "127.0.0.2,12345,4";
     ds.add(hostname12345);
-    // hostname123 should now be evicted
-    assertFalse(ds.isDeadServer(hostname123, false));
-    // but others should still be dead
     assertTrue(ds.isDeadServer(hostname1234, false));
     assertTrue(ds.isDeadServer(hostname12345, false));
     assertTrue(ds.areDeadServersInProgress());
     ds.finish(hostname12345);
     assertFalse(ds.areDeadServersInProgress());
+
+    // Already dead = 127.0.0.1,9090,112321
+    // Coming back alive = 127.0.0.1,9090,223341
+
+    final String deadServer = "127.0.0.1,9090,112321";
+    assertFalse(ds.cleanPreviousInstance(deadServer));
+    ds.add(deadServer);
+    assertTrue(ds.isDeadServer(deadServer));
+    final String deadServerHostComingAlive = "127.0.0.1,9090,223341";
+    assertTrue(ds.cleanPreviousInstance(deadServerHostComingAlive));
+    assertFalse(ds.isDeadServer(deadServer));
+    assertFalse(ds.cleanPreviousInstance(deadServerHostComingAlive));
+
   }
 }
\ No newline at end of file
Index: src/main/java/org/apache/hadoop/hbase/HServerInfo.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/HServerInfo.java	(revision 1094796)
+++ src/main/java/org/apache/hadoop/hbase/HServerInfo.java	(working copy)
@@ -279,4 +279,20 @@
     }
     return false;
   }
+
+  /**
+   * Utility method to excise the start code from a server name
+   * @param inServerName full server name
+   * @return server name less its start code
+   */
+  public static String getServerNameLessStartCode(String inServerName) {
+    if (inServerName != null && inServerName.length() > 0) {
+      int index = inServerName.lastIndexOf(SERVERNAME_SEPARATOR);
+      if (index > 0) {
+        return inServerName.substring(0, index);
+      }
+    }
+    return inServerName;
+  }
+
 }
Index: src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/ServerManager.java	(revision 1094796)
+++ src/main/java/org/apache/hadoop/hbase/master/ServerManager.java	(working copy)
@@ -183,18 +183,30 @@
   }
 
   /**
-   * If this server is on the dead list, reject it with a LeaseStillHeldException
+   * If this server is on the dead list, reject it with a YouAreDeadException.
+   * If it was dead but came back with a new start code, remove the old entry
+   * from the dead list.
    * @param serverName Server name formatted as host_port_startcode.
    * @param what START or REPORT
-   * @throws LeaseStillHeldException
+   * @throws YouAreDeadException
    */
   private void checkIsDead(final String serverName, final String what)
-  throws YouAreDeadException {
-    if (!this.deadservers.isDeadServer(serverName)) return;
-    String message = "Server " + what + " rejected; currently processing " +
-      serverName + " as dead server";
-    LOG.debug(message);
-    throw new YouAreDeadException(message);
+      throws YouAreDeadException {
+    if (this.deadservers.isDeadServer(serverName)) {
+      // host name, port and start code all match with existing one of the
+      // dead servers. So, this server must be dead.
+      String message = "Server " + what + " rejected; currently processing " +
+          serverName + " as dead server";
+      LOG.debug(message);
+      throw new YouAreDeadException(message);
+    }
+
+    if (this.deadservers.cleanPreviousInstance(serverName)) {
+      // This server has now become alive after we marked it as dead.
+      // We removed its previous entry from the dead list to reflect it.
+      LOG.debug("Server " + serverName + " came back up, removed it from the" +
+          " dead servers list");
+    }
   }
 
   /**
Index: src/main/java/org/apache/hadoop/hbase/master/DeadServer.java
===================================================================
--- src/main/java/org/apache/hadoop/hbase/master/DeadServer.java	(revision 1094796)
+++ src/main/java/org/apache/hadoop/hbase/master/DeadServer.java	(working copy)
@@ -42,9 +42,6 @@
    */
   private final Set deadServers = new HashSet();
 
-  /** Linked list of dead servers used to bound size of dead server set */
-  private final List deadServerList = new LinkedList();
-
   /** Maximum number of dead servers to keep track of */
   private final int maxDeadServers;
 
@@ -66,6 +63,27 @@
   }
 
   /**
+   * A dead server that comes back alive has a different start code.
+   * @param newServerName Servername as either host:port or
+   * host,port,startcode.
+   * @return true if this server was dead before and coming back alive again
+   */
+  public synchronized boolean cleanPreviousInstance(final String newServerName) {
+    // Only the first match is removed; returning at once keeps iteration safe.
+    String serverAddress =
+      HServerInfo.getServerNameLessStartCode(newServerName);
+    for (String serverName: deadServers) {
+      String deadServerAddress =
+        HServerInfo.getServerNameLessStartCode(serverName);
+      if (deadServerAddress.equals(serverAddress)) {
+        remove(serverName);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
    * @param serverName Servername as either host:port or
    * host,port,startcode.
    * @param hostAndPortOnly True if serverName is host and
@@ -95,11 +113,6 @@
 
   public synchronized boolean add(String e) {
     this.numProcessing++;
-    // Check to see if we are at capacity for dead servers
-    if (deadServerList.size() == this.maxDeadServers) {
-      deadServers.remove(deadServerList.remove(0));
-    }
-    deadServerList.add(e);
     return deadServers.add(e);
   }
 
@@ -132,7 +145,7 @@
   }
 
   public synchronized boolean remove(Object o) {
-    throw new UnsupportedOperationException();
+    return this.deadServers.remove(o);
   }
 
   public synchronized boolean containsAll(Collection c) {