diff --git hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java index c61f590..e36279f 100644 --- hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java +++ hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java @@ -18,9 +18,10 @@ package org.apache.hadoop.hbase; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; -import com.google.common.collect.Sets; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ClusterManager.ServiceType; @@ -36,6 +37,8 @@ import org.apache.hadoop.hbase.protobuf.generated.MasterMonitorProtos; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Threads; +import com.google.common.collect.Sets; + /** * Manages the interactions with an already deployed distributed cluster (as opposed to * a pseudo-distributed, or mini/local cluster). This is used by integration and system tests. @@ -222,38 +225,64 @@ public class DistributedHBaseCluster extends HBaseCluster { } @Override - public void restoreClusterStatus(ClusterStatus initial) throws IOException { - //TODO: caution: not tested throughly + public boolean restoreClusterStatus(ClusterStatus initial) throws IOException { ClusterStatus current = getClusterStatus(); - //restore masters + LOG.info("Restoring cluster - started"); + + // do a best effort restore + boolean success = true; + success &= restoreMasters(initial, current); + success &= restoreRegionServers(initial, current); + success &= restoreAdmin(); + + LOG.info("Restoring cluster - done"); + return success; + } + protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) { + List deferred = new ArrayList(); //check whether current master has changed if (!ServerName.isSameHostnameAndPort(initial.getMaster(), current.getMaster())) { - LOG.info("Initial active master : " + initial.getMaster().getHostname() + LOG.info("Restoring cluster - Initial active master : " + initial.getMaster().getHostname() + " has changed to : " + current.getMaster().getHostname()); // If initial master is stopped, start it, before restoring the state. // It will come up as a backup master, if there is already an active master. - if (!clusterManager.isRunning(ServiceType.HBASE_MASTER, initial.getMaster().getHostname())) { - startMaster(initial.getMaster().getHostname()); - } + try { + if (!clusterManager.isRunning(ServiceType.HBASE_MASTER, initial.getMaster().getHostname())) { + LOG.info("Restoring cluster - starting initial active master at:" + initial.getMaster().getHostname()); + startMaster(initial.getMaster().getHostname()); + } - //master has changed, we would like to undo this. - //1. Kill the current backups - //2. Stop current master - //3. Start backup masters - for (ServerName currentBackup : current.getBackupMasters()) { - if (!ServerName.isSameHostnameAndPort(currentBackup, initial.getMaster())) { - stopMaster(currentBackup); + //master has changed, we would like to undo this. + //1. Kill the current backups + //2. Stop current master + //3. Start backup masters + for (ServerName currentBackup : current.getBackupMasters()) { + if (!ServerName.isSameHostnameAndPort(currentBackup, initial.getMaster())) { + LOG.info("Restoring cluster - stopping backup master: " + currentBackup); + stopMaster(currentBackup); + } } + LOG.info("Restoring cluster - stopping active master: " + current.getMaster()); + stopMaster(current.getMaster()); + waitForActiveAndReadyMaster(); //wait so that active master takes over + } catch (IOException ex) { + // if we fail to start the initial active master, we do not want to continue stopping + // backup masters. Just keep what we have now + deferred.add(ex); } - stopMaster(current.getMaster()); - waitForActiveAndReadyMaster(); //wait so that active master takes over + //start backup masters for (ServerName backup : initial.getBackupMasters()) { - //these are not started in backup mode, but we should already have an active master - if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, backup.getHostname())) { - startMaster(backup.getHostname()); + try { + //these are not started in backup mode, but we should already have an active master + if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, backup.getHostname())) { + LOG.info("Restoring cluster - starting initial backup master: " + backup.getHostname()); + startMaster(backup.getHostname()); + } + } catch (IOException ex) { + deferred.add(ex); } } } else { @@ -269,19 +298,38 @@ public class DistributedHBaseCluster extends HBaseCluster { } for (String hostname : Sets.difference(initialBackups.keySet(), currentBackups.keySet())) { - if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) { - startMaster(hostname); + try { + if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) { + LOG.info("Restoring cluster - starting initial backup master: " + hostname); + startMaster(hostname); + } + } catch (IOException ex) { + deferred.add(ex); } } for (String hostname : Sets.difference(currentBackups.keySet(), initialBackups.keySet())) { - if(clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) { - stopMaster(currentBackups.get(hostname)); + try { + if(clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) { + LOG.info("Restoring cluster - stopping backup master: " + hostname); + stopMaster(currentBackups.get(hostname)); + } + } catch (IOException ex) { + deferred.add(ex); } } } + if (!deferred.isEmpty()) { + LOG.warn("Restoring cluster - restoring region servers reported " + deferred.size() + " errors:"); + for (int i=0; i initialServers = new HashMap(); HashMap currentServers = new HashMap(); @@ -292,17 +340,39 @@ public class DistributedHBaseCluster extends HBaseCluster { currentServers.put(server.getHostname(), server); } + List deferred = new ArrayList(); for (String hostname : Sets.difference(initialServers.keySet(), currentServers.keySet())) { - if(!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) { - startRegionServer(hostname); + try { + if(!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) { + LOG.info("Restoring cluster - starting initial region server: " + hostname); + startRegionServer(hostname); + } + } catch (IOException ex) { + deferred.add(ex); } } for (String hostname : Sets.difference(currentServers.keySet(), initialServers.keySet())) { - if(clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) { - stopRegionServer(currentServers.get(hostname)); + try { + if(clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) { + LOG.info("Restoring cluster - stopping initial region server: " + hostname); + stopRegionServer(currentServers.get(hostname)); + } + } catch (IOException ex) { + deferred.add(ex); } } + if (!deferred.isEmpty()) { + LOG.warn("Restoring cluster - restoring region servers reported " + deferred.size() + " errors:"); + for (int i=0; i