diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitLogManagerCoordination.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitLogManagerCoordination.java
index 07986f2..391efca 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitLogManagerCoordination.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/coordination/ZKSplitLogManagerCoordination.java
@@ -297,7 +297,8 @@ public class ZKSplitLogManagerCoordination extends ZooKeeperListener implements
    */
   @Override
   public void removeRecoveringRegions(final Set<String> recoveredServerNameSet,
-      Boolean isMetaRecovery) throws IOException {
+      Boolean isMetaRecovery)
+      throws IOException {
     final String metaEncodeRegionName = HRegionInfo.FIRST_META_REGIONINFO.getEncodedName();
     int count = 0;
     try {
@@ -312,16 +313,20 @@ public class ZKSplitLogManagerCoordination extends ZooKeeperListener implements
       }
       if (count == 0 && this.details.getMaster().isInitialized()
           && !this.details.getMaster().getServerManager().areDeadServersInProgress()) {
-        // no splitting work items left
+        // No splitting work items left
        ZKSplitLog.deleteRecoveringRegionZNodes(watcher, null);
         // reset lastRecoveringNodeCreationTime because we cleared all recovering znodes at
         // this point.
         lastRecoveringNodeCreationTime = Long.MAX_VALUE;
       } else if (!recoveredServerNameSet.isEmpty()) {
-        // remove recovering regions which doesn't have any RS associated with it
+        // Remove recovering regions which don't have any RS associated with it
         List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.recoveringRegionsZNode);
         if (regions != null) {
           int listSize = regions.size();
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Processing recovering " + regions + " and servers "
+                + recoveredServerNameSet + ", isMetaRecovery=" + isMetaRecovery);
+          }
           for (int i = 0; i < listSize; i++) {
             String region = regions.get(i);
             if (isMetaRecovery != null) {
@@ -341,8 +346,8 @@ public class ZKSplitLogManagerCoordination extends ZooKeeperListener implements
             if (recoveredServerNameSet.containsAll(failedServers)) {
               ZKUtil.deleteNodeRecursively(watcher, nodePath);
             } else {
-              listSize = failedServers.size();
-              for (int j = 0; j < listSize; j++) {
+              int size = failedServers.size();
+              for (int j = 0; j < size; j++) {
                 String failedServer = failedServers.get(j);
                 if (recoveredServerNameSet.contains(failedServer)) {
                   String tmpPath = ZKUtil.joinZNode(nodePath, failedServer);
@@ -644,9 +649,10 @@ public class ZKSplitLogManagerCoordination extends ZooKeeperListener implements
         }
         ZKUtil.createSetData(this.watcher, nodePath,
           ZKUtil.regionSequenceIdsToByteArray(lastSequenceId, null));
-        LOG.debug("Mark region " + regionEncodeName + " recovering from failed region server "
-            + serverName);
-
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Marked " + regionEncodeName + " as recovering from " + serverName
+              + ": " + nodePath);
+        }
         // break retry loop
         break;
       } catch (KeeperException e) {
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
index 3d474f2..1d06a8b 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@@ -855,14 +855,15 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
       new LogCleaner(cleanerInterval, this, conf, getMasterFileSystem().getFileSystem(),
          getMasterFileSystem().getOldLogDir());
-    Threads.setDaemonThreadRunning(logCleaner.getThread(), getName() + ".oldLogCleaner");
+    Threads.setDaemonThreadRunning(logCleaner.getThread(),
+        getServerName().toShortString() + ".oldLogCleaner");
 
     //start the hfile archive cleaner thread
     Path archiveDir = HFileArchiveUtil.getArchivePath(conf);
     this.hfileCleaner = new HFileCleaner(cleanerInterval, this, conf, getMasterFileSystem()
         .getFileSystem(), archiveDir);
     Threads.setDaemonThreadRunning(hfileCleaner.getThread(),
-      getName() + ".archivedHFileCleaner");
+      getServerName().toShortString() + ".archivedHFileCleaner");
 
     serviceStarted = true;
 
     if (LOG.isTraceEnabled()) {
@@ -1287,7 +1288,7 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
           status.cleanup();
         }
       }
-    }, "ActiveMasterManager"));
+    }, getServerName().toShortString() + ".activeMasterManager"));
   }
 
   private void checkCompression(final HTableDescriptor htd)
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
index 3ec523d..97ac02c 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java
@@ -1,5 +1,5 @@
 /**
- * Licensed to the Apache Software Foundation (ASF) under one 
+ * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements. See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership. The ASF licenses this file
@@ -228,13 +228,14 @@ public class SplitLogManager {
    */
   public long splitLogDistributed(final Set<ServerName> serverNames, final List<Path> logDirs,
       PathFilter filter) throws IOException {
-    MonitoredTask status =
-        TaskMonitor.get().createStatus("Doing distributed log split in " + logDirs);
+    MonitoredTask status = TaskMonitor.get().createStatus("Doing distributed log split in "
+        + logDirs + " for serverName=" + serverNames);
     FileStatus[] logfiles = getFileList(logDirs, filter);
     status.setStatus("Checking directory contents...");
     LOG.debug("Scheduling batch of logs to split");
     SplitLogCounters.tot_mgr_log_split_batch_start.incrementAndGet();
-    LOG.info("started splitting " + logfiles.length + " logs in " + logDirs);
+    LOG.info("started splitting " + logfiles.length + " logs in " + logDirs
+        + " for " + serverNames);
     long t = EnvironmentEdgeManager.currentTime();
     long totalSize = 0;
     TaskBatch batch = new TaskBatch();
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
index cb6eada..e85ca20 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
@@ -132,8 +132,7 @@ public class ServerShutdownHandler extends EventHandler {
       // the dead server for further processing too.
       AssignmentManager am = services.getAssignmentManager();
       ServerManager serverManager = services.getServerManager();
-      if (isCarryingMeta() // hbase:meta
-          || !am.isFailoverCleanupDone()) {
+      if (isCarryingMeta() /* hbase:meta */ || !am.isFailoverCleanupDone()) {
         serverManager.processDeadServer(serverName, this.shouldSplitHlog);
         return;
       }
@@ -182,12 +181,14 @@ public class ServerShutdownHandler extends EventHandler {
 
       try {
         if (this.shouldSplitHlog) {
-          LOG.info("Splitting logs for " + serverName + " before assignment.");
           if (distributedLogReplay) {
-            LOG.info("Mark regions in recovery before assignment.");
+            LOG.info("Mark regions in recovery for crashed server " + serverName
+                + " before assignment; regions=" + hris);
             MasterFileSystem mfs = this.services.getMasterFileSystem();
             mfs.prepareLogReplay(serverName, hris);
           } else {
+            LOG.info("Splitting logs for " + serverName
+                + " before assignment; region count=" + hris.size());
             this.services.getMasterFileSystem().splitLog(serverName);
           }
           am.getRegionStates().logSplit(serverName);
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
index e362a17..f5695a8 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
@@ -6053,9 +6053,10 @@ public class HRegion implements HeapSize { // , Writable{
     case BATCH_MUTATE:
     case COMPACT_REGION:
       // when a region is in recovering state, no read, split or merge is allowed
-      if (this.isRecovering() && (this.disallowWritesInRecovering ||
+      if (isRecovering() && (this.disallowWritesInRecovering ||
              (op != Operation.PUT && op != Operation.DELETE && op != Operation.BATCH_MUTATE))) {
-        throw new RegionInRecoveryException(this.getRegionNameAsString() + " is recovering");
+        throw new RegionInRecoveryException(this.getRegionNameAsString()
+            + " is recovering; cannot take reads");
       }
       break;
     default:
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java
index 1254ba5..18aa8aa 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java
@@ -1331,13 +1331,13 @@ public class RSRpcServices implements HBaseRPCErrorHandler,
         // check if the region to be opened is marked in recovering state in ZK
         if (ZKSplitLog.isRegionMarkedRecoveringInZK(regionServer.getZooKeeper(),
             region.getEncodedName())) {
-          // check if current region open is for distributedLogReplay. This check is to support
+          // Check if current region open is for distributedLogReplay. This check is to support
           // rolling restart/upgrade where we want to Master/RS see same configuration
           if (!regionOpenInfo.hasOpenForDistributedLogReplay()
              || regionOpenInfo.getOpenForDistributedLogReplay()) {
            regionServer.recoveringRegions.put(region.getEncodedName(), null);
          } else {
-            // remove stale recovery region from ZK when we open region not for recovering which
+            // Remove stale recovery region from ZK when we open region not for recovering which
             // could happen when turn distributedLogReplay off from on.
             List<String> tmpRegions = new ArrayList<String>();
             tmpRegions.add(region.getEncodedName());
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
index 41cb97f..94864ba 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java
@@ -170,7 +170,7 @@ public class SplitLogWorker implements Runnable {
    * start the SplitLogWorker thread
    */
   public void start() {
-    worker = new Thread(null, this, "SplitLogWorker-" + server.getServerName());
+    worker = new Thread(null, this, "SplitLogWorker-" + server.getServerName().toShortString());
     worker.start();
   }
 
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoveringRegionWatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoveringRegionWatcher.java
index b0e7105..a07bd2f 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoveringRegionWatcher.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoveringRegionWatcher.java
@@ -63,7 +63,7 @@ public class RecoveringRegionWatcher extends ZooKeeperListener {
       region.setRecovering(false);
     }
 
-    LOG.info(path + " znode deleted. Region: " + regionName + " completes recovery.");
+    LOG.info(path + " deleted; " + regionName + " recovered.");
   }
 
   @Override
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RegionServerTracker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RegionServerTracker.java
index 0b8846c..025d98e 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RegionServerTracker.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/zookeeper/RegionServerTracker.java
@@ -86,13 +86,13 @@ public class RegionServerTracker extends ZooKeeperListener {
       try {
         String nodePath = ZKUtil.joinZNode(watcher.rsZNode, n);
         byte[] data = ZKUtil.getData(watcher, nodePath);
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("RS node: " + nodePath + " data: " + Bytes.toString(data));
-        }
         if (data != null && data.length > 0 && ProtobufUtil.isPBMagicPrefix(data)) {
           int magicLen = ProtobufUtil.lengthOfPBMagic();
           rsInfoBuilder.mergeFrom(data, magicLen, data.length - magicLen);
         }
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Added tracking of RS " + nodePath);
+        }
       } catch (KeeperException e) {
         LOG.warn("Get Rs info port from ephemeral node", e);
       } catch (IOException e) {
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java
index 3c845fd..e936ace 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java
@@ -529,8 +529,7 @@ public class TestDistributedLogSplitting {
     });
 
     Thread.sleep(2000);
-    LOG.info("Current Open Regions:"
-        + HBaseTestingUtility.getAllOnlineRegions(cluster).size());
+    LOG.info("Current Open Regions:" + HBaseTestingUtility.getAllOnlineRegions(cluster).size());
 
     // wait for all regions are fully recovered
     TEST_UTIL.waitFor(180000, 200, new Waiter.Predicate<Exception>() {
@@ -538,7 +537,11 @@ public class TestDistributedLogSplitting {
       public boolean evaluate() throws Exception {
         List<String> recoveringRegions = zkw.getRecoverableZooKeeper().getChildren(
           zkw.recoveringRegionsZNode, false);
-        return (recoveringRegions != null && recoveringRegions.size() == 0);
+        boolean done = recoveringRegions != null && recoveringRegions.size() == 0;
+        if (!done) {
+          LOG.info("Recovering regions: " + recoveringRegions);
+        }
+        return done;
       }
     });