diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
index e505694..4c877cb 100644
--- a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
+++ b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java
@@ -135,7 +135,7 @@ import com.google.common.collect.Lists;
  */
 public class HRegion implements HeapSize { // , Writable{
   public static final Log LOG = LogFactory.getLog(HRegion.class);
-  static final String MERGEDIR = "merges";
+  private static final String MERGEDIR = ".merges";
 
   final AtomicBoolean closed = new AtomicBoolean(false);
   /* Closing can take some time; use the closing flag if there is stuff we don't
diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
index 199638e..892da35 100644
--- a/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
+++ b/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
@@ -568,6 +568,7 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
       if (tryReportForDuty()) break;
     }
     long lastMsg = 0;
+    long oldRequestCount = -1;
     List<HMsg> outboundMessages = new ArrayList<HMsg>();
     // The main run loop.
     for (int tries = 0; !this.stopped && isHealthy();) {
@@ -577,7 +578,22 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
         } else if (!this.stopping) {
           this.stopping = true;
           closeUserRegions(this.abortRequested);
-        } else if (this.stopping && LOG.isDebugEnabled()) {
+        } else if (this.stopping) {
+          boolean allUserRegionsOffline = areAllUserRegionsOffline();
+          if (allUserRegionsOffline) {
+            // Set stopped if no requests since last time we went around the loop.
+            // The remaining meta regions will be closed on our way out.
+            if (oldRequestCount == this.requestCount.get()) {
+              stop("Stopped; only catalog regions remaining online");
+              break;
+            }
+            oldRequestCount = this.requestCount.get();
+          } else {
+            // Make sure all regions have been closed -- some regions may
+            // have not got it because we were splitting at the time of
+            // the call to closeUserRegions.
+            closeUserRegions(this.abortRequested);
+          }
           LOG.debug("Waiting on " + getOnlineRegionsAsPrintableString());
         }
       }
@@ -725,6 +741,18 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
     }
   }
 
+  private boolean areAllUserRegionsOffline() {
+    if (getNumberOfOnlineRegions() > 2) return false;
+    boolean allUserRegionsOffline = true;
+    for (Map.Entry<String, HRegion> e: this.onlineRegions.entrySet()) {
+      if (!e.getValue().getRegionInfo().isMetaRegion()) {
+        allUserRegionsOffline = false;
+        break;
+      }
+    }
+    return allUserRegionsOffline;
+  }
+
   List<HMsg> tryRegionServerReport(final List<HMsg> outboundMessages)
   throws IOException {
     this.serverInfo.setLoad(buildServerLoad());
@@ -1618,6 +1646,8 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
 
   /**
    * Schedule closes on all user regions.
+   * Should be safe calling multiple times because it won't close regions
+   * that are already closed or that are closing.
   * @param abort Whether we're running an abort.
   */
  void closeUserRegions(final boolean abort) {
@@ -1626,6 +1656,7 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
      for (Map.Entry<String, HRegion> e: this.onlineRegions.entrySet()) {
        HRegion r = e.getValue();
        if (!r.getRegionInfo().isMetaRegion()) {
+          if (r.isClosed() || r.isClosing()) continue;
          // Don't update zk with this close transition; pass false.
          closeRegion(r.getRegionInfo(), abort, false);
        }
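The run-loop change above quiesces the server in two steps: keep re-issuing closeUserRegions() until only catalog regions remain online, then stop once the request counter has not moved since the previous pass. A minimal, self-contained sketch of that check follows; the Region type and the readyToStop() helper are simplified stand-ins for HRegionServer's own fields and are not part of the patch.

import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

// Simplified model of the shutdown quiesce check added to HRegionServer.run():
// loop until only catalog regions are online AND no new requests have arrived
// since the previous pass, then it is safe to stop.
class QuiesceSketch {
  // Hypothetical stand-in for an online region; only the flag we need here.
  static final class Region {
    private final boolean meta;
    Region(boolean meta) { this.meta = meta; }
    boolean isMetaRegion() { return meta; }
  }

  private final AtomicLong requestCount = new AtomicLong(); // bumped by RPC handlers
  private long oldRequestCount = -1;                        // value seen on the last pass

  boolean areAllUserRegionsOffline(List<Region> online) {
    if (online.size() > 2) return false;    // more than ROOT + META still open
    for (Region r : online) {
      if (!r.isMetaRegion()) return false;  // a user region is still online
    }
    return true;
  }

  /** One pass of the stopping branch; returns true when it is safe to stop. */
  boolean readyToStop(List<Region> online) {
    if (!areAllUserRegionsOffline(online)) {
      // closeUserRegions() would be re-issued here; re-issuing is safe because
      // regions that are already closed or closing are skipped.
      return false;
    }
    if (oldRequestCount == requestCount.get()) return true; // quiet since last pass
    oldRequestCount = requestCount.get();
    return false;
  }
}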
diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java b/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java
new file mode 100644
index 0000000..d279909
--- /dev/null
+++ b/src/main/java/org/apache/hadoop/hbase/regionserver/SplitRequest.java
@@ -0,0 +1,101 @@
+/**
+ * Copyright 2011 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hbase.RemoteExceptionHandler;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.util.StringUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Handles processing region splits. Put in a queue, owned by HRegionServer.
+ */
+class SplitRequest implements Runnable {
+  static final Log LOG = LogFactory.getLog(SplitRequest.class);
+  private final HRegion parent;
+  private final byte[] midKey;
+  private final HRegionServer server;
+
+  SplitRequest(HRegion region, byte[] midKey, HRegionServer hrs) {
+    Preconditions.checkNotNull(hrs);
+    this.parent = region;
+    this.midKey = midKey;
+    this.server = hrs;
+  }
+
+  @Override
+  public String toString() {
+    return "regionName=" + parent + ", midKey=" + Bytes.toStringBinary(midKey);
+  }
+
+  @Override
+  public void run() {
+    if (this.server.isStopping() || this.server.isStopped()) {
+      LOG.debug("Skipping split because server is stopping=" +
+        this.server.isStopping() + " or stopped=" + this.server.isStopped());
+      return;
+    }
+    try {
+      final long startTime = System.currentTimeMillis();
+      SplitTransaction st = new SplitTransaction(parent, midKey);
+      // If prepare does not return true, for some reason -- logged inside in
+      // the prepare call -- we are not ready to split just now. Just return.
+      if (!st.prepare()) return;
+      try {
+        st.execute(this.server, this.server);
+        this.server.getMetrics().incrementSplitSuccessCount();
+      } catch (Exception e) {
+        try {
+          LOG.info("Running rollback/cleanup of failed split of " +
+            parent.getRegionNameAsString() + "; " + e.getMessage());
+          if (st.rollback(this.server, this.server)) {
+            LOG.info("Successful rollback of failed split of " +
+              parent.getRegionNameAsString());
+            this.server.getMetrics().incrementSplitFailureCount();
+          } else {
+            this.server.abort("Abort; we got an error after point-of-no-return");
+          }
+        } catch (RuntimeException ee) {
+          String msg = "Failed rollback of failed split of " +
+            parent.getRegionNameAsString() + " -- aborting server";
+          // If failed rollback, kill this server to avoid having a hole in table.
+          LOG.info(msg, ee);
+          this.server.abort(msg);
+        }
+        return;
+      }
+      LOG.info("Region split, META updated, and report to master. Parent=" +
+        parent.getRegionInfo().getRegionNameAsString() + ", new regions: " +
+        st.getFirstDaughter().getRegionNameAsString() + ", " +
+        st.getSecondDaughter().getRegionNameAsString() + ". Split took " +
+        StringUtils.formatTimeDiff(System.currentTimeMillis(), startTime));
+    } catch (IOException ex) {
+      LOG.error("Split failed " + this, RemoteExceptionHandler
+        .checkIOException(ex));
+      this.server.getMetrics().incrementSplitFailureCount();
+      server.checkFileSystem();
+    }
+  }
+}
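SplitRequest is plain Runnable work, which is what lets the region server queue splits instead of running them inline. The snippet below only illustrates that queuing pattern with a generic single-thread executor; the pool and the fake task are hypothetical, and the patch itself does not show how the server wires its split queue up.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

// Illustration only: queue a Runnable "split request" and drain the queue on shutdown.
public class SplitQueueSketch {
  public static void main(String[] args) throws InterruptedException {
    ExecutorService splitPool = Executors.newSingleThreadExecutor();

    Runnable fakeSplitRequest = new Runnable() {
      @Override
      public void run() {
        // prepare/execute/rollback of a SplitTransaction would run here
        System.out.println("running queued split");
      }
    };
    splitPool.execute(fakeSplitRequest);   // equivalent of queuing a SplitRequest

    splitPool.shutdown();                  // stop accepting new split requests
    splitPool.awaitTermination(10, TimeUnit.SECONDS);
  }
}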
diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/SplitTransaction.java b/src/main/java/org/apache/hadoop/hbase/regionserver/SplitTransaction.java
index 7f1f4a4..372f0c9 100644
--- a/src/main/java/org/apache/hadoop/hbase/regionserver/SplitTransaction.java
+++ b/src/main/java/org/apache/hadoop/hbase/regionserver/SplitTransaction.java
@@ -75,7 +75,7 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder;
  */
 class SplitTransaction {
   private static final Log LOG = LogFactory.getLog(SplitTransaction.class);
-  private static final String SPLITDIR = "splits";
+  private static final String SPLITDIR = ".splits";
 
   /*
    * Region to split
@@ -618,12 +618,18 @@ class SplitTransaction {
       break;
 
     case CLOSED_PARENT_REGION:
-      // So, this returns a seqid but if we just closed and then reopened, we
-      // should be ok. On close, we flushed using sequenceid obtained from
-      // hosting regionserver so no need to propagate the sequenceid returned
-      // out of initialize below up into regionserver as we normally do.
-      // TODO: Verify.
-      this.parent.initialize();
+      try {
+        // So, this returns a seqid but if we just closed and then reopened, we
+        // should be ok. On close, we flushed using sequenceid obtained from
+        // hosting regionserver so no need to propagate the sequenceid returned
+        // out of initialize below up into regionserver as we normally do.
+        // TODO: Verify.
+        this.parent.initialize();
+      } catch (IOException e) {
+        LOG.error("Failed rollbacking CLOSED_PARENT_REGION of region " +
+          this.parent.getRegionNameAsString(), e);
+        throw new RuntimeException(e);
+      }
       break;
 
     case STARTED_REGION_A_CREATION:
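The new hunk wraps the checked IOException from re-opening the parent region because the rollback path cannot propagate it; the caller (SplitRequest above) treats the resulting RuntimeException as a failed rollback and aborts the server. Below is a small sketch of that convention, using a hypothetical ParentRegion stand-in rather than HRegion.

import java.io.IOException;

// Sketch of the rollback convention adopted above: a rollback step may not leak
// checked exceptions, so a failed re-open of the parent region is rethrown as
// unchecked and left to the caller, which then aborts the server.
class RollbackSketch {
  interface ParentRegion {
    void initialize() throws IOException;   // re-open the region closed for the split
  }

  static void rollbackClosedParent(ParentRegion parent) {
    try {
      parent.initialize();
    } catch (IOException e) {
      // Can't recover inside rollback; surface as unchecked so the caller aborts.
      throw new RuntimeException("Failed rollback of CLOSED_PARENT_REGION", e);
    }
  }
}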
diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java b/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java
index e2adaa3..da6a6df 100644
--- a/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java
+++ b/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java
@@ -37,6 +37,7 @@ import org.apache.hadoop.metrics.util.MetricsIntValue;
 import org.apache.hadoop.metrics.util.MetricsLongValue;
 import org.apache.hadoop.metrics.util.MetricsRegistry;
 import org.apache.hadoop.metrics.util.MetricsTimeVaryingRate;
+import org.apache.hadoop.metrics.util.MetricsTimeVaryingLong;
 
 import java.io.IOException;
 import java.lang.management.ManagementFactory;
@@ -185,6 +186,12 @@ public class RegionServerMetrics implements Updater {
   protected final PersistentMetricsTimeVaryingRate flushSize =
     new PersistentMetricsTimeVaryingRate("flushSize", registry);
+
+  public final MetricsTimeVaryingLong regionSplitSuccessCount =
+    new MetricsTimeVaryingLong("regionSplitSuccessCount", registry);
+
+  public final MetricsTimeVaryingLong regionSplitFailureCount =
+    new MetricsTimeVaryingLong("regionSplitFailureCount", registry);
 
   public RegionServerMetrics() {
     MetricsContext context = MetricsUtil.getContext("hbase");
@@ -282,6 +289,8 @@ public class RegionServerMetrics implements Updater {
       this.compactionSize.pushMetric(this.metricsRecord);
       this.flushTime.pushMetric(this.metricsRecord);
       this.flushSize.pushMetric(this.metricsRecord);
+      this.regionSplitSuccessCount.pushMetric(this.metricsRecord);
+      this.regionSplitFailureCount.pushMetric(this.metricsRecord);
     }
     this.metricsRecord.update();
   }
@@ -311,6 +320,13 @@ public class RegionServerMetrics implements Updater {
     }
   }
 
+  public void incrementSplitSuccessCount() {
+    this.regionSplitSuccessCount.inc();
+  }
+
+  public void incrementSplitFailureCount() {
+    this.regionSplitFailureCount.inc();
+  }
   @Override
   public String toString() {
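For reference, the new counters follow the usual Hadoop metrics-v1 lifecycle: register a MetricsTimeVaryingLong against the registry, inc() it when a split succeeds or fails, and pushMetric() it into the metrics record on each update pass. The sketch below shows that lifecycle in isolation; the "regionserver" record name and the standalone main() are assumptions for illustration, not code from this patch.

import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.util.MetricsRegistry;
import org.apache.hadoop.metrics.util.MetricsTimeVaryingLong;

// Minimal sketch of a time-varying long counter: increment on events, then push
// the accumulated delta into a metrics record during the periodic update.
public class SplitCounterSketch {
  public static void main(String[] args) {
    MetricsRegistry registry = new MetricsRegistry();
    MetricsTimeVaryingLong splitSuccess =
      new MetricsTimeVaryingLong("regionSplitSuccessCount", registry);

    splitSuccess.inc();                       // a split completed successfully

    MetricsContext context = MetricsUtil.getContext("hbase");
    MetricsRecord record = MetricsUtil.createRecord(context, "regionserver"); // assumed record name
    splitSuccess.pushMetric(record);          // flush the delta into the record
    record.update();                          // hand the record to the context
  }
}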