From 6c2634fb29df65229648785a98e59caeb5acca1b Mon Sep 17 00:00:00 2001 From: stack Date: Wed, 29 Oct 2014 12:36:49 -0700 Subject: [PATCH] 12376 HBaseAdmin leaks ZK connections if failure starting watchers (ConnectionLossException) --- .../hadoop/hbase/catalog/CatalogTracker.java | 12 +++++- .../org/apache/hadoop/hbase/client/HBaseAdmin.java | 45 ++++++++++++++----- .../org/apache/hadoop/hbase/client/TestAdmin.java | 50 +++++++++++++++++++++- 3 files changed, 94 insertions(+), 13 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java index 6a7a6ff..09c66cc 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java @@ -17,7 +17,9 @@ */ package org.apache.hadoop.hbase.catalog; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Stopwatch; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.classification.InterfaceAudience; @@ -113,7 +115,7 @@ public class CatalogTracker { private boolean instantiatedzkw = false; private Abortable abortable; - private boolean stopped = false; + private volatile boolean stopped = false; static final byte [] META_REGION_NAME = HRegionInfo.FIRST_META_REGIONINFO.getRegionName(); @@ -203,6 +205,14 @@ public class CatalogTracker { } /** + * @return True if we are stopped. Call only after start else indeterminate answer. + */ + @VisibleForTesting + public boolean isStopped() { + return this.stopped; + } + + /** * Stop working. * Interrupts any ongoing waits. 
*/ diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java index 022c382..01257d4 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/HBaseAdmin.java @@ -25,11 +25,9 @@ import java.io.InterruptedIOException; import java.net.SocketTimeoutException; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.regex.Pattern; @@ -58,10 +56,10 @@ import org.apache.hadoop.hbase.TableNotEnabledException; import org.apache.hadoop.hbase.TableNotFoundException; import org.apache.hadoop.hbase.UnknownRegionException; import org.apache.hadoop.hbase.ZooKeeperConnectionException; -import org.apache.hadoop.hbase.classification.InterfaceAudience; -import org.apache.hadoop.hbase.classification.InterfaceStability; import org.apache.hadoop.hbase.catalog.CatalogTracker; import org.apache.hadoop.hbase.catalog.MetaReader; +import org.apache.hadoop.hbase.classification.InterfaceAudience; +import org.apache.hadoop.hbase.classification.InterfaceStability; import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor; import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase; import org.apache.hadoop.hbase.exceptions.DeserializationException; @@ -103,6 +101,8 @@ import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DeleteTableReques import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DisableTableRequest; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.DispatchMergingRegionsRequest; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.EnableTableRequest; +import 
org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureRequest; +import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureResponse; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetClusterStatusRequest; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetCompletedSnapshotsRequest; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetNamespaceDescriptorRequest; @@ -110,6 +110,8 @@ import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterSta import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetSchemaAlterStatusResponse; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsRequest; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.GetTableDescriptorsResponse; +import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneRequest; +import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneResponse; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneRequest; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsRestoreSnapshotDoneResponse; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsSnapshotDoneRequest; @@ -131,15 +133,11 @@ import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.StopMasterRequest import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.TruncateTableRequest; import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.UnassignRegionRequest; import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException; +import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils; import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException; import org.apache.hadoop.hbase.snapshot.RestoreSnapshotException; import org.apache.hadoop.hbase.snapshot.SnapshotCreationException; import org.apache.hadoop.hbase.snapshot.UnknownSnapshotException; -import 
org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureRequest; -import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.ExecProcedureResponse; -import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneRequest; -import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.IsProcedureDoneResponse; -import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils; import org.apache.hadoop.hbase.util.Addressing; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; @@ -148,6 +146,7 @@ import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.util.StringUtils; import org.apache.zookeeper.KeeperException; +import com.google.common.annotations.VisibleForTesting; import com.google.protobuf.ByteString; import com.google.protobuf.ServiceException; @@ -225,19 +224,43 @@ public class HBaseAdmin implements Abortable, Closeable { * @throws IOException * @see #cleanupCatalogTracker(CatalogTracker) */ - private synchronized CatalogTracker getCatalogTracker() + @VisibleForTesting + synchronized CatalogTracker getCatalogTracker() throws ZooKeeperConnectionException, IOException { + boolean succeeded = false; CatalogTracker ct = null; try { ct = new CatalogTracker(this.conf); - ct.start(); + startCatalogTracker(ct); + succeeded = true; } catch (InterruptedException e) { // Let it out as an IOE for now until we redo all so tolerate IEs throw (InterruptedIOException)new InterruptedIOException("Interrupted").initCause(e); + } finally { + // If we did not succeed but created a catalogtracker, clean it up. CT has a ZK instance + // in it and we'll leak if we don't do the 'stop'. + if (!succeeded && ct != null) { + ct.stop(); + ct = null; + } } return ct; } + /** + * For testing so can intercept the catalog tracker start. 
+ * @param ct the CatalogTracker to start + * @return the started CatalogTracker instance + * @throws IOException + * @throws InterruptedException + */ + @VisibleForTesting + protected CatalogTracker startCatalogTracker(final CatalogTracker ct) + throws IOException, InterruptedException { + ct.start(); + return ct; + } + private void cleanupCatalogTracker(final CatalogTracker ct) { ct.stop(); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAdmin.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAdmin.java index d6ca7be..05d9d5e 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAdmin.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestAdmin.java @@ -33,6 +33,7 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -69,7 +70,13 @@ import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.zookeeper.ZKTableReadOnly; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; -import org.junit.*; +import org.apache.zookeeper.KeeperException; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; import org.junit.experimental.categories.Category; import com.google.protobuf.ServiceException; @@ -115,6 +122,47 @@ public class TestAdmin { } @Test (timeout=300000) + public void testFailedCatalogTrackerGetCleansUpProperly() + throws ZooKeeperConnectionException, IOException { + // An HBaseAdmin that we can make fail when it goes to get catalogtracker. 
+ final AtomicBoolean fail = new AtomicBoolean(false); + final AtomicReference internalCt = new AtomicReference(); + HBaseAdmin doctoredAdmin = new HBaseAdmin(this.admin.getConfiguration()) { + @Override + protected CatalogTracker startCatalogTracker(CatalogTracker ct) + throws IOException, InterruptedException { + internalCt.set(ct); + super.startCatalogTracker(ct); + if (fail.get()) { + throw new IOException("Intentional test fail", + new KeeperException.ConnectionLossException()); + } + return ct; + } + }; + try { + CatalogTracker ct = doctoredAdmin.getCatalogTracker(); + assertFalse(ct.isStopped()); + ct.stop(); + assertTrue(ct.isStopped()); + // Now have mess with our doctored admin and make the start of catalog tracker 'fail'. + fail.set(true); + boolean expectedException = false; + try { + doctoredAdmin.getCatalogTracker(); + } catch (IOException ioe) { + assertTrue(ioe.getCause() instanceof KeeperException.ConnectionLossException); + expectedException = true; + } + if (!expectedException) fail("Didn't get expected exception!"); + // Assert that the internally made ct was properly shutdown. + assertTrue("Internal CatalogTracker not closed down", internalCt.get().isStopped()); + } finally { + doctoredAdmin.close(); + } + } + + @Test (timeout=300000) public void testSplitFlushCompactUnknownTable() throws InterruptedException { final String unknowntable = "fubar"; Exception exception = null; -- 1.8.5.2 (Apple Git-48)