From ab07b301e184811913c04385fbe338566402738d Mon Sep 17 00:00:00 2001 From: Balazs Meszaros Date: Tue, 16 Jan 2018 21:35:12 -0800 Subject: [PATCH] HBASE-19598 Fix TestAssignmentManagerMetrics flaky test Root issue is that Master was stuck in waitForMasterActive, regions were being assigned to Master, and the metrics we were expecting were incorrect (if the killed regionserver was hosting user-space and hbase:meta region). Master never left waitForMasterActive because it never checked state of the clusterUp flag. The test here was aborting regionserver and then just exiting. The minihbasecluster shutdown sets the cluster down flag but we were never looking at it so Master thread was staying up. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java Changed log from ERROR to WARN and suppressed stack trace. This is the 'stop' method. It should allow that we may be going down a little unclean. No need of spew in logs. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java The tableOnMaster check in waitForMasterActive looks wrong. It was making it so a 'normal' Master was getting stuck in here. This is not the place to worry about tablesOnMaster. That is for the balancer to be concerned with. There is a problem with Master hosting system-tables-only. After further study, Master can carry regions like a regionserver but making it so it carries system tables only is tricky given meta assign happens ahead of all others which means that the Master needs to have checked-in as a regionserver super early... It needs work. Punted for now. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java Mostly renaming so lists and maps of region infos have same name as they have elsewhere in code base and cleaning up confusion that may arise when we talk of servers-for-system-tables....It is talking about something else in the code changes here that is other than the normal understanding. It is about filtering regionservers by their version numbers so we favor regions with higher version numbers. Needs to go back up into the balancer. M hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java It was possible for the Master to be given regions if no regionservers available (as per the failing unit test in this case). M hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Minor reordering moving the waitForMasterActive later in the initialize and wrapping each test in a check if we are to keep looping (which checks cluster status flag). M hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java This was an old test from the days when Master carried system tables. Updated test and fixed metrics. Metrics count the hbase:meta along with the userspace region so upped expected numbers (previously the hbase:meta was hosted on the master so metrics were not incremented). M hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java I took a look at this test again but nope, needs a load of work still to make it pass. M hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java Stop being so whiney. --- .../hadoop/hbase/procedure2/ProcedureExecutor.java | 2 +- .../org/apache/hadoop/hbase/ipc/RpcExecutor.java | 2 +- .../hadoop/hbase/master/ActiveMasterManager.java | 2 +- .../org/apache/hadoop/hbase/master/HMaster.java | 10 ++- .../apache/hadoop/hbase/master/ServerManager.java | 2 +- .../hadoop/hbase/master/SplitLogManager.java | 6 +- .../hbase/master/assignment/AssignmentManager.java | 81 +++++++++++----------- .../hbase/master/balancer/BaseLoadBalancer.java | 10 ++- .../hadoop/hbase/regionserver/HRegionServer.java | 34 ++++----- .../hadoop/hbase/TestClientClusterMetrics.java | 8 ++- .../hadoop/hbase/TestClientClusterStatus.java | 2 +- .../hbase/master/TestAssignmentManagerMetrics.java | 37 +++++----- .../balancer/TestRegionsOnMasterOptions.java | 5 +- .../org/apache/hadoop/hbase/zookeeper/ZKUtil.java | 10 +-- 14 files changed, 106 insertions(+), 105 deletions(-) diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index 069bf2832c..7e052c4836 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -1751,7 +1751,7 @@ public class ProcedureExecutor { } catch (Throwable t) { LOG.warn("Worker terminating UNNATURALLY " + this.activeProcedure, t); } finally { - LOG.debug("Worker terminated."); + LOG.trace("Worker terminated."); } workerThreads.remove(this); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcExecutor.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcExecutor.java index fa30d0dc71..9123d7ea07 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcExecutor.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcExecutor.java @@ -327,7 +327,7 @@ public abstract class RpcExecutor { int failedCount = failedHandlerCount.incrementAndGet(); if (this.handlerFailureThreshhold >= 0 && failedCount > handlerCount * this.handlerFailureThreshhold) { - String message = "Number of failed RpcServer handler runs exceeded threshhold " + String message = "Number of failed RpcServer handler runs exceeded threshold " + this.handlerFailureThreshhold + "; reason: " + StringUtils.stringifyException(e); if (abortable != null) { abortable.abort(message, e); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java index 62073db863..a7384aa4f7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java @@ -277,7 +277,7 @@ public class ActiveMasterManager extends ZKListener { ZNodeClearer.deleteMyEphemeralNodeOnDisk(); } } catch (KeeperException e) { - LOG.error(this.watcher.prefix("Error deleting our own master address node"), e); + LOG.debug(this.watcher.prefix("failed cleanup (already cleared?); " + e.getMessage())); } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 0e626ce6ff..ba95bd6ef9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -652,14 +652,12 @@ public class HMaster extends HRegionServer implements MasterServices { } /** - * If configured to put regions on active master, - * wait till a backup master becomes active. - * Otherwise, loop till the server is stopped or aborted. + * Wait here if backup Master. This avoids showing backup masters as regionservers in master + * web UI, or assigning any region to them. */ @Override - protected void waitForMasterActive(){ - boolean tablesOnMaster = LoadBalancer.isTablesOnMaster(conf); - while (!(tablesOnMaster && activeMaster) && !isStopped() && !isAborted()) { + protected void waitForMasterActive() { + while (!this.activeMaster && keepLooping()) { sleeper.sleep(); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 3db60335d8..47b072e28c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -888,7 +888,7 @@ public class ServerManager { if (isClusterShutdown()) { this.master.stop("Cluster shutdown"); } - LOG.info("Finished wait on RegionServer count=" + count + "; waited=" + slept + "ms," + + LOG.info("RegionServer count=" + count + "; waited=" + slept + "ms," + " expected min=" + minToStart + " server(s), max=" + getStrForMax(maxToStart) + " server(s),"+ " master is "+ (this.master.isStopped() ? "stopped.": "running")); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java index d1c1612f2e..0d144362ab 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java @@ -254,9 +254,7 @@ public class SplitLogManager { if (batch.done != batch.installed) { batch.isDead = true; SplitLogCounters.tot_mgr_log_split_batch_err.increment(); - LOG.warn("error while splitting logs in " + logDirs + " installed = " + batch.installed - + " but only " + batch.done + " done"); - String msg = "error or interrupted while splitting logs in " + logDirs + " Task = " + batch; + String msg = "Error or interrupted while splitting WALs in " + logDirs + "; task=" + batch; status.abort(msg); throw new IOException(msg); } @@ -470,7 +468,7 @@ public class SplitLogManager { @Override public String toString() { - return ("installed = " + installed + " done = " + done + " error = " + error); + return ("installed=" + installed + ", done=" + done + ", error=" + error); } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 1b8e757754..19cb0aa39f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -1711,93 +1711,93 @@ public class AssignmentManager implements ServerListener { } // TODO: Optimize balancer. pass a RegionPlan? - final HashMap retainMap = new HashMap<>(); - final List userRRList = new ArrayList<>(); - // regions for system tables requiring reassignment - final List sysRRList = new ArrayList<>(); + final HashMap retainPlan = new HashMap<>(); + final List userRegionInfos = new ArrayList<>(); + // Regions for system tables requiring reassignment + final List systemRegionInfos = new ArrayList<>(); for (RegionStateNode regionNode : regions.values()) { boolean sysTable = regionNode.isSystemTable(); - final List rrList = sysTable ? sysRRList : userRRList; - + final List regionInfos = sysTable? systemRegionInfos: userRegionInfos; if (regionNode.getRegionLocation() != null) { - retainMap.put(regionNode.getRegionInfo(), regionNode.getRegionLocation()); + retainPlan.put(regionNode.getRegionInfo(), regionNode.getRegionLocation()); } else { - rrList.add(regionNode.getRegionInfo()); + regionInfos.add(regionNode.getRegionInfo()); } } // TODO: connect with the listener to invalidate the cache - // TODO use events + List servers = master.getServerManager().createDestinationServersList(); for (int i = 0; servers.size() < 1; ++i) { if (i % 4 == 0) { - LOG.warn("no server available, unable to find a location for " + regions.size() + - " unassigned regions. waiting"); + // Log every 4th time; we wait 250ms below so means every second. + LOG.warn("No server available; unable to find a location for " + regions.size() + + " regions. waiting..."); } - - // the was AM killed if (!isRunning()) { - LOG.debug("aborting assignment-queue with " + regions.size() + " not assigned"); + LOG.debug("Aborting assignment-queue with " + regions.size() + " unassigned"); return; } - Threads.sleep(250); + // Refresh server list. servers = master.getServerManager().createDestinationServersList(); } - if (!sysRRList.isEmpty()) { - // system table regions requiring reassignment are present, get region servers - // not available for system table regions + if (!systemRegionInfos.isEmpty()) { + // System table regions requiring reassignment are present, get region servers + // not available for system table regions. Here we are filtering out any regionservers + // that might be running older versions of the RegionServer; we want system tables on any + // newer servers that may be present. Newer servers means we are probably doing a rolling + // upgrade. final List excludeServers = getExcludedServersForSystemTable(); List serversForSysTables = servers.stream() .filter(s -> !excludeServers.contains(s)).collect(Collectors.toList()); if (serversForSysTables.isEmpty()) { - LOG.warn("No servers available for system table regions, considering all servers!"); + LOG.warn("All servers excluded! Considering all servers!"); } - LOG.debug("Processing assignment plans for System tables sysServersCount=" + - serversForSysTables.size() + ", allServersCount=" + servers.size()); - processAssignmentPlans(regions, null, sysRRList, - serversForSysTables.isEmpty() ? servers : serversForSysTables); + LOG.debug("Candidate servers to host system regions=" + serversForSysTables.size() + + "; totalServersCount=" + servers.size()); + processAssignmentPlans(regions, null, systemRegionInfos, + serversForSysTables.isEmpty()? servers: serversForSysTables); } - processAssignmentPlans(regions, retainMap, userRRList, servers); + processAssignmentPlans(regions, retainPlan, userRegionInfos, servers); } private void processAssignmentPlans(final HashMap regions, - final HashMap retainMap, final List rrList, + final HashMap retain, final List hris, final List servers) { boolean isTraceEnabled = LOG.isTraceEnabled(); if (isTraceEnabled) { - LOG.trace("available servers count=" + servers.size() + ": " + servers); + LOG.trace("Available servers count=" + servers.size() + ": " + servers); } - final LoadBalancer balancer = getBalancer(); // ask the balancer where to place regions - if (retainMap != null && !retainMap.isEmpty()) { + if (retain != null && !retain.isEmpty()) { if (isTraceEnabled) { - LOG.trace("retain assign regions=" + retainMap); + LOG.trace("Retain assign regions=" + retain); } try { - acceptPlan(regions, balancer.retainAssignment(retainMap, servers)); + acceptPlan(regions, balancer.retainAssignment(retain, servers)); } catch (HBaseIOException e) { LOG.warn("unable to retain assignment", e); - addToPendingAssignment(regions, retainMap.keySet()); + addToPendingAssignment(regions, retain.keySet()); } } // TODO: Do we need to split retain and round-robin? // the retain seems to fallback to round-robin/random if the region is not in the map. - if (!rrList.isEmpty()) { - Collections.sort(rrList, RegionInfo.COMPARATOR); + if (!hris.isEmpty()) { + Collections.sort(hris, RegionInfo.COMPARATOR); if (isTraceEnabled) { - LOG.trace("round robin regions=" + rrList); + LOG.trace("Round-robin regions=" + hris); } try { - acceptPlan(regions, balancer.roundRobinAssignment(rrList, servers)); + acceptPlan(regions, balancer.roundRobinAssignment(hris, servers)); } catch (HBaseIOException e) { - LOG.warn("unable to round-robin assignment", e); - addToPendingAssignment(regions, rrList); + LOG.warn("Unable to round-robin assignment", e); + addToPendingAssignment(regions, hris); } } } @@ -1808,7 +1808,7 @@ public class AssignmentManager implements ServerListener { final long st = System.currentTimeMillis(); if (plan == null) { - throw new HBaseIOException("unable to compute plans for regions=" + regions.size()); + throw new HBaseIOException("Unable to compute plans for " + regions.size() + " regions"); } if (plan.isEmpty()) return; @@ -1844,8 +1844,9 @@ public class AssignmentManager implements ServerListener { } /** - * Get a list of servers that this region can not assign to. - * For system table, we must assign them to a server with highest version. + * Get a list of servers that this region can NOT be assigned to. + * For system tables, we must assign them to a server with highest version (rolling upgrade + * scenario). */ public List getExcludedServersForSystemTable() { List> serverList = master.getServerManager().getOnlineServersList() diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java index a8dd9aeb7f..c32d50dbed 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/BaseLoadBalancer.java @@ -1229,10 +1229,16 @@ public abstract class BaseLoadBalancer implements LoadBalancer { if (regions == null || regions.isEmpty()) { return assignments; } + if (!this.tablesOnMaster) { + // Make sure Master is not in set of possible servers. + if (servers != null && !servers.isEmpty()) { + servers.remove(this.masterServerName); + } + } - int numServers = servers == null ? 0 : servers.size(); + int numServers = servers == null? 0: servers.size(); if (numServers == 0) { - LOG.warn("Wanted to do round robin assignment but no servers to assign to"); + LOG.warn("Wanted to round-robin assignment but no server(s) to assign to."); return null; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index 516a77ea87..84f5dfb3bb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -870,22 +870,15 @@ public class HRegionServer extends HasThread implements } } - // In case colocated master, wait here till it's active. - // So backup masters won't start as regionservers. - // This is to avoid showing backup masters as regionservers - // in master web UI, or assigning any region to them. - waitForMasterActive(); - if (isStopped() || isAborted()) { - return; // No need for further initialization - } - - // watch for snapshots and other procedures - try { - rspmHost = new RegionServerProcedureManagerHost(); - rspmHost.loadProcedures(conf); - rspmHost.initialize(this); - } catch (KeeperException e) { - this.abort("Failed to reach coordination cluster when creating procedure handler.", e); + // Watch for snapshots and other procedures. Check we have not been stopped before proceeding. + if (keepLooping()) { + try { + rspmHost = new RegionServerProcedureManagerHost(); + rspmHost.loadProcedures(conf); + rspmHost.initialize(this); + } catch (KeeperException e) { + this.abort("Failed setup of RegionServerProcedureManager.", e); + } } } @@ -926,7 +919,10 @@ public class HRegionServer extends HasThread implements } try { - if (!isStopped() && !isAborted()) { + // If we are backup server instance, wait till we become active master before proceeding. + waitForMasterActive(); + + if (keepLooping()) { ShutdownHook.install(conf, fs, this, Thread.currentThread()); // Initialize the RegionServerCoprocessorHost now that our ephemeral // node was created, in case any coprocessors want to use ZooKeeper @@ -2147,7 +2143,7 @@ public class HRegionServer extends HasThread implements */ public void stop(final String msg, final boolean force, final User user) { if (!this.stopped) { - LOG.info("***** STOPPING region server '" + this + "' *****"); + LOG.info("STOPPING server '" + this); if (this.rsHost != null) { // when forced via abort don't allow CPs to override try { @@ -2579,7 +2575,7 @@ public class HRegionServer extends HasThread implements * @return True if we should break loop because cluster is going down or * this server has been stopped or hdfs has gone bad. */ - private boolean keepLooping() { + protected boolean keepLooping() { return !this.stopped && isClusterUp(); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterMetrics.java index b2688420d7..b3898bcc9a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterMetrics.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterMetrics.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase; import java.io.IOException; import java.util.EnumSet; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.atomic.AtomicInteger; @@ -146,7 +147,10 @@ public class TestClientClusterMetrics { // live servers = nums of regionservers // By default, HMaster don't carry any regions so it won't report its load. // Hence, it won't be in the server list. - Assert.assertEquals(numRs, metrics.getLiveServerMetrics().size()); + for (ServerName sn: metrics.getLiveServerMetrics().keySet()) { + System.out.println(sn.toString()); + } + Assert.assertEquals(numRs + 1 /*Master*/, metrics.getLiveServerMetrics().size()); Assert.assertTrue(metrics.getRegionCount() > 0); Assert.assertNotNull(metrics.getDeadServerNames()); Assert.assertEquals(1, metrics.getDeadServerNames().size()); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterStatus.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterStatus.java index d7e6f144f2..6bc560e8d5 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterStatus.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestClientClusterStatus.java @@ -136,7 +136,7 @@ public class TestClientClusterStatus { // live servers = nums of regionservers // By default, HMaster don't carry any regions so it won't report its load. // Hence, it won't be in the server list. - Assert.assertEquals(status.getServers().size(), numRs); + Assert.assertEquals(status.getServers().size(), numRs + 1/*Master*/); Assert.assertTrue(status.getRegionsCount() > 0); Assert.assertNotNull(status.getDeadServerNames()); Assert.assertEquals(1, status.getDeadServersSize()); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java index 717933aa76..e700a08484 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerMetrics.java @@ -23,12 +23,14 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.CompatibilityFactory; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MiniHBaseCluster; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.client.TableDescriptorBuilder; import org.apache.hadoop.hbase.test.MetricsAssertHelper; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.Bytes; @@ -74,10 +76,7 @@ public class TestAssignmentManagerMetrics { // set msgInterval to 1 second conf.setInt("hbase.regionserver.msginterval", msgInterval); - // set tablesOnMaster to none - conf.set("hbase.balancer.tablesOnMaster", "none"); - - // set client sync wait timeout to 5sec + // Set client sync wait timeout to 5sec conf.setInt("hbase.client.sync.wait.timeout.msec", 2500); conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1); conf.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 2500); @@ -123,27 +122,25 @@ public class TestAssignmentManagerMetrics { amSource); // alter table with a non-existing coprocessor - HTableDescriptor htd = new HTableDescriptor(TABLENAME); - HColumnDescriptor hcd = new HColumnDescriptor(FAMILY); - - htd.addFamily(hcd); - - String spec = "hdfs:///foo.jar|com.foo.FooRegionObserver|1001|arg1=1,arg2=2"; - htd.addCoprocessorWithSpec(spec); - + ColumnFamilyDescriptor hcd = ColumnFamilyDescriptorBuilder.newBuilder(FAMILY).build(); + TableDescriptor htd = TableDescriptorBuilder.newBuilder(TABLENAME).addColumnFamily(hcd). + addCoprocessorWithSpec("hdfs:///foo.jar|com.foo.FooRegionObserver|1001|arg1=1,arg2=2"). + build(); try { - TEST_UTIL.getAdmin().modifyTable(TABLENAME, htd); + TEST_UTIL.getAdmin().modifyTable(htd); fail("Expected region failed to open"); } catch (IOException e) { - // expected, the RS will crash and the assignment will spin forever waiting for a RS - // to assign the region. the region will not go to FAILED_OPEN because in this case - // we have just one RS and it will do one retry. + // Expected, the RS will crash and the assignment will spin forever waiting for a RS + // to assign the region. + LOG.info("Expected exception", e); } // Sleep 3 seconds, wait for doMetrics chore catching up Thread.sleep(msgInterval * 3); - metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_NAME, 1, amSource); - metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_OVER_THRESHOLD_NAME, 1, + // Two regions in RIT -- meta and the testRITAssignementManagerMetrics table region. + metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_NAME, 2, amSource); + // Both are over the threshold because no RegionServer to assign to. + metricsHelper.assertGauge(MetricsAssignmentManagerSource.RIT_COUNT_OVER_THRESHOLD_NAME, 2, amSource); } finally { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java index 8f0688654a..5fa95abe1d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestRegionsOnMasterOptions.java @@ -47,6 +47,7 @@ import static org.junit.Assert.assertTrue; * Test options for regions on master; none, system, or any (i.e. master is like any other * regionserver). Checks how regions are deployed when each of the options are enabled. * It then does kill combinations to make sure the distribution is more than just for startup. + * NOTE: System-tables only on Master doesn't work. TODO. */ @Category({MediumTests.class}) public class TestRegionsOnMasterOptions { @@ -103,8 +104,8 @@ public class TestRegionsOnMasterOptions { checkBalance(0, rsCount); } - @Ignore // Fix this. The Master startup doesn't allow Master reporting as a RegionServer, not - // until way late after the Master startup finishes. Needs more work. + @Ignore // Needs a bunch of work. We need to assign meta first and do it ahead of all others. + // This special handling messes up being able to host system tables only on Master w/o hacks. @Test public void testSystemTablesOnMaster() throws Exception { c.setBoolean(LoadBalancer.TABLES_ON_MASTER, true); diff --git a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java index 7bb4752759..37fc4614dd 100644 --- a/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java +++ b/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKUtil.java @@ -611,13 +611,13 @@ public final class ZKUtil { logRetrievedMsg(zkw, znode, data, false); return data; } catch (KeeperException.NoNodeException e) { - LOG.debug(zkw.prefix("Unable to get data of znode " + znode + " " + - "because node does not exist (not an error)")); + LOG.debug(zkw.prefix("failed to get data of " + znode + " " + + "; does not exist (not an error)")); return null; } catch (KeeperException e) { - LOG.warn(zkw.prefix("Unable to get data of znode " + znode), e); - zkw.keeperException(e); - return null; + LOG.debug(zkw.prefix("failed to get data of " + znode + "; " + e.getMessage())); + // Rethrow + throw e; } } -- 2.11.0 (Apple Git-81)