From f8ea6bfb6468f9622bbeae6d95a0fc2d36f0651c Mon Sep 17 00:00:00 2001 From: Elliott Clark Date: Wed, 15 Apr 2015 13:49:55 -0700 Subject: [PATCH] HBASE-13477 Create metrics on failed requests Summary: Add metrics on how many requests are exceptions and what type. Test Plan: behold unit tests. Differential Revision: https://reviews.facebook.net/D37167 --- .../hadoop/hbase/ipc/MetricsHBaseServerSource.java | 22 +++++++++ .../hbase/ipc/MetricsHBaseServerSourceImpl.java | 52 ++++++++++++++++++++++ .../hadoop/hbase/ipc/MetricsHBaseServer.java | 34 ++++++++++++++ .../org/apache/hadoop/hbase/ipc/RpcServer.java | 15 +++++-- .../hadoop/hbase/regionserver/RSRpcServices.java | 4 ++ .../apache/hadoop/hbase/ipc/TestRpcMetrics.java | 18 ++++++++ 6 files changed, 142 insertions(+), 3 deletions(-) diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServerSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServerSource.java index 1f4c950..482fdba 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServerSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServerSource.java @@ -58,6 +58,16 @@ public interface MetricsHBaseServerSource extends BaseSource { String NUM_ACTIVE_HANDLER_NAME = "numActiveHandler"; String NUM_ACTIVE_HANDLER_DESC = "Number of active rpc handlers."; + String EXCEPTIONS_NAME="exceptions"; + String EXCEPTIONS_DESC="Exceptions caused by requests"; + String EXCEPTIONS_TYPE_DESC="Number of requests that resulted in the specified type of Exception"; + String EXCEPTIONS_OOO_NAME="exceptions.OutOfOrderScannerNextException"; + String EXCEPTIONS_BUSY_NAME="exceptions.RegionTooBusyException"; + String EXCEPTIONS_UNKNOWN_NAME="exceptions.UnknownScannerException"; + String EXCEPTIONS_SANITY_NAME="exceptions.FailedSanityCheckException"; + String EXCEPTIONS_MOVED_NAME="exceptions.RegionMovedException"; + String EXCEPTIONS_NSRE_NAME="exceptions.NotServingRegionException"; + void authorizationSuccess(); void authorizationFailure(); @@ -66,6 +76,18 @@ public interface MetricsHBaseServerSource extends BaseSource { void authenticationFailure(); + void exception(); + + /** + * Different types of exceptions + */ + void outOfOrderException(); + void failedSanityException(); + void movedRegionException(); + void notServingRegionException(); + void unknownScannerException(); + void tooBusyException(); + void sentBytes(long count); void receivedBytes(int count); diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServerSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServerSourceImpl.java index 8eefb08..7fa27bf 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServerSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServerSourceImpl.java @@ -38,6 +38,16 @@ public class MetricsHBaseServerSourceImpl extends BaseSourceImpl private final MutableCounterLong authenticationFailures; private final MutableCounterLong sentBytes; private final MutableCounterLong receivedBytes; + + private final MutableCounterLong exceptions; + private final MutableCounterLong exceptionsOOO; + private final MutableCounterLong exceptionsBusy; + private final MutableCounterLong exceptionsUnknown; + private final MutableCounterLong exceptionsSanity; + private final MutableCounterLong exceptionsNSRE; + private final MutableCounterLong exceptionsMoved; + + private MutableHistogram queueCallTime; private MutableHistogram processCallTime; private MutableHistogram totalCallTime; @@ -54,6 +64,13 @@ public class MetricsHBaseServerSourceImpl extends BaseSourceImpl AUTHORIZATION_SUCCESSES_DESC, 0l); this.authorizationFailures = this.getMetricsRegistry().newCounter(AUTHORIZATION_FAILURES_NAME, AUTHORIZATION_FAILURES_DESC, 0l); + this.exceptions = this.getMetricsRegistry().newCounter(EXCEPTIONS_NAME,EXCEPTIONS_DESC, 0l); + this.exceptionsOOO = this.getMetricsRegistry().newCounter(EXCEPTIONS_OOO_NAME,EXCEPTIONS_TYPE_DESC, 0l); + this.exceptionsBusy = this.getMetricsRegistry().newCounter(EXCEPTIONS_BUSY_NAME,EXCEPTIONS_TYPE_DESC, 0l); + this.exceptionsUnknown = this.getMetricsRegistry().newCounter(EXCEPTIONS_UNKNOWN_NAME,EXCEPTIONS_TYPE_DESC, 0l); + this.exceptionsSanity = this.getMetricsRegistry().newCounter(EXCEPTIONS_SANITY_NAME,EXCEPTIONS_TYPE_DESC, 0l); + this.exceptionsMoved = this.getMetricsRegistry().newCounter(EXCEPTIONS_MOVED_NAME,EXCEPTIONS_TYPE_DESC, 0l); + this.exceptionsNSRE = this.getMetricsRegistry().newCounter(EXCEPTIONS_NSRE_NAME,EXCEPTIONS_TYPE_DESC, 0l); this.authenticationSuccesses = this.getMetricsRegistry().newCounter( AUTHENTICATION_SUCCESSES_NAME, AUTHENTICATION_SUCCESSES_DESC, 0l); @@ -87,6 +104,41 @@ public class MetricsHBaseServerSourceImpl extends BaseSourceImpl } @Override + public void exception() { + exceptions.incr(); + } + + @Override + public void outOfOrderException() { + exceptionsOOO.incr(); + } + + @Override + public void failedSanityException() { + exceptionsSanity.incr(); + } + + @Override + public void movedRegionException() { + exceptionsMoved.incr(); + } + + @Override + public void notServingRegionException() { + exceptionsNSRE.incr(); + } + + @Override + public void unknownScannerException() { + exceptionsUnknown.incr(); + } + + @Override + public void tooBusyException() { + exceptionsBusy.incr(); + } + + @Override public void authenticationSuccess() { authenticationSuccesses.incr(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServer.java index 825e688..94f4143 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/MetricsHBaseServer.java @@ -19,8 +19,14 @@ package org.apache.hadoop.hbase.ipc; +import org.apache.hadoop.hbase.NotServingRegionException; +import org.apache.hadoop.hbase.RegionTooBusyException; +import org.apache.hadoop.hbase.UnknownScannerException; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.CompatibilitySingletonFactory; +import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException; +import org.apache.hadoop.hbase.exceptions.OutOfOrderScannerNextException; +import org.apache.hadoop.hbase.exceptions.RegionMovedException; @InterfaceAudience.Private public class MetricsHBaseServer { @@ -67,6 +73,34 @@ public class MetricsHBaseServer { source.queuedAndProcessedCall(totalTime); } + public void exception(Throwable throwable) { + source.exception(); + + /** + * Keep some metrics for commonly seen exceptions + * + * Try and put the most common types first. + * Place child types before the parent type that they extend. + * + * If this gets much larger we might have to go to a hashmap + */ + if (throwable != null) { + if (throwable instanceof OutOfOrderScannerNextException) { + source.outOfOrderException(); + } else if (throwable instanceof RegionTooBusyException) { + source.tooBusyException(); + } else if (throwable instanceof UnknownScannerException) { + source.unknownScannerException(); + } else if (throwable instanceof RegionMovedException) { + source.movedRegionException(); + } else if (throwable instanceof NotServingRegionException ) { + source.notServingRegionException(); + } else if (throwable instanceof FailedSanityCheckException) { + source.failedSanityException(); + } + } + } + public MetricsHBaseServerSource getMetricsSource() { return source; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java index c69a187..4ba7d33 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java @@ -158,6 +158,8 @@ import com.google.protobuf.TextFormat; @InterfaceStability.Evolving public class RpcServer implements RpcServerInterface { public static final Log LOG = LogFactory.getLog(RpcServer.class); + private static final CallQueueTooBigException CALL_QUEUE_TOO_BIG_EXCEPTION + = new CallQueueTooBigException(); private final boolean authorize; private boolean isSecurityEnabled; @@ -1787,9 +1789,10 @@ public class RpcServer implements RpcServerInterface { new Call(id, this.service, null, null, null, null, this, responder, totalRequestSize, null, null); ByteArrayOutputStream responseBuffer = new ByteArrayOutputStream(); - setupResponse(responseBuffer, callTooBig, new CallQueueTooBigException(), - "Call queue is full on " + getListenerAddress() + - ", is hbase.ipc.server.max.callqueue.size too small?"); + metrics.exception(CALL_QUEUE_TOO_BIG_EXCEPTION); + setupResponse(responseBuffer, callTooBig, CALL_QUEUE_TOO_BIG_EXCEPTION, + "Call queue is full on " + getListenerAddress() + + ", is hbase.ipc.server.max.callqueue.size too small?"); responder.doRespond(callTooBig); return; } @@ -1819,6 +1822,8 @@ public class RpcServer implements RpcServerInterface { getHostAddress(); LOG.warn(msg, t); + metrics.exception(t); + // probably the hbase hadoop version does not match the running hadoop version if (t instanceof LinkageError) { t = new DoNotRetryIOException(t); @@ -2140,6 +2145,10 @@ public class RpcServer implements RpcServerInterface { // putting it on the wire. Its needed to adhere to the pb Service Interface but we don't // need to pass it over the wire. if (e instanceof ServiceException) e = e.getCause(); + + // increment the number of requests that were exceptions. + metrics.exception(e); + if (e instanceof LinkageError) throw new DoNotRetryIOException(e); if (e instanceof IOException) throw (IOException)e; LOG.error("Unexpected throwable object ", e); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java index f9b8d61..050b108 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java @@ -552,6 +552,7 @@ public class RSRpcServices implements HBaseRPCErrorHandler, .setName(result.getClass().getName()) .setValue(result.toByteString()))); } catch (IOException ioe) { + rpcServer.getMetrics().exception(ioe); resultOrExceptionBuilder.setException(ResponseConverter.buildException(ioe)); } } else if (action.hasMutation()) { @@ -601,6 +602,7 @@ public class RSRpcServices implements HBaseRPCErrorHandler, // case the corresponding ResultOrException instance for the Put or Delete will be added // down in the doBatchOp method call rather than up here. } catch (IOException ie) { + rpcServer.getMetrics().exception(ie); resultOrExceptionBuilder = ResultOrException.newBuilder(). setException(ResponseConverter.buildException(ie)); } @@ -1897,6 +1899,7 @@ public class RSRpcServices implements HBaseRPCErrorHandler, region = getRegion(regionAction.getRegion()); quota = getQuotaManager().checkQuota(region, regionAction.getActionList()); } catch (IOException e) { + rpcServer.getMetrics().exception(e); regionActionResultBuilder.setException(ResponseConverter.buildException(e)); responseBuilder.addRegionActionResult(regionActionResultBuilder.build()); continue; // For this region it's a failure. @@ -1927,6 +1930,7 @@ public class RSRpcServices implements HBaseRPCErrorHandler, processed = Boolean.TRUE; } } catch (IOException e) { + rpcServer.getMetrics().exception(e); // As it's atomic, we may expect it's a global failure. regionActionResultBuilder.setException(ResponseConverter.buildException(e)); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/ipc/TestRpcMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/ipc/TestRpcMetrics.java index c2b0344..6bb97fd 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/ipc/TestRpcMetrics.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/ipc/TestRpcMetrics.java @@ -20,6 +20,11 @@ package org.apache.hadoop.hbase.ipc; import org.apache.hadoop.hbase.CompatibilityFactory; +import org.apache.hadoop.hbase.NotServingRegionException; +import org.apache.hadoop.hbase.RegionTooBusyException; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.exceptions.OutOfOrderScannerNextException; +import org.apache.hadoop.hbase.exceptions.RegionMovedException; import org.apache.hadoop.hbase.testclassification.RPCTests; import org.apache.hadoop.hbase.testclassification.SmallTests; import org.apache.hadoop.hbase.test.MetricsAssertHelper; @@ -113,6 +118,19 @@ public class TestRpcMetrics { HELPER.assertCounter("sentBytes", 309, serverSource); HELPER.assertCounter("receivedBytes", 208, serverSource); + + mrpc.exception(null); + HELPER.assertCounter("exceptions", 1, serverSource); + + mrpc.exception(new RegionMovedException(ServerName.parseServerName("localhost:60020"), 100)); + mrpc.exception(new RegionTooBusyException()); + mrpc.exception(new OutOfOrderScannerNextException()); + mrpc.exception(new NotServingRegionException()); + HELPER.assertCounter("exceptions.RegionMovedException", 1, serverSource); + HELPER.assertCounter("exceptions.RegionTooBusyException", 1, serverSource); + HELPER.assertCounter("exceptions.OutOfOrderScannerNextException", 1, serverSource); + HELPER.assertCounter("exceptions.NotServingRegionException", 1, serverSource); + HELPER.assertCounter("exceptions", 5, serverSource); } } -- 2.3.0