diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ClientScanner.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ClientScanner.java index bdf5791..841857d 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ClientScanner.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ClientScanner.java @@ -285,11 +285,13 @@ public class ClientScanner extends AbstractClientScanner { values = callable.withRetries(); retryAfterOutOfOrderException = true; } catch (DoNotRetryIOException e) { + // DNRIOEs are thrown to make us break out of retries. Some types of DNRIOEs want us + // to reset the scanner and come back in again. if (e instanceof UnknownScannerException) { long timeout = lastNext + scannerTimeout; - // If we are over the timeout, throw this exception to the client - // Else, it's because the region moved and we used the old id - // against the new region server; reset the scanner. + // If we are over the timeout, throw this exception to the client wrapped in + // a ScannerTimeoutException. Else, it's because the region moved and we used the old + // id against the new region server; reset the scanner. if (timeout < System.currentTimeMillis()) { long elapsed = System.currentTimeMillis() - lastNext; ScannerTimeoutException ex = new ScannerTimeoutException( @@ -299,16 +301,20 @@ public class ClientScanner extends AbstractClientScanner { throw ex; } } else { + // If exception is any but the list below throw it back to the client; else setup + // the scanner and retry. Throwable cause = e.getCause(); - if ((cause == null || (!(cause instanceof NotServingRegionException) - && !(cause instanceof RegionServerStoppedException))) && - !(e instanceof RegionServerStoppedException) && - !(e instanceof OutOfOrderScannerNextException)) { + if ((cause != null && cause instanceof NotServingRegionException) || + (cause != null && cause instanceof RegionServerStoppedException) || + e instanceof OutOfOrderScannerNextException) { + // Pass + // It is easier writing the if loop test as list of what is allowed rather than + // as a list of what is not allowed... so if in here, it means we do not throw. + } else { throw e; } } - // Else, its signal from depths of ScannerCallable that we got an - // NSRE on a next and that we need to reset the scanner. + // Else, its signal from depths of ScannerCallable that we need to reset the scanner. if (this.lastResult != null) { this.scan.setStartRow(this.lastResult.getRow()); // Skip first row returned. We already let it out on previous @@ -319,13 +325,17 @@ public class ClientScanner extends AbstractClientScanner { if (retryAfterOutOfOrderException) { retryAfterOutOfOrderException = false; } else { + // TODO: Why wrap this in a DNRIOE when it already is a DNRIOE? throw new DoNotRetryIOException("Failed after retry of " + "OutOfOrderScannerNextException: was there a rpc timeout?", e); } } - // Clear region + // Clear region. this.currentRegion = null; + // Set this to zero so we don't try and do an rpc and close on remote server when + // the exception we got was UnknownScanner or the Server is going down. callable = null; + // This continue will take us to while at end of loop where we will set up new scanner. continue; } long currentTime = System.currentTimeMillis(); diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ScannerCallable.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ScannerCallable.java index bde1588..167706b 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ScannerCallable.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ScannerCallable.java @@ -140,8 +140,7 @@ public class ScannerCallable extends ServerCallable { ScanRequest request = null; try { incRPCcallsMetrics(); - request = - RequestConverter.buildScanRequest(scannerId, caching, false, nextCallSeq); + request = RequestConverter.buildScanRequest(scannerId, caching, false, nextCallSeq); ScanResponse response = null; try { response = stub.scan(null, request); @@ -194,14 +193,24 @@ public class ScannerCallable extends ServerCallable { LOG.info("Failed to relocate region", t); } } + // The below convertion of exceptions into DoNotRetryExceptions is a little strange. + // Why not just have these exceptions implment DNRIOE you ask? Well, usually we want + // ServerCallable#withRetries to just retry when it gets these exceptions. In here in + // a scan when doing a next in particular, we want to break out and get the scanner to + // reset itself up again. Throwing a DNRIOE is how we signal this to happen (its ugly, + // yeah and hard to follow and in need of a refactor). if (ioe instanceof NotServingRegionException) { // Throw a DNRE so that we break out of cycle of calling NSRE // when what we need is to open scanner against new location. - // Attach NSRE to signal client that it needs to resetup scanner. + // Attach NSRE to signal client that it needs to re-setup scanner. if (this.scanMetrics != null) { this.scanMetrics.countOfNSRE.incrementAndGet(); } throw new DoNotRetryIOException("Resetting the scanner -- see exception cause", ioe); + } else if (ioe instanceof RegionServerStoppedException) { + // Throw a DNRE so that we break out of cycle of the retries and instead go and + // open scanner against new location. + throw new DoNotRetryIOException("Resetting the scanner -- see exception cause", ioe); } else { // The outer layers will retry throw ioe; diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/exceptions/RegionServerStoppedException.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/exceptions/RegionServerStoppedException.java index 4560a31..d1e3114 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/exceptions/RegionServerStoppedException.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/exceptions/RegionServerStoppedException.java @@ -18,6 +18,8 @@ */ package org.apache.hadoop.hbase.exceptions; +import java.io.IOException; + import org.apache.hadoop.classification.InterfaceAudience; /** @@ -25,8 +27,7 @@ import org.apache.hadoop.classification.InterfaceAudience; */ @SuppressWarnings("serial") @InterfaceAudience.Private -public class RegionServerStoppedException extends DoNotRetryIOException { - +public class RegionServerStoppedException extends IOException { public RegionServerStoppedException(String s) { super(s); } diff --git a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestClientNoCluster.java b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestClientNoCluster.java index 1ed1d96..8a541e2 100644 --- a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestClientNoCluster.java +++ b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestClientNoCluster.java @@ -52,7 +52,6 @@ public class TestClientNoCluster { // Run my HConnection overrides. Use my little HConnectionImplementation below which // allows me insert mocks and also use my Registry below rather than the default zk based // one so tests run faster and don't have zk dependency. - this.conf.set("hbase.client.connection.impl", NoClusterConnection.class.getName()); this.conf.set("hbase.client.registry.impl", SimpleRegistry.class.getName()); } @@ -90,11 +89,15 @@ public class TestClientNoCluster { @Test public void testDoNotRetryMetaScanner() throws IOException { + this.conf.set("hbase.client.connection.impl", + RegionServerStoppedOnScannerOpenConnection.class.getName()); MetaScanner.metaScan(this.conf, null); } @Test - public void testDoNotRetryOnScan() throws IOException { + public void testDoNotRetryOnScanNext() throws IOException { + this.conf.set("hbase.client.connection.impl", + RegionServerStoppedOnScannerOpenConnection.class.getName()); // Go against meta else we will try to find first region for the table on construction which // means we'll have to do a bunch more mocking. Tests that go against meta only should be // good for a bit of testing. @@ -111,13 +114,67 @@ public class TestClientNoCluster { } } + @Test + public void testRegionServerStoppedOnScannerOpen() throws IOException { + this.conf.set("hbase.client.connection.impl", + RegionServerStoppedOnScannerOpenConnection.class.getName()); + // Go against meta else we will try to find first region for the table on construction which + // means we'll have to do a bunch more mocking. Tests that go against meta only should be + // good for a bit of testing. + HTable table = new HTable(this.conf, HConstants.META_TABLE_NAME); + ResultScanner scanner = table.getScanner(HConstants.CATALOG_FAMILY); + try { + Result result = null; + while ((result = scanner.next()) != null) { + LOG.info(result); + } + } finally { + scanner.close(); + table.close(); + } + } + + /** + * Override to shutdown going to zookeeper for cluster id and meta location. + */ + static class ScanOpenNextThenExceptionThenRecoverConnection + extends HConnectionManager.HConnectionImplementation { + final ClientService.BlockingInterface stub; + + ScanOpenNextThenExceptionThenRecoverConnection(Configuration conf, + boolean managed) throws IOException { + super(conf, managed); + // Mock up my stub so open scanner returns a scanner id and then on next, we throw + // exceptions for three times and then after that, we return no more to scan. + this.stub = Mockito.mock(ClientService.BlockingInterface.class); + long sid = 12345L; + try { + Mockito.when(stub.scan((RpcController)Mockito.any(), + (ClientProtos.ScanRequest)Mockito.any())). + thenReturn(ClientProtos.ScanResponse.newBuilder().setScannerId(sid).build()). + thenThrow(new ServiceException(new RegionServerStoppedException("From Mockito"))). + thenReturn(ClientProtos.ScanResponse.newBuilder().setScannerId(sid). + setMoreResults(false).build()); + } catch (ServiceException e) { + throw new IOException(e); + } + } + + @Override + public BlockingInterface getClient(ServerName sn) throws IOException { + return this.stub; + } + } + /** * Override to shutdown going to zookeeper for cluster id and meta location. */ - static class NoClusterConnection extends HConnectionManager.HConnectionImplementation { + static class RegionServerStoppedOnScannerOpenConnection + extends HConnectionManager.HConnectionImplementation { final ClientService.BlockingInterface stub; - NoClusterConnection(Configuration conf, boolean managed) throws IOException { + RegionServerStoppedOnScannerOpenConnection(Configuration conf, boolean managed) + throws IOException { super(conf, managed); // Mock up my stub so open scanner returns a scanner id and then on next, we throw // exceptions for three times and then after that, we return no more to scan.