diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 1e7fce6..62f4979 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1317,6 +1317,23 @@ public static final boolean TIMELINE_SERVICE_HTTP_CROSS_ORIGIN_ENABLED_DEFAULT = false; + /** Timeline client settings */ + public static final String TIMELINE_SERVICE_CLIENT_PREFIX = + TIMELINE_SERVICE_PREFIX + "client."; + + /** Timeline client RESTful call, max retries (-1 means no limit) */ + public static final String TIMELINE_SERVICE_CLIENT_MAX_RETRIES = + TIMELINE_SERVICE_CLIENT_PREFIX + "max-retries"; + + public static final int DEFAULT_TIMELINE_SERVICE_CLIENT_MAX_RETRIES = 30; + + /** Timeline client RESTful call, retry interval */ + public static final String TIMELINE_SERVICE_CLIENT_RETRY_INTERVAL_MS = + TIMELINE_SERVICE_CLIENT_PREFIX + "retry-interval-ms"; + + public static final long + DEFAULT_TIMELINE_SERVICE_CLIENT_RETRY_INTERVAL_MS = 1000; + // /////////////////////////////// // Shared Cache Configs // /////////////////////////////// diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/api/impl/TimelineClientImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/api/impl/TimelineClientImpl.java index fbddd14..ea06633 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/api/impl/TimelineClientImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/api/impl/TimelineClientImpl.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.IOException; +import java.net.ConnectException; import java.net.HttpURLConnection; import java.net.URI; import java.net.URL; @@ -68,7 +69,10 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Joiner; import com.sun.jersey.api.client.Client; +import com.sun.jersey.api.client.filter.ClientFilter; import com.sun.jersey.api.client.ClientResponse; +import com.sun.jersey.api.client.ClientRequest; +import com.sun.jersey.api.client.ClientHandlerException; import com.sun.jersey.api.client.WebResource; import com.sun.jersey.api.client.config.ClientConfig; import com.sun.jersey.api.client.config.DefaultClientConfig; @@ -102,6 +106,93 @@ private URI resURI; private boolean isEnabled; private KerberosAuthenticatedURLConnectionFactory urlFactory; + private TimelineJerseyRetryFilter retryFilter; + + static class TimelineJerseyRetryFilter extends ClientFilter { + // maxRetries < 0 means keep trying + @Private + @VisibleForTesting + public int maxRetries; + + @Private + @VisibleForTesting + public long retryInterval; + + // Indicates if retries happened last time + @Private + @VisibleForTesting + public boolean retried = false; + + // Constructor with default retry settings + public TimelineJerseyRetryFilter(Configuration conf) { + super(); + maxRetries = conf.getInt( + YarnConfiguration.TIMELINE_SERVICE_CLIENT_MAX_RETRIES, + YarnConfiguration.DEFAULT_TIMELINE_SERVICE_CLIENT_MAX_RETRIES); + retryInterval = conf.getLong( + YarnConfiguration.TIMELINE_SERVICE_CLIENT_RETRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_TIMELINE_SERVICE_CLIENT_RETRY_INTERVAL_MS); + } + + // Customize retry settings synchronously + @Private + @VisibleForTesting + public synchronized void changeRetrySettings(int maxRetries, long interval) { + this.maxRetries = maxRetries; + this.retryInterval = interval; + } + + @Override + public ClientResponse handle(ClientRequest cr) + throws ClientHandlerException { + // synchronously get a snapshot of current retry settings + int leftRetries = 0; + long sleepMs; + retried = false; + synchronized (this) { + leftRetries = maxRetries; + sleepMs = retryInterval; + } + // keep trying + while (true) { + try { + // try pass the request on, if fail, keep retrying + return getNext().handle(cr); + } catch (ClientHandlerException e) { + // break if there's no retries left + if (leftRetries == 0) { + break; + } + if(e.getCause() instanceof ConnectException) { + if (leftRetries > 0) { + LOG.info("Connection Timeout (" + cr.getURI() + "), will try " + + leftRetries + " more time(s)."); + } else { + // note that maxRetries may be -1 at the very beginning + // maxRetries = -1 means keep trying + LOG.info("Connection Timeout (" + cr.getURI() + + "), will keep retrying."); + } + retried = true; + } else { + throw e; + } + } + if (leftRetries > 0) { + leftRetries--; + } + try { + // sleep for the given time interval + Thread.sleep(retryInterval); + } catch (InterruptedException ie) { + LOG.warn("Client retry sleep interrupted! "); + } + } + throw new ClientHandlerException("Failed to connect to timeline server. " + + "Connection retries limit exceeded. " + + "The posted timeline event may be missing"); + }; + } public TimelineClientImpl() { super(TimelineClientImpl.class.getName()); @@ -125,6 +216,8 @@ protected void serviceInit(Configuration conf) throws Exception { client = new Client(new URLConnectionClientHandler( new PseudoAuthenticatedURLConnectionFactory(connConfigurator)), cc); } + retryFilter = new TimelineJerseyRetryFilter(conf); + client.addFilter(retryFilter); if (YarnConfiguration.useHttps(conf)) { resURI = URI .create(JOINER.join("https://", conf.get( @@ -208,6 +301,12 @@ private ClientResponse doPosting(Object obj, String path) throws IOException, Ya @Private @VisibleForTesting + public TimelineJerseyRetryFilter getRetryFilter() { + return retryFilter; + } + + @Private + @VisibleForTesting public ClientResponse doPostingObject(Object object, String path) { WebResource webResource = client.resource(resURI); if (path == null) { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestTimelineClient.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestTimelineClient.java index 1301556..277b8b4 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestTimelineClient.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestTimelineClient.java @@ -181,6 +181,26 @@ public void testPutDomainConnectionRefused() throws Exception { } } + @Test + public void testCheckRetryCount() throws Exception { + int newMaxRetries = 1; + long newIntervalMs = 1500; + client.getRetryFilter().changeRetrySettings(newMaxRetries, newIntervalMs); + try { + // this call should block a while + TimelinePutResponse response = client.putEntities(generateEntity()); + Assert.fail("Exception expected!" + + "Timeline server should be off to run this test. "); + } catch (ClientHandlerException ce) { + Assert.assertTrue( + "Handler exception for reason other than retry: " + ce.getMessage(), + ce.getMessage().contains("Connection retries limit exceeded")); + // we would expect this exception here, check if the client has retried + Assert.assertTrue("Retry filter didn't perform any retries! ", client + .getRetryFilter().retried); + } + } + private static ClientResponse mockEntityClientResponse( TimelineClientImpl client, ClientResponse.Status status, boolean hasError, boolean hasRuntimeError) {