diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/ServiceScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/ServiceScheduler.java
index 0fcca165099..9e6c42942a6 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/ServiceScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/ServiceScheduler.java
@@ -519,6 +519,9 @@ private void createAllComponents() {
     for (org.apache.hadoop.yarn.service.api.records.Component compSpec : sortedComponents) {
       Component component = new Component(compSpec, allocateId, context);
+      if (component.isHealthThresholdMonitorEnabled()) {
+        addService(component.getHealthThresholdMonitor());
+      }
       componentsById.put(allocateId, component);
       componentsByName.put(component.getName(), component);
       allocateId++;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/Component.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/Component.java
index 39897f62011..9f9001b32d1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/Component.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/Component.java
@@ -45,6 +45,7 @@
 import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
 import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEvent;
 import org.apache.hadoop.yarn.service.component.instance.ComponentInstanceId;
+import org.apache.hadoop.yarn.service.monitor.ComponentHealthThresholdMonitor;
 import org.apache.hadoop.yarn.service.monitor.probe.MonitorUtils;
 import org.apache.hadoop.yarn.service.monitor.probe.Probe;
 import org.apache.hadoop.yarn.service.provider.ProviderUtils;
@@ -69,6 +70,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -78,7 +80,7 @@
 import static org.apache.hadoop.yarn.service.component.ComponentEventType.*;
 import static org.apache.hadoop.yarn.service.component.ComponentState.*;
 import static org.apache.hadoop.yarn.service.component.instance.ComponentInstanceEventType.*;
-import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.CONTAINER_FAILURE_THRESHOLD;
+import static org.apache.hadoop.yarn.service.conf.YarnServiceConf.*;

 public class Component implements EventHandler {
   private static final Logger LOG = LoggerFactory.getLogger(Component.class);
@@ -104,6 +106,8 @@
   // The number of containers failed since last reset. This excludes preempted,
   // disk_failed containers etc. This will be reset to 0 periodically.
   public AtomicInteger currentContainerFailure = new AtomicInteger(0);
+  private boolean healthThresholdMonitorEnabled = false;
+  private ComponentHealthThresholdMonitor healthThresholdMonitor;

   private StateMachine stateMachine;
@@ -180,6 +184,7 @@ public Component(
         .getPropertyInt(CONTAINER_FAILURE_THRESHOLD, 10);
     createNumCompInstances(component.getNumberOfContainers());
     setDesiredContainers(component.getNumberOfContainers().intValue());
+    checkAndCreateHealthThresholdMonitor();
   }

   private void createNumCompInstances(long count) {
@@ -197,6 +202,67 @@ private void createOneCompInstance() {
     pendingInstances.add(instance);
   }

+  private void checkAndCreateHealthThresholdMonitor() {
+    // Determine health threshold percent
+    int healthThresholdPercent = componentSpec.getConfiguration()
+        .getPropertyInt(CONTAINER_HEALTH_THRESHOLD_PERCENT,
+            DEFAULT_CONTAINER_HEALTH_THRESHOLD_PERCENT);
+    // Validations
+    if (healthThresholdPercent == CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED) {
+      LOG.info("No health threshold monitor enabled for component {}",
+          componentSpec.getName());
+      return;
+    }
+    // If threshold is set to outside acceptable range then don't enable monitor
+    if (healthThresholdPercent <= 0 || healthThresholdPercent > 100) {
+      LOG.error(
+          "Invalid health threshold percent {}% for component {}. Monitor not "
+              + "enabled.",
+          healthThresholdPercent, componentSpec.getName());
+      return;
+    }
+    // Determine the threshold properties
+    long window = componentSpec.getConfiguration().getPropertyLong(
+        CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC,
+        DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC);
+    long initDelay = componentSpec.getConfiguration().getPropertyLong(
+        CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC,
+        DEFAULT_CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC);
+    long pollFrequency = componentSpec.getConfiguration().getPropertyLong(
+        CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC,
+        DEFAULT_CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC);
+    // Validations
+    if (window <= 0) {
+      LOG.error(
+          "Invalid health monitor window {} secs for component {}. Monitor not "
+              + "enabled.",
+          window, componentSpec.getName());
+      return;
+    }
+    if (initDelay < 0) {
+      LOG.error("Invalid health monitor init delay {} secs for component {}. "
+          + "Monitor not enabled.", initDelay, componentSpec.getName());
+      return;
+    }
+    if (pollFrequency <= 0) {
+      LOG.error(
+          "Invalid health monitor poll frequency {} secs for component {}. "
+              + "Monitor not enabled.",
+          pollFrequency, componentSpec.getName());
+      return;
+    }
+    LOG.info(
+        "Scheduling the health threshold monitor for component {} with percent "
+            + "= {}%, window = {} secs, poll freq = {} secs, init-delay = {} "
+            + "secs",
+        componentSpec.getName(), healthThresholdPercent, window, pollFrequency,
+        initDelay);
+    healthThresholdMonitor = new ComponentHealthThresholdMonitor(this,
+        healthThresholdPercent, window, initDelay, pollFrequency,
+        TimeUnit.SECONDS);
+    setHealthThresholdMonitorEnabled(true);
+  }
+
   private static class FlexComponentTransition implements
       MultipleArcTransition {
     // For flex up, go to FLEXING state
@@ -759,4 +825,22 @@ public ServiceContext getContext() {
   public List getPendingInstances() {
     return pendingInstances;
   }
+
+  public boolean isHealthThresholdMonitorEnabled() {
+    return healthThresholdMonitorEnabled;
+  }
+
+  public void setHealthThresholdMonitorEnabled(
+      boolean healthThresholdMonitorEnabled) {
+    this.healthThresholdMonitorEnabled = healthThresholdMonitorEnabled;
+  }
+
+  public ComponentHealthThresholdMonitor getHealthThresholdMonitor() {
+    return healthThresholdMonitor;
+  }
+
+  public void setHealthThresholdMonitor(
+      ComponentHealthThresholdMonitor healthThresholdMonitor) {
+    this.healthThresholdMonitor = healthThresholdMonitor;
+  }
 }
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java
index 0e3e11bc72e..f208a389294 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java
@@ -229,8 +229,11 @@ public void transition(ComponentInstance compInstance,
       }
       compInstance.component.decRunningContainers();
       boolean shouldExit = false;
-      // check if it exceeds the failure threshold
-      if (comp.currentContainerFailure.get() > comp.maxContainerFailurePerComp) {
+      // Check if it exceeds the failure threshold, but only if health threshold
+      // monitor is not enabled
+      if (!comp.isHealthThresholdMonitorEnabled()
+          && comp.currentContainerFailure
+              .get() > comp.maxContainerFailurePerComp) {
         String exitDiag = MessageFormat.format(
             "[COMPONENT {0}]: Failed {1} times, exceeded the limit - {2}. Shutting down now... "
                 + System.lineSeparator(),
" + System.lineSeparator(), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/conf/YarnServiceConf.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/conf/YarnServiceConf.java index 14c4d1585d1..fdefb9ad908 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/conf/YarnServiceConf.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/conf/YarnServiceConf.java @@ -103,6 +103,57 @@ public static final String DEPENDENCY_TARBALL_PATH = YARN_SERVICE_PREFIX + "framework.path"; + public static final String YARN_SERVICE_CONTAINER_HEALTH_THRESHOLD_PREFIX = + YARN_SERVICE_PREFIX + "container-health-threshold."; + + /** + * The container health threshold percent when explicitly set for a specific + * component or globally for all components, will schedule a health check + * monitor to periodically check for the percentage of healthy containers. It + * runs the check at a specified/default poll frequency. It allows a component + * to be below the health threshold for a specified/default window after which + * it considers the service to be unhealthy and triggers a service stop. When + * health threshold percent is enabled, CONTAINER_FAILURE_THRESHOLD is + * ignored. + */ + public static final String CONTAINER_HEALTH_THRESHOLD_PERCENT = + YARN_SERVICE_CONTAINER_HEALTH_THRESHOLD_PREFIX + "percent"; + /** + * Health check monitor poll frequency. It is an advanced setting and does not + * need to be set unless the service owner understands the implication and + * does not want the default. + */ + public static final String CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC = + YARN_SERVICE_CONTAINER_HEALTH_THRESHOLD_PREFIX + "poll-frequency-secs"; + /** + * The amount of time the health check monitor allows a specific component to + * be below the health threshold after which it considers the service to be + * unhealthy. + */ + public static final String CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC = + YARN_SERVICE_CONTAINER_HEALTH_THRESHOLD_PREFIX + "window-secs"; + /** + * The amount of initial time the health check monitor waits before the first + * check kicks in. It gives a lead time for the service containers to come up + * for the first time. + */ + public static final String CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC = + YARN_SERVICE_CONTAINER_HEALTH_THRESHOLD_PREFIX + "init-delay-secs"; + /** + * By default the health threshold percent does not come into play until it is + * explicitly set in resource config for a specific component or globally for + * all components. -1 signifies disabled. + */ + public static final int CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED = -1; + + public static final int DEFAULT_CONTAINER_HEALTH_THRESHOLD_PERCENT = + CONTAINER_HEALTH_THRESHOLD_PERCENT_DISABLED; + public static final long DEFAULT_CONTAINER_HEALTH_THRESHOLD_POLL_FREQUENCY_SEC = 10; + public static final long DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC = 600; + // The default for initial delay is same as default health window + public static final long DEFAULT_CONTAINER_HEALTH_THRESHOLD_INIT_DELAY_SEC = + DEFAULT_CONTAINER_HEALTH_THRESHOLD_WINDOW_SEC; + /** * Get long value for the property. 
First get from the userConf, if not * present, get from systemConf. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/monitor/ComponentHealthThresholdMonitor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/monitor/ComponentHealthThresholdMonitor.java new file mode 100644 index 00000000000..679b747a718 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/monitor/ComponentHealthThresholdMonitor.java @@ -0,0 +1,189 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.service.monitor; + +import java.util.Date; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.service.CompositeService; +import org.apache.hadoop.util.ExitUtil; +import org.apache.hadoop.yarn.service.component.Component; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Monitors the health of containers of a specific component at a regular + * interval. It takes necessary actions when the health of a component drops + * below a desired threshold. + */ +public class ComponentHealthThresholdMonitor extends CompositeService { + private static final Logger LOG = LoggerFactory + .getLogger(ComponentHealthThresholdMonitor.class); + private ScheduledExecutorService executorService; + private static final String NAME_PREFIX = "ComponentHealthThresholdMonitor-"; + private final Component component; + private final int healthThresholdPercent; + private final long healthThresholdWindowSecs; + private final long initialDelay; + private final long interval; + private final TimeUnit timeUnit; + private final long healthThresholdWindowNanos; + private long firstOccurrenceTimestamp = 0; + // Sufficient logging happens when component health is below threshold. + // However, there has to be some logging when it is above threshold, otherwise + // service owners have no idea how the health is fluctuating. So let's log + // whenever there is a change in component health, thereby preventing + // excessive logging on every poll. 
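The constants above only name the knobs; how they come together in a service spec is easiest to see in code. The sketch below is not part of the patch: the service/component names are invented, and it assumes the usual yarn-service behavior that a component-level property takes precedence over the service-level (global) one. It builds a spec that enables the monitor globally and tightens the threshold for one component, using the same `org.apache.hadoop.yarn.service.api.records` classes the new test further down relies on.

```java
import org.apache.hadoop.yarn.service.api.records.Component;
import org.apache.hadoop.yarn.service.api.records.Configuration;
import org.apache.hadoop.yarn.service.api.records.Service;

public class HealthThresholdSpecSketch {
  static Service buildSpec() {
    // Service-level (global) settings: assumed to apply to every component
    // that does not override them.
    Configuration global = new Configuration();
    global.setProperty("yarn.service.container-health-threshold.percent", "80");
    global.setProperty("yarn.service.container-health-threshold.window-secs",
        "300");

    Service service = new Service();
    service.setName("example-app");
    service.setVersion("1.0.0");
    service.setConfiguration(global);

    // This component tightens the threshold; unset values fall back to the
    // global/default ones (poll frequency 10 secs, init delay 600 secs).
    Component web = new Component();
    web.setName("web");
    web.setNumberOfContainers(2L);
    web.setLaunchCommand("sleep 3600");
    Configuration webConf = new Configuration();
    webConf.setProperty("yarn.service.container-health-threshold.percent",
        "95");
    web.setConfiguration(webConf);
    service.addComponent(web);
    return service;
  }
}
```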
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/monitor/ComponentHealthThresholdMonitor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/monitor/ComponentHealthThresholdMonitor.java
new file mode 100644
index 00000000000..679b747a718
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/monitor/ComponentHealthThresholdMonitor.java
@@ -0,0 +1,189 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.service.monitor;
+
+import java.util.Date;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.service.CompositeService;
+import org.apache.hadoop.util.ExitUtil;
+import org.apache.hadoop.yarn.service.component.Component;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Monitors the health of containers of a specific component at a regular
+ * interval. It takes necessary actions when the health of a component drops
+ * below a desired threshold.
+ */
+public class ComponentHealthThresholdMonitor extends CompositeService {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ComponentHealthThresholdMonitor.class);
+  private ScheduledExecutorService executorService;
+  private static final String NAME_PREFIX = "ComponentHealthThresholdMonitor-";
+  private final Component component;
+  private final int healthThresholdPercent;
+  private final long healthThresholdWindowSecs;
+  private final long initialDelay;
+  private final long interval;
+  private final TimeUnit timeUnit;
+  private final long healthThresholdWindowNanos;
+  private long firstOccurrenceTimestamp = 0;
+  // Sufficient logging happens when component health is below threshold.
+  // However, there has to be some logging when it is above threshold, otherwise
+  // service owners have no idea how the health is fluctuating. So let's log
+  // whenever there is a change in component health, thereby preventing
+  // excessive logging on every poll.
+  private float prevRunningContainerFraction = 0;
+
+  public ComponentHealthThresholdMonitor(Component component,
+      int healthThresholdPercent, long healthThresholdWindowSecs,
+      long initialDelay, long interval, TimeUnit timeUnit) {
+    super(NAME_PREFIX + component.getName());
+    this.component = component;
+    this.healthThresholdPercent = healthThresholdPercent;
+    this.healthThresholdWindowSecs = healthThresholdWindowSecs;
+    this.initialDelay = initialDelay;
+    this.interval = interval;
+    this.timeUnit = timeUnit;
+    this.healthThresholdWindowNanos = TimeUnit.NANOSECONDS
+        .convert(healthThresholdWindowSecs, TimeUnit.SECONDS);
+    addService(this);
+  }
+
+  @Override
+  public void serviceInit(Configuration conf) throws Exception {
+    executorService = Executors.newScheduledThreadPool(1);
+    super.serviceInit(conf);
+  }
+
+  @Override
+  public void serviceStart() throws Exception {
+    executorService.scheduleAtFixedRate(new ContainerHealthChecker(),
+        initialDelay, interval, timeUnit);
+  }
+
+  @Override
+  public void serviceStop() throws Exception {
+    if (executorService != null) {
+      executorService.shutdownNow();
+    }
+  }
+
+  private class ContainerHealthChecker implements Runnable {
+    @Override
+    public void run() {
+      LOG.debug(
+          "ComponentHealthThresholdMonitor.ContainerHealthChecker run method");
+      // Perform container health checks against desired threshold
+      long desiredContainerCount = component.getNumDesiredInstances();
+      // If desired container count for this component is 0 then nothing to do
+      if (desiredContainerCount == 0) {
+        return;
+      }
+      long runningContainerCount = component.getNumRunningInstances();
+      float thresholdFraction = (float) healthThresholdPercent / 100;
+      // No possibility of div by 0 since desiredContainerCount won't be 0 here
+      float runningContainerFraction = (float) runningContainerCount
+          / desiredContainerCount;
+      boolean healthChanged = false;
+      if (Math.abs(
+          runningContainerFraction - prevRunningContainerFraction) > .0000001) {
+        prevRunningContainerFraction = runningContainerFraction;
+        healthChanged = true;
+      }
+      String runningContainerPercentStr = String.format("%.2f",
+          runningContainerFraction * 100);
+      // Check if the current running container percent is less than the
+      // threshold percent
+      if (runningContainerFraction < thresholdFraction) {
+        // Check if it is the first occurrence and if yes set the timestamp
+        long currentTimestamp = System.nanoTime();
+        if (firstOccurrenceTimestamp == 0) {
+          firstOccurrenceTimestamp = currentTimestamp;
+          LOG.info(
+              "[Component {}] Health is going below threshold for the first "
+                  + "time at ts = {}",
+              component.getName(), (new Date()).getTime());
+        }
+        long elapsedTime = currentTimestamp - firstOccurrenceTimestamp;
+        long elapsedTimeSecs = TimeUnit.SECONDS.convert(elapsedTime,
+            TimeUnit.NANOSECONDS);
+        LOG.warn(
+            "[Component {}] Current health {}% is below health threshold of "
+                + "{}% for {} secs (threshold window = {} secs)",
+            component.getName(), runningContainerPercentStr,
+            healthThresholdPercent, elapsedTimeSecs, healthThresholdWindowSecs);
+        if (elapsedTime > healthThresholdWindowNanos) {
+          LOG.warn(
+              "[Component {}] Current health {}% has been below health "
+                  + "threshold of {}% for {} secs (threshold window = {} secs)",
+              component.getName(), runningContainerPercentStr,
+              healthThresholdPercent, elapsedTimeSecs,
+              healthThresholdWindowSecs);
+          // Trigger service stop
+          String exitDiag = String.format(
+              "Service is being killed because container health for component "
+                  + "%s was %s%% (health threshold = %d%%) for %d secs "
+                  + "(threshold window = %d secs)",
+              component.getName(), runningContainerPercentStr,
+              healthThresholdPercent, elapsedTimeSecs,
+              healthThresholdWindowSecs);
+          // Append to global diagnostics that will be reported to RM.
+          component.getScheduler().getDiagnostics().append(exitDiag);
+          LOG.warn(exitDiag);
+          // Sleep for 5 seconds in hope that the state can be recorded in ATS.
+          // In case there's a client polling the component state, it can be
+          // notified.
+          try {
+            Thread.sleep(5000);
+          } catch (InterruptedException e) {
+            LOG.error("Interrupted on sleep while exiting.", e);
+          }
+          ExitUtil.terminate(-1);
+        }
+      } else {
+        String logMsg = "[Component {}] Health threshold = {}%, Current health "
+            + "= {}% (Current Running count = {}, Desired count = {})";
+        if (healthChanged) {
+          LOG.info(logMsg, component.getName(), healthThresholdPercent,
+              runningContainerPercentStr, runningContainerCount,
+              desiredContainerCount);
+        } else {
+          LOG.debug(logMsg, component.getName(), healthThresholdPercent,
+              runningContainerPercentStr, runningContainerCount,
+              desiredContainerCount);
+        }
+        // The container health might have recovered above threshold after being
+        // below for less than the threshold window amount of time. So we need
+        // to reset firstOccurrenceTimestamp to 0.
+        if (firstOccurrenceTimestamp != 0) {
+          LOG.info(
+              "[Component {}] Resetting first occurrence to 0, since it "
+                  + "recovered above health threshold of {}%",
+              component.getName(), healthThresholdPercent);
+          firstOccurrenceTimestamp = 0;
+        }
+      }
+    }
+  }
+}
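For readers skimming the new class, the decision it implements can be condensed as below. This is an illustrative restatement, not code from the patch; the real `ContainerHealthChecker` above additionally handles logging, RM diagnostics and the short grace sleep before calling `ExitUtil.terminate`.

```java
// Condensed restatement of the per-poll check. Worked example:
// thresholdPercent = 65, desired = 5, running = 3 gives health 0.60, which is
// below 0.65; once that condition has held for longer than the configured
// window, the AM shuts the service down.
final class HealthCheckSketch {
  static boolean shouldStopService(long running, long desired,
      int thresholdPercent, long timeBelowThresholdNanos, long windowNanos) {
    if (desired == 0) {
      // Nothing to evaluate for a component with no desired containers.
      return false;
    }
    float health = (float) running / desired;
    float threshold = (float) thresholdPercent / 100;
    // Unhealthy only if health has stayed below the threshold beyond the window.
    return health < threshold && timeBelowThresholdNanos > windowNanos;
  }
}
```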
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestYarnNativeServices.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestYarnNativeServices.java
index 5e267bb15b1..72acc39139e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestYarnNativeServices.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestYarnNativeServices.java
@@ -33,6 +33,7 @@
 import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
 import org.apache.hadoop.yarn.service.api.records.Component;
 import org.apache.hadoop.yarn.service.api.records.ComponentState;
+import org.apache.hadoop.yarn.service.api.records.Configuration;
 import org.apache.hadoop.yarn.service.api.records.Container;
 import org.apache.hadoop.yarn.service.api.records.ContainerState;
 import org.apache.hadoop.yarn.service.api.records.PlacementConstraint;
@@ -489,6 +490,108 @@ public void testCreateServiceWithPlacementPolicy() throws Exception {
     client.actionDestroy(exampleApp.getName());
   }

+  // Test to verify component health threshold monitor. It uses anti-affinity
+  // placement policy to make it easier to simulate container failure by
+  // allocating more containers than the number of NMs.
+  // 1. Start mini cluster with 3 NMs and scheduler placement-constraint handler
+  // 2. Create an example service of 3 containers with anti-affinity placement
+  //    policy and health threshold = 65%, window = 3 secs, init-delay = 0 secs,
+  //    poll-frequency = 1 sec
+  // 3. Flex the component to 4 containers. This makes health = 75%, so based on
+  //    threshold the service will continue to run beyond the window of 3 secs.
+  // 4. Flex the component to 5 containers. This makes health = 60%, so based on
+  //    threshold the service will be stopped after the window of 3 secs.
+  @Test (timeout = 200000)
+  public void testComponentHealthThresholdMonitor() throws Exception {
+    // We need to enable scheduler placement-constraint at the cluster level to
+    // let apps use placement policies.
+    YarnConfiguration conf = new YarnConfiguration();
+    conf.set(YarnConfiguration.RM_PLACEMENT_CONSTRAINTS_HANDLER,
+        YarnConfiguration.SCHEDULER_RM_PLACEMENT_CONSTRAINTS_HANDLER);
+    setConf(conf);
+    setupInternal(3);
+    ServiceClient client = createClient(getConf());
+    Service exampleApp = new Service();
+    exampleApp.setName("example-app");
+    exampleApp.setVersion("v1");
+    Component comp = createComponent("compa", 3L, "sleep 1000");
+    PlacementPolicy pp = new PlacementPolicy();
+    PlacementConstraint pc = new PlacementConstraint();
+    pc.setName("CA1");
+    pc.setTargetTags(Collections.singletonList("compa"));
+    pc.setScope(PlacementScope.NODE);
+    pc.setType(PlacementType.ANTI_AFFINITY);
+    pp.setConstraints(Collections.singletonList(pc));
+    comp.setPlacementPolicy(pp);
+    Configuration config = new Configuration();
+    config.setProperty("yarn.service.container-health-threshold.percent", "65");
+    config.setProperty("yarn.service.container-health-threshold.window-secs",
+        "3");
+    config.setProperty(
+        "yarn.service.container-health-threshold.init-delay-secs", "0");
+    config.setProperty(
+        "yarn.service.container-health-threshold.poll-frequency-secs", "1");
+    comp.setConfiguration(config);
+    exampleApp.addComponent(comp);
+    client.actionCreate(exampleApp);
+    waitForServiceToBeStable(client, exampleApp);
+
+    // Check service is stable and all 3 containers are running
+    Service service = client.getStatus(exampleApp.getName());
+    Component component = service.getComponent("compa");
+    Assert.assertEquals("Service state should be STABLE", ServiceState.STABLE,
+        service.getState());
+    Assert.assertEquals("3 containers are expected to be running", 3,
+        component.getContainers().size());
+
+    // Flex compa up to 4 - will make health 75% (3 out of 4 running), but still
+    // above threshold of 65%, so service will continue to run.
+    Map compCounts = new HashMap<>();
+    compCounts.put("compa", 4L);
+    exampleApp.getComponent("compa").setNumberOfContainers(4L);
+    client.flexByRestService(exampleApp.getName(), compCounts);
+    try {
+      // Wait for 6 secs (window 3 secs + 1 for next poll + 2 for buffer). Since
+      // the service will never go to stable state (because of anti-affinity the
+      // 4th container will never be allocated) it will timeout. However, after
+      // the timeout the service should continue to run since health is 75%
+      // which is above the threshold of 65%.
+      waitForServiceToBeStable(client, exampleApp, 6000);
+      Assert.fail("Service should not be in a stable state. It should throw "
+          + "a timeout exception.");
+    } catch (Exception e) {
+      // Check that service state is STARTED and only 3 containers are running
+      service = client.getStatus(exampleApp.getName());
+      component = service.getComponent("compa");
+      Assert.assertEquals("Service state should be STARTED",
+          ServiceState.STARTED, service.getState());
+      Assert.assertEquals("Component state should be FLEXING",
+          ComponentState.FLEXING, component.getState());
+      Assert.assertEquals("3 containers are expected to be running", 3,
+          component.getContainers().size());
+    }
+
+    // Flex compa up to 5 - will make health 60% (3 out of 5 running), so
+    // service will stop since it is below threshold of 65%.
+    compCounts.put("compa", 5L);
+    exampleApp.getComponent("compa").setNumberOfContainers(5L);
+    client.flexByRestService(exampleApp.getName(), compCounts);
+    try {
+      // Wait for 11 secs (window 3 secs + 1 for next poll + 2 for buffer + 5
+      // secs of service wait before shutting down). The service should be in
+      // stopped state since its health of 60% is below the threshold of 65%
+      // (because of anti-affinity the 4th and 5th container will never be
+      // allocated).
+      waitForServiceToBeInState(client, exampleApp, ServiceState.STOPPED,
+          11000);
+    } catch (Exception e) {
+      Assert.fail("Should not have thrown exception " + e.getMessage());
+    }
+
+    LOG.info("Destroy service {}", exampleApp);
+    client.actionDestroy(exampleApp.getName());
+  }
+
   // Check containers launched are in dependency order
   // Get all containers into a list and sort based on container launch time e.g.
   // compa-c1, compa-c2, compb-c1, compb-c2;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/Configurations.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/Configurations.md
index 7ec2ecb879d..f326cfa6ea4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/Configurations.md
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/Configurations.md
@@ -118,9 +118,13 @@ Above config make the service AM to be retried at max 10 times.
 |yarn.service.am-resource.memory | the memory size in GB for the framework AM. By default, it is set to 1024
 |yarn.service.queue | the default queue to which the service will be submitted. By default, it is submitted to `default` queue
 |yarn.service.base.path | the root location for the service artifacts on hdfs for a user. By default, it is under ${user_home_dir}/.yarn/
-|yarn.service.container-failure-per-component.threshold | the max number of container failures for a given component before the AM exits.
+|yarn.service.container-failure-per-component.threshold | the max absolute number of container failures for a given component before the AM exits.
 |yarn.service.node-blacklist.threshold | Maximum number of container failures on a node before the node is blacklisted by the AM
 |yarn.service.failure-count-reset.window | The interval in seconds when the `yarn.service.container-failure-per-component.threshold` and `yarn.service.node-blacklist.threshold` gets reset. By default, it is 21600, i.e. 6 hours
+|yarn.service.container-health-threshold.percent | The container health threshold percent, when explicitly set for a specific component or globally for all components, will schedule a health check monitor to periodically check the percentage of healthy containers. It runs the check at a specified/default poll frequency. It allows a component to be below the health threshold for a specified/default window, after which it considers the service to be unhealthy and triggers a service stop. When health threshold percent is enabled, yarn.service.container-failure-per-component.threshold is ignored.
+|yarn.service.container-health-threshold.poll-frequency-secs | Health check monitor poll frequency. It is an advanced setting and does not need to be set unless the service owner understands the implication and does not want the default. The default is 10 secs.
+|yarn.service.container-health-threshold.window-secs | The amount of time the health check monitor allows a specific component to be below the health threshold, after which it considers the service to be unhealthy. The default is 600 secs (10 mins).
+|yarn.service.container-health-threshold.init-delay-secs | The amount of initial time the health check monitor waits before the first check kicks in. It gives a lead time for the service containers to come up for the first time. The default is 600 secs (10 mins).
 |yarn.service.readiness-check-interval.seconds | The interval in seconds between readiness checks. By default, it is 30 seconds
 |yarn.service.log.include-pattern| The regex expression for including log files whose file name matches it when aggregating the logs after the application completes.
 |yarn.service.log.exclude-pattern| The regex expression for excluding log files whose file name matches it when aggregating the logs after the application completes. If the log file name matches both include and exclude pattern, this file will be excluded.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/YarnServiceAPI.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/YarnServiceAPI.md
index 429c8c1a0dd..004af03a2f0 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/YarnServiceAPI.md
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/YarnServiceAPI.md
@@ -820,3 +820,38 @@ fulfilled and the service will be in non-STABLE state.
   "quicklinks": {}
 }
 ```
+
+### Create a service with health threshold monitor enabled for a component
+POST URL - http://localhost:8088/app/v1/services
+
+##### POST Request JSON
+```json
+{
+  "name": "hello-world",
+  "version": "1.0.0",
+  "description": "hello world example with health threshold monitor",
+  "components" :
+    [
+      {
+        "name": "hello",
+        "number_of_containers": 100,
+        "artifact": {
+          "id": "nginx:latest",
+          "type": "DOCKER"
+        },
+        "launch_command": "./start_nginx.sh",
+        "resource": {
+          "cpus": 1,
+          "memory": "256"
+        },
+        "configuration": {
+          "properties": {
+            "yarn.service.container-health-threshold.percent": "90",
+            "yarn.service.container-health-threshold.window-secs": "400",
+            "yarn.service.container-health-threshold.init-delay-secs": "800"
+          }
+        }
+      }
+    ]
+}
+```
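The REST example above only creates the service; once the monitor decides the component has been unhealthy for longer than the window, the AM records the diagnostic and exits, and the service is expected to end up in the STOPPED state (this is what the new test waits for). A client that wants to observe that outcome can poll the status the same way the test does. A minimal sketch, not part of the patch; the class and method names are invented, and the poll interval and attempt count are arbitrary:

```java
import org.apache.hadoop.yarn.service.api.records.Service;
import org.apache.hadoop.yarn.service.api.records.ServiceState;
import org.apache.hadoop.yarn.service.client.ServiceClient;

final class WaitForHealthStopSketch {
  // Returns once the service has reached STOPPED, e.g. after the health
  // threshold monitor stopped it; gives up after roughly two minutes.
  static void waitUntilStopped(ServiceClient client, String serviceName)
      throws Exception {
    for (int attempt = 0; attempt < 60; attempt++) {
      Service status = client.getStatus(serviceName);
      if (status.getState() == ServiceState.STOPPED) {
        return;
      }
      Thread.sleep(2000);
    }
    throw new IllegalStateException(
        "Service " + serviceName + " did not stop within the expected time");
  }
}
```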