diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 6677478..4eaf274 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1328,7 +1328,10 @@ public static boolean isAclEnabled(Configuration conf) { * How long for ResourceManager to wait for NodeManager to report its * log aggregation status. If waiting time of which the log aggregation status * is reported from NodeManager exceeds the configured value, RM will report - * log aggregation status for this NodeManager as TIME_OUT + * log aggregation status for this NodeManager as TIME_OUT. + * + * This configuration will be used in NodeManager as well to decide + * whether and when to delete the cached log aggregation status. */ public static final String LOG_AGGREGATION_STATUS_TIME_OUT_MS = YARN_PREFIX + "log-aggregation-status.time-out.ms"; diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index adf8d8a..77502b6 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -1286,7 +1286,9 @@ How long for ResourceManager to wait for NodeManager to report its log aggregation status. If waiting time of which the log aggregation status is reported from NodeManager exceeds the configured value, RM - will report log aggregation status for this NodeManager as TIME_OUT + will report log aggregation status for this NodeManager as TIME_OUT. 
+ This configuration will be used in NodeManager as well to decide + whether and when to delete the cached log aggregation status. yarn.log-aggregation-status.time-out.ms 600000 diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java index fc30a80..ff50330 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/RegisterNodeManagerRequest.java @@ -112,4 +112,9 @@ public abstract void setRunningApplications( * @param physicalResource Physical resources in the node. 
*/ public abstract void setPhysicalResource(Resource physicalResource); + + public abstract List getLogAggregationReportsForApps(); + + public abstract void setLogAggregationReportsForApps( + List logAggregationReportsForApps); } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java index eda06d0..02fd20f 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/api/protocolrecords/impl/pb/RegisterNodeManagerRequestPBImpl.java @@ -38,11 +38,13 @@ import org.apache.hadoop.yarn.proto.YarnProtos.NodeIdProto; import org.apache.hadoop.yarn.proto.YarnProtos.NodeLabelProto; import org.apache.hadoop.yarn.proto.YarnProtos.ResourceProto; +import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.LogAggregationReportProto; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NMContainerStatusProto; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeLabelsProto; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeLabelsProto.Builder; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.RegisterNodeManagerRequestProto; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.RegisterNodeManagerRequestProtoOrBuilder; +import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; import 
org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest; @@ -57,6 +59,8 @@ private List runningApplications = null; private Set labels = null; + private List logAggregationReportsForApps = null; + /** Physical resources in the node. */ private Resource physicalResource = null; @@ -100,6 +104,48 @@ private synchronized void mergeLocalToBuilder() { if (this.physicalResource != null) { builder.setPhysicalResource(convertToProtoFormat(this.physicalResource)); } + if (this.logAggregationReportsForApps != null) { + addLogAggregationStatusForAppsToProto(); + } + } + + private void addLogAggregationStatusForAppsToProto() { + maybeInitBuilder(); + builder.clearLogAggregationReportsForApps(); + if (this.logAggregationReportsForApps == null) { + return; + } + Iterable it = + new Iterable() { + @Override + public Iterator iterator() { + return new Iterator() { + private Iterator iter = + logAggregationReportsForApps.iterator(); + + @Override + public boolean hasNext() { + return iter.hasNext(); + } + + @Override + public LogAggregationReportProto next() { + return convertToProtoFormat(iter.next()); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + }; + builder.addAllLogAggregationReportsForApps(it); + } + + private LogAggregationReportProto convertToProtoFormat( + LogAggregationReport value) { + return ((LogAggregationReportPBImpl) value).getProto(); } private synchronized void addNMContainerStatusesToProto() { @@ -400,4 +446,38 @@ private static NMContainerStatusProto convertToProtoFormat( NMContainerStatus c) { return ((NMContainerStatusPBImpl)c).getProto(); } + + @Override + public synchronized List + getLogAggregationReportsForApps() { + if (this.logAggregationReportsForApps != null) { + return this.logAggregationReportsForApps; + } + initLogAggregationReportsForApps(); + return logAggregationReportsForApps; + } + + private void initLogAggregationReportsForApps() { + 
RegisterNodeManagerRequestProtoOrBuilder p = viaProto ? proto : builder; + List list = + p.getLogAggregationReportsForAppsList(); + this.logAggregationReportsForApps = new ArrayList(); + for (LogAggregationReportProto c : list) { + this.logAggregationReportsForApps.add(convertFromProtoFormat(c)); + } + } + + private LogAggregationReport convertFromProtoFormat( + LogAggregationReportProto logAggregationReport) { + return new LogAggregationReportPBImpl(logAggregationReport); + } + + @Override + public synchronized void setLogAggregationReportsForApps( + List logAggregationStatusForApps) { + if(logAggregationStatusForApps == null) { + builder.clearLogAggregationReportsForApps(); + } + this.logAggregationReportsForApps = logAggregationStatusForApps; + } } \ No newline at end of file diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto index e782cc2..1b090bf 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/proto/yarn_server_common_service_protos.proto @@ -66,6 +66,7 @@ message RegisterNodeManagerRequestProto { repeated ApplicationIdProto runningApplications = 7; optional NodeLabelsProto nodeLabels = 8; optional ResourceProto physicalResource = 9; + repeated LogAggregationReportProto log_aggregation_reports_for_apps = 10; } message RegisterNodeManagerResponseProto { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java 
index d7e3b52..84b3915 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java @@ -33,8 +33,8 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; - import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; +import org.apache.hadoop.yarn.server.nodemanager.logaggregation.tracker.NMLogAggregationStatusTracker; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; @@ -121,6 +121,8 @@ NMTimelinePublisher getNMTimelinePublisher(); + NMLogAggregationStatusTracker getNMLogAggregationStatusTracker(); + ContainerExecutor getContainerExecutor(); ContainerStateTransitionListener getContainerStateTransitionListener(); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java index 42b7b5f..2748a8f 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java @@ -59,6 +59,7 
@@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; +import org.apache.hadoop.yarn.server.nodemanager.logaggregation.tracker.NMLogAggregationStatusTracker; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.ConfigurationNodeLabelsProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; @@ -135,6 +136,7 @@ public int getExitCode() { private boolean rmWorkPreservingRestartEnabled; private boolean shouldExitOnShutdownEvent = false; + private NMLogAggregationStatusTracker nmLogAggregationStatusTracker; /** * Default Container State transition listener. */ @@ -424,6 +426,12 @@ protected void serviceInit(Configuration conf) throws Exception { addService(containerManager); ((NMContext) context).setContainerManager(containerManager); + this.nmLogAggregationStatusTracker = createNMLogAggregationStatusTracker( + context); + addService(nmLogAggregationStatusTracker); + ((NMContext)context).setNMLogAggregationStatusTracker( + this.nmLogAggregationStatusTracker); + WebServer webServer = createWebServer(context, containerManager .getContainersMonitor(), this.aclsManager, dirsHandler); addService(webServer); @@ -621,6 +629,8 @@ protected void reregisterCollectors() { private ResourcePluginManager resourcePluginManager; + private NMLogAggregationStatusTracker nmLogAggregationStatusTracker; + public NMContext(NMContainerTokenSecretManager containerTokenSecretManager, NMTokenSecretManagerInNM nmTokenSecretManager, LocalDirsHandlerService dirsHandler, ApplicationACLsManager aclsManager, @@ -862,6 +872,15 @@ public DeletionService getDeletionService() { public void setDeletionService(DeletionService deletionService) { 
// --- NMContext members ---

/**
 * Stores the log aggregation status tracker on this context.
 *
 * @param nmLogAggregationStatusTracker the tracker to expose through
 *          {@link #getNMLogAggregationStatusTracker()}
 */
public void setNMLogAggregationStatusTracker(
    NMLogAggregationStatusTracker nmLogAggregationStatusTracker) {
  this.nmLogAggregationStatusTracker = nmLogAggregationStatusTracker;
}

/**
 * Returns the tracker stored by the setter; the field has no initializer,
 * so this is {@code null} until the setter has been called.
 */
@Override
public NMLogAggregationStatusTracker getNMLogAggregationStatusTracker() {
  return nmLogAggregationStatusTracker;
}

// --- NodeManager member ---

/**
 * Creates the tracker that caches log aggregation statuses for this
 * NodeManager.
 *
 * @param ctxt the NodeManager context handed to the tracker's constructor
 * @return a new, not yet started tracker
 */
private NMLogAggregationStatusTracker createNMLogAggregationStatusTracker(
    Context ctxt) {
  return new NMLogAggregationStatusTracker(ctxt);
}
request.setLogAggregationReportsForApps(logAggregationReports); + } + } regNMResponse = resourceTracker.registerNodeManager(request); // Make sure rmIdentifier is set before we release the lock diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java index 4ac150a..c7e06ff 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java @@ -385,7 +385,8 @@ private void sendLogAggregationReport( logAggregationSucceedInThisCycle ? LogAggregationStatus.RUNNING : LogAggregationStatus.RUNNING_WITH_FAILURE; - sendLogAggregationReportInternal(logAggregationStatus, diagnosticMessage); + sendLogAggregationReportInternal(logAggregationStatus, diagnosticMessage, + false); if (appFinished) { // If the app is finished, one extra final report with log aggregation // status SUCCEEDED/FAILED will be sent to RM to inform the RM @@ -394,18 +395,22 @@ private void sendLogAggregationReport( renameTemporaryLogFileFailed || !logAggregationSucceedInThisCycle ? 
LogAggregationStatus.FAILED : LogAggregationStatus.SUCCEEDED; - sendLogAggregationReportInternal(finalLogAggregationStatus, ""); + sendLogAggregationReportInternal(finalLogAggregationStatus, "", true); } } private void sendLogAggregationReportInternal( - LogAggregationStatus logAggregationStatus, String diagnosticMessage) { + LogAggregationStatus logAggregationStatus, String diagnosticMessage, + boolean finalized) { LogAggregationReport report = Records.newRecord(LogAggregationReport.class); report.setApplicationId(appId); report.setDiagnosticMessage(diagnosticMessage); report.setLogAggregationStatus(logAggregationStatus); this.context.getLogAggregationStatusForApps().add(report); + this.context.getNMLogAggregationStatusTracker().updateLogAggregationStatus( + appId, logAggregationStatus, System.currentTimeMillis(), + diagnosticMessage, finalized); } @SuppressWarnings("unchecked") diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/logaggregation/tracker/NMLogAggregationStatusTracker.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/logaggregation/tracker/NMLogAggregationStatusTracker.java new file mode 100644 index 0000000..57a5f3f --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/logaggregation/tracker/NMLogAggregationStatusTracker.java @@ -0,0 +1,260 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.logaggregation.tracker; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Timer; +import java.util.TimerTask; +import java.util.Map.Entry; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; +import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.service.CompositeService; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.LogAggregationStatus; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link NMLogAggregationStatusTracker} is used to cache log aggregation + * status for finished applications. It will also delete the old cached + * log aggregation status periodically. 
+ * + */ +public class NMLogAggregationStatusTracker extends CompositeService { + + private static final Logger LOG = + LoggerFactory.getLogger(NMLogAggregationStatusTracker.class); + + private final ReadLock readLocker; + private final WriteLock writeLocker; + private final Context nmContext; + private final long rollingInterval; + private final Timer timer; + private final Map + recoveryStatuses; + private boolean disabled = false; + + public NMLogAggregationStatusTracker(Context context) { + super(NMLogAggregationStatusTracker.class.getName()); + this.nmContext = context; + Configuration conf = context.getConf(); + if (!conf.getBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, + YarnConfiguration.DEFAULT_LOG_AGGREGATION_ENABLED)) { + disabled = true; + } + this.recoveryStatuses = new ConcurrentHashMap<>(); + ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + this.readLocker = lock.readLock(); + this.writeLocker = lock.writeLock(); + this.timer = new Timer(); + this.rollingInterval = conf.getLong( + YarnConfiguration.LOG_AGGREGATION_STATUS_TIME_OUT_MS, + YarnConfiguration.DEFAULT_LOG_AGGREGATION_STATUS_TIME_OUT_MS); + LOG.info("the rolling interval seconds for the NodeManager Cached Log " + + "aggregation status is " + (rollingInterval/1000)); + } + + @Override + protected void serviceStart() throws Exception { + if (disabled) { + LOG.warn("Log Aggregation is disabled." + + "So is the LogAggregationStatusTracker."); + } else { + this.timer.scheduleAtFixedRate(new LogAggregationStatusRoller(), + rollingInterval, rollingInterval); + } + } + + @Override + public void serviceStop() throws Exception { + this.timer.cancel(); + } + + public void updateLogAggregationStatus(ApplicationId appId, + LogAggregationStatus logAggregationStatus, long updateTime, + String diagnosis, boolean finalized) { + if (disabled) { + LOG.warn("The log aggregation is diabled. 
No need to update " + + "the log aggregation status"); + } + // In NM, each application has exactly one appLogAggregator thread + // to handle the log aggregation. So, it is fine which multiple + // appLogAggregator thread to update log aggregation status for its + // own application. This is why we are using readLocker here. + this.readLocker.lock(); + try { + AppLogAggregationStatusForRMRecovery tracker = recoveryStatuses + .get(appId); + if (tracker == null) { + Application application = this.nmContext.getApplications().get(appId); + if (application == null) { + LOG.warn("The application:" + appId + " has already finished," + + " and has been removed from NodeManager, we should not " + + "receive the log aggregation status update for " + + "this application."); + return; + } + AppLogAggregationStatusForRMRecovery newTracker = + new AppLogAggregationStatusForRMRecovery(logAggregationStatus, + diagnosis); + newTracker.setLastModifiedTime(updateTime); + newTracker.setFinalized(finalized); + recoveryStatuses.put(appId, newTracker); + } else { + if (tracker.isFinalized()) { + LOG.warn("Ignore the log aggregation status update request " + + "for the application:" + appId + ". The cached log aggregation " + + "status is " + tracker.getLogAggregationStatus() + "."); + } else { + if (tracker.getLastModifiedTime() > updateTime) { + LOG.warn("Ignore the log aggregation status update request " + + "for the application:" + appId + ". The request log " + + "aggregation status update is older than the cached " + + "log aggregation status."); + } else { + tracker.setLogAggregationStatus(logAggregationStatus); + tracker.setDiagnosis(diagnosis); + tracker.setLastModifiedTime(updateTime); + tracker.setFinalized(finalized); + recoveryStatuses.put(appId, tracker); + } + } + } + } finally { + this.readLocker.unlock(); + } + } + + public List pullCachedLogAggregationReports() { + List reports = new ArrayList<>(); + if (disabled) { + LOG.warn("The log aggregation is diabled." 
+ + "There is no cached log aggregation status."); + return reports; + } + // When we pull cached Log aggregation reports for all application in + // this NM, we should make sure that we need to block all of the + // updateLogAggregationStatus calls. So, the writeLocker is used here. + this.writeLocker.lock(); + try { + for(Entry tracker : + recoveryStatuses.entrySet()) { + AppLogAggregationStatusForRMRecovery current = tracker.getValue(); + LogAggregationReport report = LogAggregationReport.newInstance( + tracker.getKey(), current.getLogAggregationStatus(), + current.getDiagnosis()); + reports.add(report); + } + return reports; + } finally { + this.writeLocker.unlock(); + } + } + + private class LogAggregationStatusRoller extends TimerTask { + @Override + public void run() { + rollLogAggregationStatus(); + } + } + + private void rollLogAggregationStatus() { + // When we call call rollLogAggregationStatus, basically fetch all + // cached log aggregation status and delete the out-of-timeout period + // log aggregation status, we should block the rollLogAggregationStatus + // calls as well as pullCachedLogAggregationReports call. So, the + // writeLocker is used here. + this.writeLocker.lock(); + try { + long currentTimeStamp = System.currentTimeMillis(); + LOG.info("Rolling over the cached log aggregation status."); + Iterator> it + = recoveryStatuses.entrySet().iterator(); + while (it.hasNext()) { + Entry tracker = + it.next(); + // the application has finished. 
+ if (nmContext.getApplications().get(tracker.getKey()) == null) { + if (currentTimeStamp - tracker.getValue().getLastModifiedTime() + > rollingInterval) { + it.remove(); + } + } + } + } finally { + this.writeLocker.unlock(); + } + } + + private static class AppLogAggregationStatusForRMRecovery { + private LogAggregationStatus logAggregationStatus; + private long lastModifiedTime; + private boolean finalized; + private String diagnosis; + + AppLogAggregationStatusForRMRecovery( + LogAggregationStatus logAggregationStatus, String diagnosis) { + this.setLogAggregationStatus(logAggregationStatus); + this.setDiagnosis(diagnosis); + } + + public LogAggregationStatus getLogAggregationStatus() { + return logAggregationStatus; + } + + public void setLogAggregationStatus( + LogAggregationStatus logAggregationStatus) { + this.logAggregationStatus = logAggregationStatus; + } + + public long getLastModifiedTime() { + return lastModifiedTime; + } + + public void setLastModifiedTime(long lastModifiedTime) { + this.lastModifiedTime = lastModifiedTime; + } + + public boolean isFinalized() { + return finalized; + } + + public void setFinalized(boolean finalized) { + this.finalized = finalized; + } + + public String getDiagnosis() { + return diagnosis; + } + + public void setDiagnosis(String diagnosis) { + this.diagnosis = diagnosis; + } + } +} diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java index 9602142..4b1a887 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java +++ 
/**
 * This test context provides no log aggregation status tracker and always
 * returns {@code null}, as the neighbouring {@code getDeletionService()}
 * stub does.
 */
@Override
public NMLogAggregationStatusTracker getNMLogAggregationStatusTracker() {
  return null;
}
org.mockito.Mockito.spy; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyBoolean; +import static org.mockito.Matchers.anyLong; +import static org.mockito.Matchers.anyString; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -48,6 +55,7 @@ import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ExecutionType; import org.apache.hadoop.yarn.api.records.LogAggregationContext; +import org.apache.hadoop.yarn.api.records.LogAggregationStatus; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Resource; @@ -77,6 +85,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.deletion.task.DeletionTask; import org.apache.hadoop.yarn.server.nodemanager.executor.DeletionAsUserContext; +import org.apache.hadoop.yarn.server.nodemanager.logaggregation.tracker.NMLogAggregationStatusTracker; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; @@ -130,6 +139,16 @@ public int getHttpPort() { public ContainerExecutor getContainerExecutor() { return exec; } + + @Override + public NMLogAggregationStatusTracker getNMLogAggregationStatusTracker() { + NMLogAggregationStatusTracker mock = mock( + NMLogAggregationStatusTracker.class); + doNothing().when(mock).updateLogAggregationStatus( + any(ApplicationId.class), any(LogAggregationStatus.class), + anyLong(), anyString(), anyBoolean()); + return mock; + } }; protected ContainerExecutor exec; protected DeletionService delSrvc; diff --git 
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/logaggregation/tracker/TestNMLogAggregationStatusTracker.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/logaggregation/tracker/TestNMLogAggregationStatusTracker.java
new file mode 100644
index 0000000..3105b4e
--- /dev/null
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/logaggregation/tracker/TestNMLogAggregationStatusTracker.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.logaggregation.tracker;
+
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.LogAggregationStatus;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Function test for {@link NMLogAggregationStatusTracker}.
+ *
+ */
+public class TestNMLogAggregationStatusTracker {
+
+  /**
+   * Verifies which status updates the tracker caches: none while log
+   * aggregation is disabled, none for an application missing from the NM
+   * context, and none once a final status has been recorded for the app.
+   */
+  @Test
+  public void testNMLogAggregationStatusUpdate() {
+    Context mockContext = mock(Context.class);
+    ConcurrentMap<ApplicationId, Application> apps = new ConcurrentHashMap<>();
+    ApplicationId appId1 = ApplicationId.newInstance(
+        System.currentTimeMillis(), 1);
+    apps.putIfAbsent(appId1, mock(Application.class));
+    when(mockContext.getApplications()).thenReturn(apps);
+    // the log aggregation is disabled.
+    Configuration conf = new YarnConfiguration();
+    conf.setBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, false);
+    when(mockContext.getConf()).thenReturn(conf);
+    NMLogAggregationStatusTracker tracker = new NMLogAggregationStatusTracker(
+        mockContext);
+    ApplicationId appId0 = ApplicationId.newInstance(0, 0);
+    tracker.updateLogAggregationStatus(appId0,
+        LogAggregationStatus.RUNNING, System.currentTimeMillis(), "", false);
+    List<LogAggregationReport> reports = tracker
+        .pullCachedLogAggregationReports();
+    // we can not get any cached log aggregation status because
+    // the log aggregation is disabled.
+    Assert.assertTrue(reports.isEmpty());
+
+    // enable the log aggregation.
+    conf.setBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, true);
+    when(mockContext.getConf()).thenReturn(conf);
+    tracker = new NMLogAggregationStatusTracker(mockContext);
+    // update the log aggregation status for an un-existed/finished
+    // application, we should ignore the status update request.
+    appId0 = ApplicationId.newInstance(0, 0);
+    tracker.updateLogAggregationStatus(appId0,
+        LogAggregationStatus.RUNNING,
+        System.currentTimeMillis() - 15 * 60 * 1000, "", false);
+    reports = tracker
+        .pullCachedLogAggregationReports();
+    Assert.assertTrue(reports.isEmpty());
+
+    tracker.updateLogAggregationStatus(appId1,
+        LogAggregationStatus.RUNNING,
+        System.currentTimeMillis() - 60 * 1000, "", false);
+    reports = tracker
+        .pullCachedLogAggregationReports();
+    Assert.assertEquals(1, reports.size());
+    Assert.assertEquals(LogAggregationStatus.RUNNING,
+        reports.get(0).getLogAggregationStatus());
+
+    tracker.updateLogAggregationStatus(appId1,
+        LogAggregationStatus.SUCCEEDED,
+        System.currentTimeMillis() - 1 * 60 * 1000, "", true);
+    reports = tracker
+        .pullCachedLogAggregationReports();
+    Assert.assertEquals(1, reports.size());
+    Assert.assertEquals(LogAggregationStatus.SUCCEEDED,
+        reports.get(0).getLogAggregationStatus());
+
+    // the log aggregation status is finalized. So, we would
+    // ignore the following update
+    tracker.updateLogAggregationStatus(appId1,
+        LogAggregationStatus.FAILED,
+        System.currentTimeMillis() - 1 * 60 * 1000, "", true);
+    reports = tracker
+        .pullCachedLogAggregationReports();
+    Assert.assertEquals(1, reports.size());
+    Assert.assertEquals(LogAggregationStatus.SUCCEEDED,
+        reports.get(0).getLogAggregationStatus());
+  }
+
+  /**
+   * Verifies that a cached status is dropped once the configured
+   * log-aggregation-status time-out has elapsed.
+   */
+  @Test(timeout = 60000)
+  public void testLogAggregationStatusRoller() throws InterruptedException {
+    Context mockContext = mock(Context.class);
+    Configuration conf = new YarnConfiguration();
+    final long timeoutMs = 10 * 1000;
+    conf.setLong(YarnConfiguration.LOG_AGGREGATION_STATUS_TIME_OUT_MS,
+        timeoutMs);
+    when(mockContext.getConf()).thenReturn(conf);
+    ConcurrentMap<ApplicationId, Application> apps = new ConcurrentHashMap<>();
+    ApplicationId appId1 = ApplicationId.newInstance(
+        System.currentTimeMillis(), 1);
+    apps.putIfAbsent(appId1, mock(Application.class));
+    when(mockContext.getApplications()).thenReturn(apps);
+    NMLogAggregationStatusTracker tracker = new NMLogAggregationStatusTracker(
+        mockContext);
+    tracker.updateLogAggregationStatus(appId1,
+        LogAggregationStatus.RUNNING,
+        System.currentTimeMillis(), "", false);
+    // verify that we have cached the log aggregation status for app1
+    List<LogAggregationReport> reports = tracker
+        .pullCachedLogAggregationReports();
+    Assert.assertEquals(1, reports.size());
+    Assert.assertEquals(LogAggregationStatus.RUNNING,
+        reports.get(0).getLogAggregationStatus());
+    // Sleep for the full time-out, then poll for a bounded grace period
+    // rather than asserting at the exact expiry instant; the clean-up is
+    // presumably driven by a background task, so a fixed sleep equal to
+    // the time-out is racy.
+    Thread.sleep(timeoutMs);
+    long deadline = System.currentTimeMillis() + 2 * timeoutMs;
+    reports = tracker.pullCachedLogAggregationReports();
+    while (!reports.isEmpty() && System.currentTimeMillis() < deadline) {
+      Thread.sleep(100);
+      reports = tracker.pullCachedLogAggregationReports();
+    }
+    // the cached log aggregation status should be deleted.
+    Assert.assertEquals(0, reports.size());
+  }
+}
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
index 9d95f63..e997192 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java
@@ -399,9 +399,21 @@ public RegisterNodeManagerResponse registerNodeManager(
 
     RMNode oldNode = this.rmContext.getRMNodes().putIfAbsent(nodeId, rmNode);
     if (oldNode == null) {
+      RMNodeStartedEvent startEvent = new RMNodeStartedEvent(nodeId,
+          request.getNMContainerStatuses(),
+          request.getRunningApplications());
+      if (request.getLogAggregationReportsForApps() != null
+          && !request.getLogAggregationReportsForApps().isEmpty()) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Found the number of previous cached log aggregation "
+              + "status from nodemanager:" + nodeId + " is :"
+              + request.getLogAggregationReportsForApps().size());
+        }
+        startEvent.setLogAggregationReportsForApps(request
+            .getLogAggregationReportsForApps());
+      }
       this.rmContext.getDispatcher().getEventHandler().handle(
-          new RMNodeStartedEvent(nodeId, request.getNMContainerStatuses(),
-              request.getRunningApplications()));
+          startEvent);
     } else {
       LOG.info("Reconnect from the node at: " + host);
       this.nmLivelinessMonitor.unregister(nodeId);
@@ -426,7 +438,6 @@ public RegisterNodeManagerResponse registerNodeManager(
         this.rmContext.getRMNodes().put(nodeId, rmNode);
         this.rmContext.getDispatcher().getEventHandler()
             .handle(new RMNodeStartedEvent(nodeId, null, null));
-
       } else {
         // Reset heartbeat ID since node just restarted.
         oldNode.resetLastNodeHeartBeatResponse();
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java
index 3cbde01..b942afa 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java
@@ -866,6 +866,12 @@ public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
       rmNode.context.getDispatcher().getEventHandler().handle(
           new NodesListManagerEvent(
               NodesListManagerEventType.NODE_USABLE, rmNode));
+      List<LogAggregationReport> logAggregationReportsForApps =
+          startEvent.getLogAggregationReportsForApps();
+      if (logAggregationReportsForApps != null
+          && !logAggregationReportsForApps.isEmpty()) {
+        rmNode.handleLogAggregationStatus(logAggregationReportsForApps);
+      }
     }
   }
 
diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeStartedEvent.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeStartedEvent.java
index 4fc983a..3976994 100644
--- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeStartedEvent.java
+++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeStartedEvent.java
@@ -22,12 +22,14 @@
 
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.NodeId;
+import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
 import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
 
 public class RMNodeStartedEvent extends RMNodeEvent {
 
   private List<NMContainerStatus> containerStatuses;
   private List<ApplicationId> runningApplications;
+  private List<LogAggregationReport> logAggregationReportsForApps;
 
   public RMNodeStartedEvent(NodeId nodeId,
       List<NMContainerStatus> containerReports,
@@ -44,4 +46,13 @@ public RMNodeStartedEvent(NodeId nodeId,
   public List<ApplicationId> getRunningApplications() {
     return runningApplications;
   }
+
+  public List<LogAggregationReport> getLogAggregationReportsForApps() {
+    return this.logAggregationReportsForApps;
+  }
+
+  public void setLogAggregationReportsForApps(
+      List<LogAggregationReport> logAggregationReportsForApps) {
+    this.logAggregationReportsForApps = logAggregationReportsForApps;
+  }
 }