diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/CpuTimeTracker.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/CpuTimeTracker.java new file mode 100644 index 0000000..441539c --- /dev/null +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/CpuTimeTracker.java @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.util; + +import java.math.BigInteger; + +public class CpuTimeTracker { + public static final int UNAVAILABLE = -1; + BigInteger cumulativeCpuTime = BigInteger.ZERO; // CPU used time since system + // is on (ms) + BigInteger lastCumulativeCpuTime = BigInteger.ZERO; // CPU used time read + // last time (ms) + // Unix timestamp while reading the CPU time (ms) + float cpuUsage; + long sampleTime; + long lastSampleTime; + BigInteger jiffyLengthInMillis; + final long MINIMUM_UPDATE_INTERVAL; + + public CpuTimeTracker(long jiffyLengthInMillis) { + this.jiffyLengthInMillis = BigInteger.valueOf(jiffyLengthInMillis); + this.cpuUsage = UNAVAILABLE; + this.sampleTime = UNAVAILABLE; + this.lastSampleTime = UNAVAILABLE; + MINIMUM_UPDATE_INTERVAL = 10 * jiffyLengthInMillis; + } + + public float getCpuTrackerUsage(int numProcessors) { + if (lastSampleTime == UNAVAILABLE || + lastSampleTime > sampleTime) { + // lastSampleTime > sampleTime may happen when the system time is changed + lastSampleTime = sampleTime; + lastCumulativeCpuTime = cumulativeCpuTime; + return cpuUsage; + } + // When lastSampleTime is sufficiently old, update cpuUsage. + // Also take a sample of the current time and cumulative CPU time for the + // use of the next calculation. + if (sampleTime > lastSampleTime + MINIMUM_UPDATE_INTERVAL) { + cpuUsage = + ((cumulativeCpuTime.subtract(lastCumulativeCpuTime)).floatValue()) + * 100F / ((float) (sampleTime - lastSampleTime) * numProcessors); + lastSampleTime = sampleTime; + lastCumulativeCpuTime = cumulativeCpuTime; + } + return cpuUsage; + } + + public void updateElapsedJiffies(BigInteger elapedJiffies, long currentTime) { + this.cumulativeCpuTime = elapedJiffies.multiply(jiffyLengthInMillis); + sampleTime = currentTime; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("SampleTime " + this.sampleTime); + sb.append(" CummulativeCpuTime " + this.cumulativeCpuTime); + sb.append(" LastSampleTime " + this.lastSampleTime); + sb.append(" LastCummulativeCpuTime " + this.lastCumulativeCpuTime); + sb.append(" CpuUsage " + this.cpuUsage); + sb.append(" JiffyLengthMillisec " + this.jiffyLengthInMillis); + return sb.toString(); + } +} \ No newline at end of file diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/LinuxResourceCalculatorPlugin.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/LinuxResourceCalculatorPlugin.java index 2347f40..8600b37 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/LinuxResourceCalculatorPlugin.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/LinuxResourceCalculatorPlugin.java @@ -23,6 +23,7 @@ import java.io.FileNotFoundException; import java.io.InputStreamReader; import java.io.IOException; +import java.math.BigInteger; import java.nio.charset.Charset; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -41,8 +42,6 @@ private static final Log LOG = LogFactory.getLog(LinuxResourceCalculatorPlugin.class); - public static final int UNAVAILABLE = -1; - /** * proc's meminfo virtual file has keys-values in the format * "key:[ \t]*value[ \t]kB". @@ -74,6 +73,7 @@ private static final Pattern CPU_TIME_FORMAT = Pattern.compile("^cpu[ \t]*([0-9]*)" + "[ \t]*([0-9]*)[ \t]*([0-9]*)[ \t].*"); + private CpuTimeTracker cpuTimeTracker; private String procfsMemFile; private String procfsCpuFile; @@ -87,12 +87,6 @@ private long inactiveSize = 0; // inactive cache memory (kB) private int numProcessors = 0; // number of processors on the system private long cpuFrequency = 0L; // CPU frequency on the system (kHz) - private long cumulativeCpuTime = 0L; // CPU used time since system is on (ms) - private long lastCumulativeCpuTime = 0L; // CPU used time read last time (ms) - // Unix timestamp while reading the CPU time (ms) - private float cpuUsage = UNAVAILABLE; - private long sampleTime = UNAVAILABLE; - private long lastSampleTime = UNAVAILABLE; boolean readMemInfoFile = false; boolean readCpuInfoFile = false; @@ -106,10 +100,8 @@ long getCurrentTime() { } public LinuxResourceCalculatorPlugin() { - procfsMemFile = PROCFS_MEMFILE; - procfsCpuFile = PROCFS_CPUINFO; - procfsStatFile = PROCFS_STAT; - jiffyLengthInMillis = ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS; + this(PROCFS_MEMFILE, PROCFS_CPUINFO, PROCFS_STAT, + ProcfsBasedProcessTree.JIFFY_LENGTH_IN_MILLIS); } /** @@ -128,6 +120,7 @@ public LinuxResourceCalculatorPlugin(String procfsMemFile, this.procfsCpuFile = procfsCpuFile; this.procfsStatFile = procfsStatFile; this.jiffyLengthInMillis = jiffyLengthInMillis; + this.cpuTimeTracker = new CpuTimeTracker(jiffyLengthInMillis); } /** @@ -276,12 +269,13 @@ private void readProcStatFile() { long uTime = Long.parseLong(mat.group(1)); long nTime = Long.parseLong(mat.group(2)); long sTime = Long.parseLong(mat.group(3)); - cumulativeCpuTime = uTime + nTime + sTime; // milliseconds + cpuTimeTracker.updateElapsedJiffies( + BigInteger.valueOf(uTime + nTime + sTime), + getCurrentTime()); break; } str = in.readLine(); } - cumulativeCpuTime *= jiffyLengthInMillis; } catch (IOException io) { LOG.warn("Error reading the stream " + io); } finally { @@ -345,32 +339,14 @@ public long getCpuFrequency() { @Override public long getCumulativeCpuTime() { readProcStatFile(); - return cumulativeCpuTime; + return cpuTimeTracker.cumulativeCpuTime.longValue(); } /** {@inheritDoc} */ @Override public float getCpuUsage() { readProcStatFile(); - sampleTime = getCurrentTime(); - if (lastSampleTime == UNAVAILABLE || - lastSampleTime > sampleTime) { - // lastSampleTime > sampleTime may happen when the system time is changed - lastSampleTime = sampleTime; - lastCumulativeCpuTime = cumulativeCpuTime; - return cpuUsage; - } - // When lastSampleTime is sufficiently old, update cpuUsage. - // Also take a sample of the current time and cumulative CPU time for the - // use of the next calculation. - final long MINIMUM_UPDATE_INTERVAL = 10 * jiffyLengthInMillis; - if (sampleTime > lastSampleTime + MINIMUM_UPDATE_INTERVAL) { - cpuUsage = (float)(cumulativeCpuTime - lastCumulativeCpuTime) * 100F / - ((float)(sampleTime - lastSampleTime) * getNumProcessors()); - lastSampleTime = sampleTime; - lastCumulativeCpuTime = cumulativeCpuTime; - } - return cpuUsage; + return cpuTimeTracker.getCpuTrackerUsage(getNumProcessors()); } /** diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/ProcfsBasedProcessTree.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/ProcfsBasedProcessTree.java index 69aa96d..be7ac79 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/ProcfsBasedProcessTree.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/ProcfsBasedProcessTree.java @@ -66,6 +66,7 @@ public static final String PROCFS_CMDLINE_FILE = "cmdline"; public static final long PAGE_SIZE; public static final long JIFFY_LENGTH_IN_MILLIS; // in millisecond + private final CpuTimeTracker cpuTimeTracker; enum MemInfo { SIZE("Size"), RSS("Rss"), PSS("Pss"), SHARED_CLEAN("Shared_Clean"), @@ -170,6 +171,7 @@ public ProcfsBasedProcessTree(String pid, String procfsDir) { super(pid); this.pid = getValidPID(pid); this.procfsDir = procfsDir; + this.cpuTimeTracker = new CpuTimeTracker(JIFFY_LENGTH_IN_MILLIS); } /** @@ -447,6 +449,26 @@ public long getCumulativeCpuTime() { return cpuTime; } + private BigInteger getTotalProcessJiffies() { + BigInteger totalStime = BigInteger.ZERO; + long totalUtime = 0; + for (ProcessInfo p : processTree.values()) { + if (p != null) { + totalUtime += p.getUtime(); + totalStime = totalStime.add(p.getStime()); + } + } + return totalStime.add(BigInteger.valueOf(totalUtime)); + } + + @Override + public float getCpuUsagePercent(int numProcessors) { + BigInteger processTotalJiffies = getTotalProcessJiffies(); + cpuTimeTracker.updateElapsedJiffies(processTotalJiffies, + System.currentTimeMillis()); + return cpuTimeTracker.getCpuTrackerUsage(numProcessors); + } + private static String getValidPID(String pid) { if (pid == null) return deadPid; Matcher m = numberPattern.matcher(pid); @@ -962,4 +984,42 @@ public String toString() { return sb.toString(); } } + + /** + * Test the {@link ProcfsBasedProcessTree} + * + * @param args + */ + public static void main(String[] args) { + if (args.length != 1) { + System.out.println("Provide "); + return; + } + + int numprocessors = + ResourceCalculatorPlugin.getResourceCalculatorPlugin(null, null) + .getNumProcessors(); + System.out.println("Number of processors " + numprocessors); + + System.out.println("Creating ProcfsBasedProcessTree for process " + + args[1]); + ProcfsBasedProcessTree procfsBasedProcessTree = new + ProcfsBasedProcessTree(args[0]); + procfsBasedProcessTree.updateProcessTree(); + + System.out.println(procfsBasedProcessTree.getProcessTreeDump()); + + try { + // Sleep so we can compute the CPU usage + Thread.sleep(500L); + } catch (InterruptedException e) { + // do nothing + } + + procfsBasedProcessTree.updateProcessTree(); + + System.out.println(procfsBasedProcessTree.getProcessTreeDump()); + System.out.println("Get cpu usage " + procfsBasedProcessTree + .getCpuUsagePercent(numprocessors)); + } } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/ResourceCalculatorProcessTree.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/ResourceCalculatorProcessTree.java index 85f6f1a..961b363 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/ResourceCalculatorProcessTree.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/ResourceCalculatorProcessTree.java @@ -108,13 +108,25 @@ public long getCumulativeRssmem() { /** * Get the CPU time in millisecond used by all the processes in the - * process-tree since the process-tree created + * process-tree since the process-tree was created * * @return cumulative CPU time in millisecond since the process-tree created * return 0 if it cannot be calculated */ public abstract long getCumulativeCpuTime(); + /** + * Get the CPU usage by all the processes in the process-tree based on + * average between samples as a ratio of overall CPU capacity of the machine + * Thus if 2 out of 4 cores are used this should return 50.0 + * + * @param numProcessors number of processors in the machine + * + * @return percentage CPU usage since the process-tree was created + * return 0 if it cannot be calculated + */ + public abstract float getCpuUsagePercent(int numProcessors); + /** Verify that the tree process id is same as its process group id. * @return true if the process id matches else return false. */ diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/WindowsBasedProcessTree.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/WindowsBasedProcessTree.java index 143d236..fb0e1b5 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/WindowsBasedProcessTree.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/WindowsBasedProcessTree.java @@ -34,7 +34,7 @@ static final Log LOG = LogFactory .getLog(WindowsBasedProcessTree.class); - + static class ProcessInfo { String pid; // process pid long vmem; // virtual memory @@ -202,4 +202,9 @@ public long getCumulativeCpuTime() { return cpuTimeMs; } + @Override + public float getCpuUsagePercent(int numProcessors) { + return 0; + } + } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestLinuxResourceCalculatorPlugin.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestLinuxResourceCalculatorPlugin.java index c9a33d0..ad09fdf 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestLinuxResourceCalculatorPlugin.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestLinuxResourceCalculatorPlugin.java @@ -171,8 +171,8 @@ public void parsingProcStatAndCpuFile() throws IOException { updateStatFile(uTime, nTime, sTime); assertEquals(plugin.getCumulativeCpuTime(), FAKE_JIFFY_LENGTH * (uTime + nTime + sTime)); - assertEquals(plugin.getCpuUsage(), (float)(LinuxResourceCalculatorPlugin.UNAVAILABLE),0.0); - + assertEquals(plugin.getCpuUsage(), (float)(CpuTimeTracker.UNAVAILABLE),0.0); + // Advance the time and sample again to test the CPU usage calculation uTime += 100L; plugin.advanceTime(200L); diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestResourceCalculatorProcessTree.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestResourceCalculatorProcessTree.java index 32ceb23..66c9574 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestResourceCalculatorProcessTree.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/util/TestResourceCalculatorProcessTree.java @@ -53,6 +53,11 @@ public long getCumulativeCpuTime() { return 0; } + @Override + public float getCpuUsagePercent(int numProcessors) { + return 0; + } + public boolean checkPidPgrpidForMatch() { return false; } diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java index 7850688..f89593b 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java @@ -46,10 +46,20 @@ public static final String VMEM_LIMIT_METRIC_NAME = "vMemLimit"; public static final String VCORE_LIMIT_METRIC_NAME = "vCoreLimit"; public static final String PMEM_USAGE_METRIC_NAME = "pMemUsage"; + private static final String PHY_CPU_USAGE_METRIC_NAME = "phyCpuUsagePercent"; + private static final String VCORE_USAGE_METRIC_NAME = "vcoreUsage"; @Metric public MutableStat pMemMBsStat; + // This tracks overall CPU percentage of the machine in terms of percentage + // Thus if you use 2 cores out of 4 available cores this value will be 50 + @Metric + public MutableStat totalCpuPercentUsed; + + @Metric + public MutableStat vcoresUsed; + @Metric public MutableGaugeInt pMemLimitMbs; @@ -57,7 +67,7 @@ public MutableGaugeInt vMemLimitMbs; @Metric - public MutableGaugeInt cpuVcores; + public MutableGaugeInt cpuVcoreLimit; static final MetricsInfo RECORD_INFO = info("ContainerResource", "Resource limit and usage by container"); @@ -95,11 +105,16 @@ this.pMemMBsStat = registry.newStat( PMEM_USAGE_METRIC_NAME, "Physical memory stats", "Usage", "MBs", true); + this.totalCpuPercentUsed = registry.newStat( + PHY_CPU_USAGE_METRIC_NAME, "Physical Cpu percent usage stats", "Usage", + "Percents", true); + this.vcoresUsed = registry.newStat( + VCORE_USAGE_METRIC_NAME, "Vcore usage stats", "Usage", "Vcores", true); this.pMemLimitMbs = registry.newGauge( PMEM_LIMIT_METRIC_NAME, "Physical memory limit in MBs", 0); this.vMemLimitMbs = registry.newGauge( VMEM_LIMIT_METRIC_NAME, "Virtual memory limit in MBs", 0); - this.cpuVcores = registry.newGauge( + this.cpuVcoreLimit = registry.newGauge( VCORE_LIMIT_METRIC_NAME, "CPU limit in number of vcores", 0); } @@ -170,6 +185,11 @@ public void recordMemoryUsage(int memoryMBs) { this.pMemMBsStat.add(memoryMBs); } + public void recordCpuUsage(int totalPhysicalCpuPercent, int vcoresUsed) { + this.totalCpuPercentUsed.add(totalPhysicalCpuPercent); + this.vcoresUsed.add(vcoresUsed); + } + public void recordProcessId(String processId) { registry.tag(PROCESSID_INFO, processId); } @@ -177,7 +197,7 @@ public void recordProcessId(String processId) { public void recordResourceLimit(int vmemLimit, int pmemLimit, int cpuVcores) { this.vMemLimitMbs.set(vmemLimit); this.pMemLimitMbs.set(pmemLimit); - this.cpuVcores.set(cpuVcores); + this.cpuVcoreLimit.set(cpuVcores); } private synchronized void scheduleTimerTaskIfRequired() { diff --git hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java index 2cecda6..2cedc2f 100644 --- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java +++ hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java @@ -38,6 +38,7 @@ import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerKillEvent; +import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils; import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree; import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin; @@ -434,6 +435,12 @@ public void run() { pTree.updateProcessTree(); // update process-tree long currentVmemUsage = pTree.getCumulativeVmem(); long currentPmemUsage = pTree.getCumulativeRssmem(); + float totalCpuPercentage = pTree.getCpuUsagePercent + (resourceCalculatorPlugin.getNumProcessors()); + int numVcoresUsed = (int) (totalCpuPercentage + * maxVCoresAllottedForContainers / + (100 * NodeManagerHardwareUtils.getContainersCores + (resourceCalculatorPlugin, conf))); // as processes begin with an age 1, we want to see if there // are processes more than 1 iteration old. long curMemUsageOfAgedProcesses = pTree.getCumulativeVmem(1); @@ -451,6 +458,9 @@ public void run() { ContainerMetrics.forContainer( containerId, containerMetricsPeriodMs).recordMemoryUsage( (int) (currentPmemUsage >> 20)); + ContainerMetrics.forContainer( + containerId, containerMetricsPeriodMs).recordCpuUsage + ((int)totalCpuPercentage, numVcoresUsed); } boolean isMemoryOverLimit = false;