Skip to content

Add container.cpu.time metric #5806

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ public long GetHostCpuUsageInNanoseconds()
$"'{_procStat}' should contain whitespace separated values according to POSIX. We've failed trying to get {i}th value. File content: '{new string(stat)}'.");
}

stat = stat.Slice(next, stat.Length - next);
stat = stat.Slice(next);
}

return (long)(total / (double)_userHz * NanosecondsInSecond);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ public string GetCgroupPath(string filename)
}

// Extract the part after the last colon and cache it for future use
ReadOnlySpan<char> trimmedPath = fileContent.Slice(colonIndex + 1);
ReadOnlySpan<char> trimmedPath = fileContent[(colonIndex + 1)..];
_cachedCgroupPath = "/sys/fs/cgroup" + trimmedPath.ToString().TrimEnd('/') + "/";

return $"{_cachedCgroupPath}{filename}";
Expand Down Expand Up @@ -195,7 +195,7 @@ public long GetHostCpuUsageInNanoseconds()
$"'{_procStat}' should contain whitespace separated values according to POSIX. We've failed trying to get {i}th value. File content: '{new string(stat)}'.");
}

stat = stat.Slice(next, stat.Length - next);
stat = stat.Slice(next);
}

return (long)(total / (double)_userHz * NanosecondsInSecond);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
using System;
using System.Collections.Generic;
using System.Diagnostics.Metrics;
using System.Linq;
using System.Threading;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
Expand All @@ -17,6 +16,7 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
{
private const double One = 1.0;
private const long Hundred = 100L;
private const double NanosecondsInSecond = 1_000_000_000;

private readonly object _cpuLocker = new();
private readonly object _memoryLocker = new();
Expand Down Expand Up @@ -82,14 +82,19 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
(_previousCgroupCpuTime, _previousCgroupCpuPeriodCounter) = _parser.GetCgroupCpuUsageInNanosecondsAndCpuPeriodsV2();

_ = meter.CreateObservableGauge(
ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
() => GetMeasurementWithRetry(() => CpuUtilizationLimit(cpuLimit)),
"1");
name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationLimit(cpuLimit)),
unit: "1");

_ = meter.CreateObservableGauge(
name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationRequest(cpuRequest)),
unit: "1");

// CPU time is a cumulative quantity reported in seconds, so it is published as an
// observable counter with unit "s" (not as a unit-less gauge). This matches the
// instrument type documented on ResourceUtilizationInstruments.ContainerCpuTime.
_ = meter.CreateObservableCounter(
    name: ResourceUtilizationInstruments.ContainerCpuTime,
    observeValues: GetCpuTime,
    unit: "s",
    description: "CPU time used by the container.");
}
else
{
Expand All @@ -111,12 +116,12 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi

_ = meter.CreateObservableGauge(
name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization,
observeValues: () => GetMeasurementWithRetry(() => MemoryUtilization()),
observeValues: () => GetMeasurementWithRetry(MemoryUtilization),
unit: "1");

_ = meter.CreateObservableGauge(
name: ResourceUtilizationInstruments.ProcessMemoryUtilization,
observeValues: () => GetMeasurementWithRetry(() => MemoryUtilization()),
observeValues: () => GetMeasurementWithRetry(MemoryUtilization),
unit: "1");

// cpuRequest is a CPU request (aka guaranteed number of CPU units) for a pod; for a host it's 1 core
Expand Down Expand Up @@ -259,23 +264,32 @@ public Snapshot GetSnapshot()
memoryUsageInBytes: memoryUsed);
}

private IEnumerable<Measurement<double>> GetMeasurementWithRetry(Func<double> func)
// Wraps a single value produced by func into a one-element measurement array.
// Yields an empty array when TryGetValueWithRetry reports that no value is available.
private Measurement<double>[] GetMeasurementWithRetry(Func<double> func)
{
    return TryGetValueWithRetry(func, out double measured)
        ? new[] { new Measurement<double>(measured) }
        : Array.Empty<Measurement<double>>();
}

private bool TryGetValueWithRetry<T>(Func<T> func, out T value)
where T : struct
{
value = default;
if (Volatile.Read(ref _measurementsUnavailable) == 1 &&
_timeProvider.GetUtcNow() - _lastFailure < _retryInterval)
{
return Enumerable.Empty<Measurement<double>>();
return false;
}

try
{
double result = func();
if (Volatile.Read(ref _measurementsUnavailable) == 1)
{
_ = Interlocked.Exchange(ref _measurementsUnavailable, 0);
}
value = func();
_ = Interlocked.CompareExchange(ref _measurementsUnavailable, 0, 1);

return new[] { new Measurement<double>(result) };
return true;
}
catch (Exception ex) when (
ex is System.IO.FileNotFoundException ||
Expand All @@ -285,12 +299,25 @@ ex is System.IO.DirectoryNotFoundException ||
_lastFailure = _timeProvider.GetUtcNow();
_ = Interlocked.Exchange(ref _measurementsUnavailable, 1);

return Enumerable.Empty<Measurement<double>>();
return false;
}
}

// Both ratios below are capped at One with Math.Min(): the calculation itself is not an
// atomic operation, so margin errors and various kinds of precision loss are mitigated
// by never reporting more than 1.

// CPU utilization relative to the CPU request, capped at One.
private double CpuUtilizationRequest(double cpuRequest)
{
    double ratio = CpuUtilizationV2() / cpuRequest;
    return Math.Min(One, ratio);
}

// CPU utilization relative to the CPU limit, capped at One.
private double CpuUtilizationLimit(double cpuLimit)
{
    double ratio = CpuUtilizationV2() / cpuLimit;
    return Math.Min(One, ratio);
}

// Lazily produces the measurements for the container CPU time instrument, each tagged
// with a "cpu.mode" value: "system" first, then "user". A measurement is left out when
// TryGetValueWithRetry returns false for its value.
private IEnumerable<Measurement<double>> GetCpuTime()
{
    if (TryGetValueWithRetry(_parser.GetHostCpuUsageInNanoseconds, out long hostCpuNanoseconds))
    {
        // The parser value is in nanoseconds; convert it to seconds before reporting.
        double systemCpuSeconds = hostCpuNanoseconds / NanosecondsInSecond;

        yield return new Measurement<double>(
            systemCpuSeconds,
            [new KeyValuePair<string, object?>("cpu.mode", "system")]);
    }

    // NOTE(review): the "user" value is taken from CpuUtilizationV2 as-is, with no unit
    // conversion. Its name suggests a utilization ratio rather than seconds - confirm
    // that this is the intended source for CPU time.
    if (TryGetValueWithRetry(CpuUtilizationV2, out double userCpuValue))
    {
        yield return new Measurement<double>(
            userCpuValue,
            [new KeyValuePair<string, object?>("cpu.mode", "user")]);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Diagnostics.Metrics;
using System.Threading;
Expand All @@ -17,6 +18,7 @@ internal sealed class WindowsContainerSnapshotProvider : ISnapshotProvider
{
private const double One = 1.0d;
private const double Hundred = 100.0d;
private const double TicksPerSecondDouble = TimeSpan.TicksPerSecond;

private readonly Lazy<MEMORYSTATUSEX> _memoryStatus;

Expand Down Expand Up @@ -85,16 +87,16 @@ internal WindowsContainerSnapshotProvider(

_timeProvider = timeProvider;

using var jobHandle = _createJobHandleObject();
using IJobHandle jobHandle = _createJobHandleObject();

var memoryLimitLong = GetMemoryLimit(jobHandle);
ulong memoryLimitLong = GetMemoryLimit(jobHandle);
_memoryLimit = memoryLimitLong;
_cpuLimit = GetCpuLimit(jobHandle, systemInfo);

// CPU request (aka guaranteed CPU units) is not supported on Windows, so we set it to the same value as CPU limit (aka maximum CPU units).
// Memory request (aka guaranteed memory) is not supported on Windows, so we set it to the same value as memory limit (aka maximum memory).
var cpuRequest = _cpuLimit;
var memoryRequest = memoryLimitLong;
double cpuRequest = _cpuLimit;
ulong memoryRequest = memoryLimitLong;
Resources = new SystemResources(cpuRequest, _cpuLimit, memoryRequest, memoryLimitLong);
_logger.SystemResourcesInfo(_cpuLimit, cpuRequest, memoryLimitLong, memoryRequest);

Expand All @@ -110,10 +112,11 @@ internal WindowsContainerSnapshotProvider(
// We don't dispose the meter because IMeterFactory handles that
// An issue on analyzer side: https://github.com/dotnet/roslyn-analyzers/issues/6912
// Related documentation: https://github.com/dotnet/docs/pull/37170
var meter = meterFactory.Create(ResourceUtilizationInstruments.MeterName);
Meter meter = meterFactory.Create(ResourceUtilizationInstruments.MeterName);
#pragma warning restore CA2000 // Dispose objects before losing scope

// Container based metrics:
_ = meter.CreateObservableCounter(name: ResourceUtilizationInstruments.ContainerCpuTime, observeValues: GetCpuTime, unit: "s", description: "CPU time used by the container.");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: CpuPercentage);
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, observeValue: () => MemoryPercentage(() => _processInfo.GetMemoryUsage()));

Expand Down Expand Up @@ -155,7 +158,7 @@ private static double GetCpuLimit(IJobHandle jobHandle, ISystemInfo systemInfo)
cpuRatio = cpuLimit.CpuRate / CpuCycles;
}

var systemInfoValue = systemInfo.GetSystemInfo();
SYSTEM_INFO systemInfoValue = systemInfo.GetSystemInfo();

// Multiply the CPU ratio by the number of processors to get the portion
// of the system's processors that is used.
Expand All @@ -172,7 +175,7 @@ private ulong GetMemoryLimit(IJobHandle jobHandle)

if (memoryLimitInBytes <= 0)
{
var memoryStatus = _memoryStatus.Value;
MEMORYSTATUSEX memoryStatus = _memoryStatus.Value;

// Technically, the unconstrained limit is memoryStatus.TotalPageFile.
// Leaving this at physical as it is more understandable to consumers.
Expand All @@ -184,7 +187,7 @@ private ulong GetMemoryLimit(IJobHandle jobHandle)

private double MemoryPercentage(Func<ulong> getMemoryUsage)
{
var now = _timeProvider.GetUtcNow();
DateTimeOffset now = _timeProvider.GetUtcNow();

lock (_memoryLocker)
{
Expand All @@ -194,7 +197,7 @@ private double MemoryPercentage(Func<ulong> getMemoryUsage)
}
}

var memoryUsage = getMemoryUsage();
ulong memoryUsage = getMemoryUsage();

lock (_memoryLocker)
{
Expand All @@ -211,6 +214,17 @@ private double MemoryPercentage(Func<ulong> getMemoryUsage)
}
}

// Lazily produces two CPU time measurements from the job object's basic accounting
// information: user time first, then kernel time, each divided by TicksPerSecondDouble
// and tagged with "cpu.mode" ("user" / "system").
private IEnumerable<Measurement<double>> GetCpuTime()
{
    // The job handle is created per call and disposed once enumeration completes.
    using IJobHandle handle = _createJobHandleObject();
    var accounting = handle.GetBasicAccountingInfo();

    yield return new Measurement<double>(
        accounting.TotalUserTime / TicksPerSecondDouble,
        [new KeyValuePair<string, object?>("cpu.mode", "user")]);

    yield return new Measurement<double>(
        accounting.TotalKernelTime / TicksPerSecondDouble,
        [new KeyValuePair<string, object?>("cpu.mode", "system")]);
}

private double CpuPercentage()
{
var now = _timeProvider.GetUtcNow();
Expand Down
8 changes: 8 additions & 0 deletions src/Shared/Instruments/ResourceUtilizationInstruments.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@ internal static class ResourceUtilizationInstruments
/// </summary>
public const string MeterName = "Microsoft.Extensions.Diagnostics.ResourceMonitoring";

/// <summary>
/// The name of an instrument to retrieve CPU time consumed by the specific container on all available CPU cores, measured in seconds.
/// </summary>
/// <remarks>
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableCounter{T}"/>.
/// </remarks>
public const string ContainerCpuTime = "container.cpu.time";

/// <summary>
/// The name of an instrument to retrieve CPU limit consumption of all processes running inside a container or control group in range <c>[0, 1]</c>.
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@ public async Task TestCpuAndMemoryChecks_WithMetrics(
accountingInfoAfter1Ms.TotalUserTime = (long)(utilization * 100);
jobHandleMock.SetupSequence(j => j.GetBasicAccountingInfo())
.Returns(() => initialAccountingInfo) // this is called from the WindowsContainerSnapshotProvider's constructor
.Returns(() => initialAccountingInfo) // this is called from the WindowsContainerSnapshotProvider's GetCpuTime method
.Returns(() => accountingInfoAfter1Ms); // this is called from the WindowsContainerSnapshotProvider's CpuPercentage method

using var meter = new Meter("Microsoft.Extensions.Diagnostics.ResourceMonitoring");
Expand Down
Loading
Loading