Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix!: host-metrics system.cpu.utilization calculation fix #1741

Merged
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 46 additions & 16 deletions packages/opentelemetry-host-metrics/src/stats/common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,30 +20,56 @@ import { CpuUsageData, MemoryData, ProcessCpuUsageData } from '../types';

const MILLISECOND = 1 / 1e3;
let cpuUsageTime: number | undefined = undefined;
let prevOsData: { time: number; cpus: os.CpuInfo[] };

/**
* It returns cpu load delta from last time - to be used with SumObservers.
* When called first time it will return 0 and then delta will be calculated
* For each CPU returned by `os.cpus()` it returns
* - the CPU times in each state (user, sys, ...) in seconds
* - the % of time the CPU was in each state since last measurement
*
* The first time will return 0 for % masurements since there is not enough
* data to calculate it
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be possible to load cpus at startup and calculate the load from then? It would miss the first few milliseconds, but would avoid having a 0 measurement on each program restart.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would make testing harder but it is possible. I'll give it a try

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Luckily the test is setup in a way that is not affected by this data preload. The changes are pushed :)

*/
export function getCpuUsageData(): CpuUsageData[] {
if (typeof cpuUsageTime !== 'number') {
cpuUsageTime = new Date().getTime() - process.uptime() * 1000;
if (typeof prevOsData !== 'object') {
const time = Date.now();
const cpus = os.cpus();
prevOsData = { time, cpus };

return cpus.map((cpu, cpuNumber) => ({
cpuNumber: String(cpuNumber),
idle: cpu.times.idle * MILLISECOND,
user: cpu.times.user * MILLISECOND,
system: cpu.times.sys * MILLISECOND,
interrupt: cpu.times.irq * MILLISECOND,
nice: cpu.times.nice * MILLISECOND,
userP: 0,
systemP: 0,
idleP: 0,
interruptP: 0,
niceP: 0,
}));
}

const timeElapsed = (new Date().getTime() - cpuUsageTime) / 1000;
const currentTime = Date.now();
const timeElapsed = currentTime - prevOsData.time;
const currentOsData = { time: currentTime, cpus: os.cpus() };

const usageData = currentOsData.cpus.map((cpu, cpuNumber) => {
const prevTimes = prevOsData.cpus[cpuNumber].times;
const currTimes = cpu.times;

return os.cpus().map((cpu, cpuNumber) => {
const idle = cpu.times.idle * MILLISECOND;
const user = cpu.times.user * MILLISECOND;
const system = cpu.times.sys * MILLISECOND;
const interrupt = cpu.times.irq * MILLISECOND;
const nice = cpu.times.nice * MILLISECOND;
const idle = currTimes.idle * MILLISECOND;
const user = currTimes.user * MILLISECOND;
const system = currTimes.sys * MILLISECOND;
const interrupt = currTimes.irq * MILLISECOND;
const nice = currTimes.nice * MILLISECOND;

const idleP = idle / timeElapsed;
const userP = user / timeElapsed;
const systemP = system / timeElapsed;
const interruptP = interrupt / timeElapsed;
const niceP = nice / timeElapsed;
const idleP = (currTimes.idle - prevTimes.idle) / timeElapsed;
const userP = (currTimes.user - prevTimes.user) / timeElapsed;
const systemP = (currTimes.sys - prevTimes.sys) / timeElapsed;
const interruptP = (currTimes.irq - prevTimes.irq) / timeElapsed;
const niceP = (currTimes.nice - prevTimes.nice) / timeElapsed;

return {
cpuNumber: String(cpuNumber),
Expand All @@ -59,6 +85,10 @@ export function getCpuUsageData(): CpuUsageData[] {
niceP,
};
});

prevOsData = currentOsData;

return usageData;
}

/**
Expand Down
33 changes: 20 additions & 13 deletions packages/opentelemetry-host-metrics/test/metric.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ const mockedOS = {
totalmem: function () {
return 1024 * 1024;
},
cpusIdx: 0,
cpus: function () {
return cpuJson[this.cpusIdx++ % 2];
},
};

const INTERVAL = 3000;
Expand Down Expand Up @@ -112,7 +116,7 @@ describe('Host Metrics', () => {
return mockedOS.freemem();
});
sandbox.stub(os, 'totalmem').returns(mockedOS.totalmem());
sandbox.stub(os, 'cpus').returns(cpuJson);
sandbox.stub(os, 'cpus').callsFake(() => mockedOS.cpus());
sandbox.stub(process, 'uptime').returns(0);
sandbox.stub(SI, 'networkStats').callsFake(() => {
return mockedSI.networkStats();
Expand Down Expand Up @@ -146,6 +150,9 @@ describe('Host Metrics', () => {
await reader.collect();
dateStub.returns(process.uptime() * 1000 + INTERVAL);

// advance the clock for the next collection
sandbox.clock.tick(1000);

// invalidates throttles
countSI = 0;
});
Expand All @@ -156,31 +163,31 @@ describe('Host Metrics', () => {
it('should export CPU time metrics', async () => {
const metric = await getRecords(reader, 'system.cpu.time');

ensureValue(metric, { state: 'user', cpu: '0' }, 90713.56);
ensureValue(metric, { state: 'system', cpu: '0' }, 63192.630000000005);
ensureValue(metric, { state: 'idle', cpu: '0' }, 374870.7);
ensureValue(metric, { state: 'user', cpu: '0' }, 90714.26);
dyladan marked this conversation as resolved.
Show resolved Hide resolved
ensureValue(metric, { state: 'system', cpu: '0' }, 63192.83);
ensureValue(metric, { state: 'idle', cpu: '0' }, 374870.8);
ensureValue(metric, { state: 'interrupt', cpu: '0' }, 0);
ensureValue(metric, { state: 'nice', cpu: '0' }, 0);

ensureValue(metric, { state: 'user', cpu: '1' }, 11005.42);
ensureValue(metric, { state: 'system', cpu: '1' }, 7678.12);
ensureValue(metric, { state: 'idle', cpu: '1' }, 510034.8);
ensureValue(metric, { state: 'user', cpu: '1' }, 11005.72);
ensureValue(metric, { state: 'system', cpu: '1' }, 7678.62);
ensureValue(metric, { state: 'idle', cpu: '1' }, 510035);
ensureValue(metric, { state: 'interrupt', cpu: '1' }, 0);
ensureValue(metric, { state: 'nice', cpu: '1' }, 0);
});

it('should export CPU utilization metrics', async () => {
const metric = await getRecords(reader, 'system.cpu.utilization');

ensureValue(metric, { state: 'user', cpu: '0' }, 30247.935978659552);
ensureValue(metric, { state: 'system', cpu: '0' }, 21071.23374458153);
ensureValue(metric, { state: 'idle', cpu: '0' }, 124998.56618872957);
ensureValue(metric, { state: 'user', cpu: '0' }, 0.7);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the value rounded or is this just a coincidence that they're all 1 significant figure?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not a coincidence. The data is mocked with packages/opentelemetry-host-metrics/test/mocks/cpu.json.

ensureValue(metric, { state: 'system', cpu: '0' }, 0.2);
ensureValue(metric, { state: 'idle', cpu: '0' }, 0.1);
ensureValue(metric, { state: 'interrupt', cpu: '0' }, 0);
ensureValue(metric, { state: 'nice', cpu: '0' }, 0);

ensureValue(metric, { state: 'user', cpu: '1' }, 3669.6965655218405);
ensureValue(metric, { state: 'system', cpu: '1' }, 2560.2267422474156);
ensureValue(metric, { state: 'idle', cpu: '1' }, 170068.28942980993);
ensureValue(metric, { state: 'user', cpu: '1' }, 0.3);
ensureValue(metric, { state: 'system', cpu: '1' }, 0.5);
ensureValue(metric, { state: 'idle', cpu: '1' }, 0.2);
ensureValue(metric, { state: 'interrupt', cpu: '1' }, 0);
ensureValue(metric, { state: 'nice', cpu: '1' }, 0);
});
Expand Down
76 changes: 54 additions & 22 deletions packages/opentelemetry-host-metrics/test/mocks/cpu.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,54 @@
[{
"model": "CPU @ 2.60GHz",
"speed": 2600,
"times": {
"user": 90713560,
"nice": 0,
"sys": 63192630,
"idle": 374870700,
"irq": 0
}
}, {
"model": "CPU @ 2.60GHz",
"speed": 2600,
"times": {
"user": 11005420,
"nice": 0,
"sys": 7678120,
"idle": 510034800,
"irq": 0
}
}
]
[
[
{
"model": "CPU @ 2.60GHz",
"speed": 2600,
"// times": "this is the 1st collect of CPU data which will be used to calc utilization",
"times": {
"user": 90713560,
"nice": 0,
"sys": 63192630,
"idle": 374870700,
"irq": 0
}
},
{
"model": "CPU @ 2.60GHz",
"speed": 2600,
"// times": "this is the 1st collect of CPU data which will be used to calc utilization",
"times": {
"user": 11005420,
"nice": 0,
"sys": 7678120,
"idle": 510034800,
"irq": 0
}
}
],
[
{
"model": "CPU @ 2.60GHz",
"speed": 2600,
"// times": "assuming a time delta of 1000ms we distribute 70% to user, 20% to sys and 10% to idle",
"times": {
"user": 90714260,
"nice": 0,
"sys": 63192830,
"idle": 374870800,
"irq": 0
}
},
{
"model": "CPU @ 2.60GHz",
"speed": 2600,
"// times": "assuming a time delta of 1000ms we distribute 30% to user, 50% to sys and 20% to idle",
"times": {
"user": 11005720,
"nice": 0,
"sys": 7678620,
"idle": 510035000,
"irq": 0
}
}
]
]