Commit aab94e1

Migrate lf rollover percentage query to CH (#5847)

Removed references to the obsolete amz2023 runner prefixes.

Validation: ensured that the CH and Rockset versions of the query return the same data and that the charts look the same.

1 parent d137d3a commit aab94e1

File tree: 4 files changed, +106 -9 lines changed

Lines changed: 1 addition & 2 deletions

@@ -1,4 +1,3 @@
 {
-    "days_ago": "Int64",
-    "granularity": "String"
+    "days_ago": "Int64"
 }

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+{
+    "days_ago": "Int64"
+}
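
Each params file declares the named parameters the query expects and their ClickHouse types; the Rockset-era granularity parameter is dropped because the new query buckets by hour itself. As a rough sketch (not part of the commit) of how such a parameter is bound when poking at the query by hand, assuming an interactive clickhouse-client session and the workflow_job table referenced by the query below:

-- Minimal sketch, not part of the commit: bind the parameter declared in the
-- params file, then reference it with the {name: Type} placeholder syntax.
SET param_days_ago = 7;

SELECT count(*)
FROM workflow_job
WHERE created_at > now() - INTERVAL {days_ago: Int64} DAY;

From a shell, clickhouse-client accepts the same binding via --param_days_ago.
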
Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
+WITH
+    normalized_jobs AS (
+        SELECT
+            l AS label,
+            extract(j.name, '[^,]*') AS job_name, -- Remove shard number and label from job names
+            j.workflow_name,
+            toStartOfInterval(j.started_at, INTERVAL 1 HOUR) AS bucket
+        FROM
+            -- Deliberately not adding FINAL to this workflow_job.
+            -- Risks of not using it:
+            --   - You may get duplicate records for rows that were updated, corresponding to their
+            --     before/after states, but as long as there's some mechanism in the query to account
+            --     for that it's okay (we check for j.status = 'completed').
+            --   - In the worst case scenario, you may only see the 'old' version of the records for some rows
+            -- Costs of using it:
+            --   - Query processing time increases from ~5 -> 16 seconds
+            --   - Memory usage grows from 7.5 GB -> 32 GB
+            -- So skipping FINAL is the better tradeoff for this query.
+            workflow_job AS j
+        ARRAY JOIN j.labels as l
+        WHERE
+            j.created_at > now() - INTERVAL {days_ago: Int64} DAY
+            AND j.status = 'completed'
+            AND l != 'self-hosted'
+            AND l NOT LIKE 'lf.c.%'
+            AND l NOT LIKE '%canary%'
+    ),
+    lf_jobs AS (
+        SELECT
+            DISTINCT j.job_name
+        FROM
+            normalized_jobs AS j
+        WHERE
+            j.label LIKE 'lf%'
+    ),
+    -- Filter jobs down to the ones that ran in both
+    -- the LF and Meta fleets
+    comparable_jobs AS (
+        SELECT
+            j.bucket,
+            j.label,
+            j.job_name,
+            -- Shard number and label were already removed from job names above
+            j.workflow_name
+        FROM
+            normalized_jobs AS j
+        INNER JOIN
+            lf_jobs AS lfj ON j.job_name = lfj.job_name
+    ),
+    success_stats AS (
+        SELECT
+            bucket,
+            count(*) AS group_size,
+            job_name,
+            workflow_name,
+            label,
+            if(substring(label, 1, 3) = 'lf.', True, False) AS lf_fleet
+        FROM
+            comparable_jobs
+        GROUP BY
+            bucket, job_name, workflow_name, label
+    ),
+    comparison_stats AS (
+        SELECT
+            lf.bucket,
+            SUM(lf.group_size + m.group_size) AS total_jobs,
+            SUM(m.group_size) AS compliment_jobs,
+            SUM(lf.group_size) AS counted_jobs,
+            m.lf_fleet AS c_fleet,
+            lf.lf_fleet AS m_fleet,
+            CAST(SUM(lf.group_size) AS Float32) / SUM(lf.group_size + m.group_size) * 100 AS percentage,
+            IF(lf.lf_fleet, 'Linux Foundation', 'Meta') AS fleet
+        FROM
+            success_stats AS lf
+        INNER JOIN
+            success_stats AS m ON lf.bucket = m.bucket
+        WHERE
+            lf.job_name = m.job_name
+            AND lf.workflow_name = m.workflow_name
+            AND (
+                (lf.lf_fleet = 1 AND m.lf_fleet = 0)
+                OR (lf.lf_fleet = 0 AND m.lf_fleet = 1)
+            )
+            AND lf.group_size > 3
+            AND m.group_size > 3
+        GROUP BY
+            lf.bucket, lf.lf_fleet, m.lf_fleet
+    )
+SELECT * FROM comparison_stats
+ORDER BY bucket DESC, fleet
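
The comparison_stats CTE pairs, per hourly bucket, each job's Linux Foundation group with its Meta group and reports each side's share of the combined runs; pairs where either side has 3 or fewer runs are dropped so tiny samples do not swing the chart. A toy illustration with invented numbers (not from the dashboard): 30 LF runs and 10 Meta runs in the same bucket give 75% for the LF row and 25% for the Meta row.

-- Toy illustration only, mirroring the percentage arithmetic above
-- with invented group sizes (30 LF runs vs 10 Meta runs in one bucket).
SELECT
    CAST(30 AS Float32) / (30 + 10) * 100 AS lf_percentage,   -- 75
    CAST(10 AS Float32) / (30 + 10) * 100 AS meta_percentage  -- 25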

torchci/pages/metrics.tsx

Lines changed: 12 additions & 7 deletions

@@ -1230,18 +1230,23 @@ export default function Page() {
               title={"Percentage of jobs rolled over to Linux Foundation"}
               queryName={"lf_rollover_percentage"}
               queryCollection={"metrics"}
-              queryParams={[
-                {
-                  name: "days_ago",
-                  type: "int",
-                  value: timeRange,
-                },
-              ]}
+              queryParams={
+                useClickHouse
+                  ? { ...timeParamsClickHouse, days_ago: timeRange }
+                  : [
+                      {
+                        name: "days_ago",
+                        type: "int",
+                        value: timeRange,
+                      },
+                    ]
+              }
               granularity={"hour"}
               timeFieldName={"bucket"}
               yAxisFieldName={"percentage"}
               groupByFieldName={"fleet"}
               yAxisRenderer={(value) => value.toFixed(2).toString() + "%"}
+              useClickHouse={useClickHouse}
             />
           </Grid>
         </Grid>
