Commit aab94e1

Migrate lf rollover percentage query to CH (#5847)

Removed references to the obsolete amz2023 runner prefixes.

Validation: ensured that the CH and Rockset versions of the query return the same data and that the charts look the same.

1 parent d137d3a commit aab94e1

File tree: 4 files changed, +106 -9 lines changed

Lines changed: 1 addition & 2 deletions

@@ -1,4 +1,3 @@
 {
-    "days_ago": "Int64",
-    "granularity": "String"
+    "days_ago": "Int64"
 }

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+{
+    "days_ago": "Int64"
+}
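
Each params file declares the named parameters the query expects and their ClickHouse types; the Rockset-era granularity parameter is dropped because the new query buckets by hour itself. As a rough sketch (not part of the commit) of how such a parameter is bound when poking at the query by hand, assuming an interactive clickhouse-client session and the workflow_job table referenced by the query below:

-- Minimal sketch, not part of the commit: bind the parameter declared in the
-- params file, then reference it with the {name: Type} placeholder syntax.
SET param_days_ago = 7;

SELECT count(*)
FROM workflow_job
WHERE created_at > now() - INTERVAL {days_ago: Int64} DAY;

From a shell, clickhouse-client accepts the same binding via --param_days_ago.
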
Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
+WITH
+    normalized_jobs AS (
+        SELECT
+            l AS label,
+            extract(j.name, '[^,]*') AS job_name, -- Remove shard number and label from job names
+            j.workflow_name,
+            toStartOfInterval(j.started_at, INTERVAL 1 HOUR) AS bucket
+        FROM
+            -- Deliberately not adding FINAL to this workflow_job.
+            -- Risks of not using it:
+            --   - You may get duplicate records for rows that were updated, corresponding to their
+            --     before/after states, but as long as there's some mechanism in the query to account
+            --     for that it's okay (we check for j.status = 'completed').
+            --   - In the worst case scenario, you may only see the 'old' version of the records for some rows
+            -- Costs of using it:
+            --   - Query processing time increases from ~5 -> 16 seconds
+            --   - Memory usage grows from 7.5 GB -> 32 GB
+            -- So skipping FINAL is the better tradeoff for this query.
+            workflow_job AS j
+        ARRAY JOIN j.labels as l
+        WHERE
+            j.created_at > now() - INTERVAL {days_ago: Int64} DAY
+            AND j.status = 'completed'
+            AND l != 'self-hosted'
+            AND l NOT LIKE 'lf.c.%'
+            AND l NOT LIKE '%canary%'
+    ),
+    lf_jobs AS (
+        SELECT
+            DISTINCT j.job_name
+        FROM
+            normalized_jobs AS j
+        WHERE
+            j.label LIKE 'lf%'
+    ),
+    -- Filter jobs down to the ones that ran in both
+    -- the LF and Meta fleets
+    comparable_jobs AS (
+        SELECT
+            j.bucket,
+            j.label,
+            j.job_name,
+            -- Shard number and label were already removed from job names above
+            j.workflow_name
+        FROM
+            normalized_jobs AS j
+        INNER JOIN
+            lf_jobs AS lfj ON j.job_name = lfj.job_name
+    ),
+    success_stats AS (
+        SELECT
+            bucket,
+            count(*) AS group_size,
+            job_name,
+            workflow_name,
+            label,
+            if(substring(label, 1, 3) = 'lf.', True, False) AS lf_fleet
+        FROM
+            comparable_jobs
+        GROUP BY
+            bucket, job_name, workflow_name, label
+    ),
+    comparison_stats AS (
+        SELECT
+            lf.bucket,
+            SUM(lf.group_size + m.group_size) AS total_jobs,
+            SUM(m.group_size) AS compliment_jobs,
+            SUM(lf.group_size) AS counted_jobs,
+            m.lf_fleet AS c_fleet,
+            lf.lf_fleet AS m_fleet,
+            CAST(SUM(lf.group_size) AS Float32) / SUM(lf.group_size + m.group_size) * 100 AS percentage,
+            IF(lf.lf_fleet, 'Linux Foundation', 'Meta') AS fleet
+        FROM
+            success_stats AS lf
+        INNER JOIN
+            success_stats AS m ON lf.bucket = m.bucket
+        WHERE
+            lf.job_name = m.job_name
+            AND lf.workflow_name = m.workflow_name
+            AND (
+                (lf.lf_fleet = 1 AND m.lf_fleet = 0)
+                OR (lf.lf_fleet = 0 AND m.lf_fleet = 1)
+            )
+            AND lf.group_size > 3
+            AND m.group_size > 3
+        GROUP BY
+            lf.bucket, lf.lf_fleet, m.lf_fleet
+    )
+SELECT * FROM comparison_stats
+ORDER BY bucket DESC, fleet
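
The comparison_stats CTE pairs, per hourly bucket, each job's Linux Foundation group with its Meta group and reports each side's share of the combined runs; pairs where either side has 3 or fewer runs are dropped so tiny samples do not swing the chart. A toy illustration with invented numbers (not from the dashboard): 30 LF runs and 10 Meta runs in the same bucket give 75% for the LF row and 25% for the Meta row.

-- Toy illustration only, mirroring the percentage arithmetic above
-- with invented group sizes (30 LF runs vs 10 Meta runs in one bucket).
SELECT
    CAST(30 AS Float32) / (30 + 10) * 100 AS lf_percentage,   -- 75
    CAST(10 AS Float32) / (30 + 10) * 100 AS meta_percentage  -- 25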

torchci/pages/metrics.tsx

Lines changed: 12 additions & 7 deletions

@@ -1230,18 +1230,23 @@ export default function Page() {
               title={"Percentage of jobs rolled over to Linux Foundation"}
               queryName={"lf_rollover_percentage"}
               queryCollection={"metrics"}
-              queryParams={[
-                {
-                  name: "days_ago",
-                  type: "int",
-                  value: timeRange,
-                },
-              ]}
+              queryParams={
+                useClickHouse
+                  ? { ...timeParamsClickHouse, days_ago: timeRange }
+                  : [
+                      {
+                        name: "days_ago",
+                        type: "int",
+                        value: timeRange,
+                      },
+                    ]
+              }
               granularity={"hour"}
               timeFieldName={"bucket"}
               yAxisFieldName={"percentage"}
               groupByFieldName={"fleet"}
               yAxisRenderer={(value) => value.toFixed(2).toString() + "%"}
+              useClickHouse={useClickHouse}
             />
           </Grid>
         </Grid>
