From f3c27c3147afae3ae3060931145ea55619460503 Mon Sep 17 00:00:00 2001 From: clee2000 <44682903+clee2000@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:46:14 -0800 Subject: [PATCH] Improve master_commit_red query performance (#6174) Pre filter the commits so we can filter the workflow job and workflow run tables on it later This improves speed for all time ranges up to 1 year. I did not check beyond that I believe the memory used is about the same, but it scans more rows for some reason This is the query behind this chart image on the metrics page Tentative wins, sample size 3 ``` +------+----------+-----------+-------------+---------------+--------------+---------------+----------------+--------------+ | Test | Avg Time | Base Time | Time Change | % Time Change | Avg Mem | Base Mem | Mem Change | % Mem Change | +------+----------+-----------+-------------+---------------+--------------+---------------+----------------+--------------+ | 0 | 1163 | 23117 | -21954 | -95 | 157241294.6 | 1539051189.75 | -1381809895.15 | -90 | | 1 | 1281 | 14509 | -13228 | -91 | 350569557 | 1585082143.8 | -1234512586.8 | -78 | | 2 | 8704 | 22954 | -14250 | -62 | 1774302065.6 | 3929546293 | -2155244227.4 | -55 | +------+----------+-----------+-------------+---------------+--------------+---------------+----------------+--------------+ ``` --- .../master_commit_red/query.sql | 57 ++++++++++++------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/torchci/clickhouse_queries/master_commit_red/query.sql b/torchci/clickhouse_queries/master_commit_red/query.sql index c53640343b..51988e5775 100644 --- a/torchci/clickhouse_queries/master_commit_red/query.sql +++ b/torchci/clickhouse_queries/master_commit_red/query.sql @@ -1,8 +1,41 @@ --- This query is used to show the histogram of trunk red commits on HUD metrics page --- during a period of time -WITH all_jobs AS ( +-- Split up the query into multiple CTEs to make it faster. +with commits as ( + select + push.head_commit.'timestamp' as time, + push.head_commit.'id' as sha + from + -- Not using final since push table doesn't really get updated + push + where + push.ref in ('refs/heads/master', 'refs/heads/main') + and push.repository.'owner'.'name' = 'pytorch' + and push.repository.'name' = 'pytorch' + and push.head_commit.'timestamp' >= {startTime: DateTime64(3)} + and push.head_commit.'timestamp' < {stopTime: DateTime64(3)} +), +all_runs as ( + select + workflow_run.id as id, + workflow_run.head_commit.'id' as sha, + workflow_run.name as name, + commit.time as time + from + workflow_run final + join commits commit on workflow_run.head_commit.'id' = commit.sha + where + ( + -- Limit it to workflows which block viable/strict upgrades + workflow_run.name in ('Lint', 'pull', 'trunk') + OR workflow_run.name like 'linux-binary%' + ) + AND workflow_run.event != 'workflow_run' -- Filter out workflow_run-triggered jobs, which have nothing to do with the SHA + and workflow_run.id in (select id from materialized_views.workflow_run_by_head_sha where head_sha in (select sha from commits)) +), +all_jobs AS ( SELECT - push.head_commit.'timestamp' AS time, + all_runs.time AS time, CASE WHEN job.conclusion = 'failure' THEN 'red' WHEN job.conclusion = 'timed_out' THEN 'red' @@ -10,29 +43,15 @@ WITH all_jobs AS ( WHEN job.conclusion = '' THEN 'pending' ELSE 'green' END as conclusion, - push.head_commit.'id' AS sha + all_runs.sha AS sha FROM - workflow_job job FINAL - JOIN workflow_run FINAL ON workflow_run.id = workflow_job.run_id - JOIN push FINAL ON workflow_run.head_commit.'id' = push.head_commit.'id' + default.workflow_job job final join all_runs all_runs on all_runs.id = workflow_job.run_id WHERE job.name != 'ciflow_should_run' AND job.name != 'generate-test-matrix' - AND ( - -- Limit it to workflows which block viable/strict upgrades - workflow_run.name in ('Lint', 'pull', 'trunk') - OR workflow_run.name like 'linux-binary%' - ) AND job.name NOT LIKE '%rerun_disabled_tests%' AND job.name NOT LIKE '%unstable%' - AND workflow_run.event != 'workflow_run' -- Filter out workflow_run-triggered jobs, which have nothing to do with the SHA - AND push.ref IN ( - 'refs/heads/master', 'refs/heads/main' - ) - AND push.repository.'owner'.'name' = 'pytorch' - AND push.repository.'name' = 'pytorch' - AND push.head_commit.'timestamp' >= {startTime: DateTime64(3)} - AND push.head_commit.'timestamp' < {stopTime: DateTime64(3)} + and job.id in (select id from materialized_views.workflow_job_by_head_sha where head_sha in (select sha from commits)) ), commit_overall_conclusion AS ( SELECT