Skip to content

[Queue Time Analysis] #6680

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
"startTime": "DateTime64",
"endTime": "DateTime64",
"granularity": "String",
"items": "Array<String>",
"jobNames": "Array<String>",
"workflowNames": "Array<String>",
"machineTypes": "Array<String>",
"runnerLabels": "Array<String>",
"repos": "Array<String>"
},
"tests": []
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
-- Queue-time histogram rollup (ClickHouse).
--
-- Groups rows of fortesting.oss_ci_queue_time_histogram into time buckets of
-- the requested granularity and returns, per bucket:
--   max_queue_time        max of max_queue_time
--   total_count_sum       sum of total_count
--   time_divisor          constant chosen from granularity (see below)
--   avg_queued_job_count  total_count_sum / time_divisor
--   avg_queue_time        avg_queue_time weighted by total_count
--   time                  start of the bucket
--   data                  element-wise sum of the bucket's histograms
--   p50/p90/p20_index     1-based histogram bin index at that quantile
--
-- Parameters:
--   granularity    'half_hour' | 'hour' | 'day' | 'week' | 'month';
--                  any other value behaves like 'half_hour'
--   startTime      exclusive lower bound on time
--   endTime        inclusive upper bound on time
--   repos          always applied (there is no empty-array bypass)
--   jobNames, workflowNames, machineTypes, runnerLabels
--                  optional filters; an empty array disables the filter
WITH selected_data AS (
    SELECT
        multiIf(
            {granularity: String} = 'half_hour', time,
            {granularity: String} = 'hour', dateTrunc('hour', time),
            {granularity: String} = 'day', dateTrunc('day', time),
            {granularity: String} = 'week', dateTrunc('week', time),
            {granularity: String} = 'month', dateTrunc('month', time),
            time -- unknown granularity: keep the row's own timestamp
        ) AS truncated_time,
        -- These values equal the number of 30-minute slots in an hour (2),
        -- a day (48), a week (336) and a 30-day month (1440).
        -- NOTE(review): presumably one source row per 30 minutes - confirm.
        multiIf(
            {granularity: String} = 'half_hour', 1,
            {granularity: String} = 'hour', 2,
            {granularity: String} = 'day', 48,
            {granularity: String} = 'week', 336,
            {granularity: String} = 'month', 1440,
            1
        ) AS time_divisor,
        max(max_queue_time) AS aggr_max_queue_time,
        sum(avg_queue_time * total_count) AS weighted_sum,
        sum(total_count) AS total_count_sum,
        -- One histogram array per source row; merged element-wise below.
        groupArray(histogram) AS al
    FROM fortesting.oss_ci_queue_time_histogram
    WHERE
        time > {startTime: DateTime64}
        AND time <= {endTime: DateTime64}
        AND repo IN ({repos: Array(String)})
        AND (
            {jobNames: Array(String)} = []
            OR job_name IN {jobNames: Array(String)}
        )
        AND (
            {workflowNames: Array(String)} = []
            OR workflow_name IN {workflowNames: Array(String)}
        )
        AND (
            {machineTypes: Array(String)} = []
            OR machine_type IN {machineTypes: Array(String)}
        )
        AND (
            {runnerLabels: Array(String)} = []
            OR hasAny(runner_labels, {runnerLabels: Array(String)})
        )
    GROUP BY truncated_time
),

-- Element-wise sum of all histograms in the bucket, computed once and reused
-- for the data column and for every quantile.
merged AS (
    SELECT
        truncated_time,
        time_divisor,
        aggr_max_queue_time,
        weighted_sum,
        total_count_sum,
        -- NOTE(review): range(1, n) stops at n - 1, so the last histogram bin
        -- is not part of data. Kept as-is because consumers may rely on the
        -- current length - confirm whether dropping it is intended.
        arrayMap(
            i -> arraySum(arrayMap(arr -> arr[i], al)),
            range(1, length(al[1]))
        ) AS data
    FROM selected_data
),

-- Expand the merged histogram into one sample per counted job, where each
-- sample is the 1-based bin index, so quantile() can pick the bin.
expanded AS (
    SELECT
        truncated_time,
        time_divisor,
        aggr_max_queue_time,
        weighted_sum,
        total_count_sum,
        data,
        arrayFlatten(arrayMap(
            -- arrayWithConstant yields exactly bin_count copies of bin_index.
            -- The previous arrayResize([0], n, i) kept the seed element and
            -- produced [0, i, i, ...], injecting a bogus index 0 per bin.
            (bin_count, bin_index) ->
                arrayWithConstant(toUInt32(bin_count), bin_index),
            data,
            arrayEnumerate(data)
        )) AS bin_samples
    FROM merged
)

SELECT
    aggr_max_queue_time AS max_queue_time,
    total_count_sum,
    time_divisor,
    total_count_sum / time_divisor AS avg_queued_job_count,
    weighted_sum / total_count_sum AS avg_queue_time,
    truncated_time AS time,
    data,
    round(arrayReduce('quantile(0.5)', bin_samples)) AS p50_index,
    round(arrayReduce('quantile(0.9)', bin_samples)) AS p90_index,
    round(arrayReduce('quantile(0.2)', bin_samples)) AS p20_index
FROM expanded
ORDER BY time ASC
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ SELECT
arrayDistinct(groupArray(repo)) AS repos,
arrayDistinct(groupArray(workflow_name)) AS workflow_names,
arrayDistinct(groupArray(job_name)) AS job_names,
arrayDistinct(groupArray(machine_type)) AS machine_types
arrayDistinct(groupArray(machine_type)) AS machine_types,
arrayDistinct(arrayFlatten(groupArray(runner_labels))) AS runner_labels
FROM fortesting.oss_ci_queue_time_histogram
WHERE time >= {startTime: DateTime64} AND time < {endTime: DateTime64}
1 change: 1 addition & 0 deletions torchci/components/metrics/pickers/ToggleIconPicker.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export interface ToggleIconPickerContent {
value: string;
icon: JSX.Element;
tooltipContent: string;
charts?: string[];
}

export default function ToggleIconPicker({
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
 * Static explanatory blurb describing how queue data is collected.
 * Takes no props and renders a single styled <div>.
 */
const QueueDataExplanation = () => {
  // Muted, slightly smaller text styling for the blurb container.
  const blurbStyle = { fontSize: "0.9rem", color: "#666", lineHeight: 1.5 };

  return (
    <div style={blurbStyle}>
      <strong>How We Collect Queue Data:</strong>
      <br />
      Every 30 minutes, we capture a snapshot of all jobs that were in the queue
      during that window. This includes:
      <br />- Jobs that <em>were queued and completed</em> before the snapshot.
      <br />- Jobs that <em>are still in the queue</em> at the time of collection.
      <br />
      <br />
      This provides a more complete view of queue activity and wait times.
    </div>
  );
};

export default QueueDataExplanation;
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { Grid2, Stack, styled, Typography } from "@mui/material";
import { Stack, styled, Tooltip, Typography } from "@mui/material";
import { useRouter } from "next/router";
import { useEffect, useReducer, useState } from "react";

Expand All @@ -14,6 +14,9 @@ const FlexNoWrap = styled("div")({
flexWrap: "nowrap",
});

import InfoIcon from "@mui/icons-material/Info"; // Add this import statement
import QueueDataExplanation from "./QueueDataExplanation";

export default function QueueTimeChartPage() {
const router = useRouter();
const [routerReady, setRouterReady] = useState(false);
Expand All @@ -39,8 +42,18 @@ export default function QueueTimeChartPage() {
<Typography variant="caption" color="textSecondary">
* All datetime values are in UTC. <Clock />
</Typography>
<Typography variant="caption" color="textSecondary">
<span>
{" "}
* Data is collected every 30 minutes, including all jobs in queue at
that time.{" "}
</span>
<Tooltip title={<QueueDataExplanation />}>
<InfoIcon fontSize="small" />
</Tooltip>
</Typography>
</Stack>
<Grid2 container spacing={2}>
<div>
<FlexNoWrap>
<div>
<QueueTimeCharts
Expand All @@ -57,7 +70,7 @@ export default function QueueTimeChartPage() {
/>
</div>
</FlexNoWrap>
</Grid2>
</div>
</div>
);
}
Expand Down
Loading