Skip to content

Commit dbde372

Browse files
authored
Move meta canary scale-config.yml file to this repository (#6266)
## Added `.github/canary-scale-config.yml` file. This file should be the one used by PyTorch Canary in Meta's environment. It's been painful to develop/test things using pytorch/pytorch-canary on Meta's fleet. The main reason for this is the need to add/update the scale-config.yml file every time a test needs to be executed. This always generates merge conflicts with pytorch/pytorch and requires substantial manual work. The other advantage of having this file here is the clarity of where all the configuration is: it is centralized in a single place for all 4 environments that we have. Finally, the validation scripts should then ensure the quality of the code and that all 4 scale-config files are in sync. ## Using the standard `c.<something>` Using the standard `c.<something>` prefix instead of the `<something>.canary` suffix for Meta's canary runners is beneficial: shorter names, a single standard for all our runners, and greater compatibility with current implementations. In the future, this should also make it possible to streamline pytorch/pytorch-canary CI to run on canary runners automatically, without the tedious change of runner definitions in all workflows when opening a PR.
1 parent 1faa4b5 commit dbde372

File tree

4 files changed

+227
-4
lines changed

4 files changed

+227
-4
lines changed

.github/canary-scale-config.yml

+215
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
# canary-scale-config.yml:
2+
# Powers what instance types are available for GHA auto-scaled
3+
# runners. Runners listed here will be available as self hosted
4+
# runners, configuration is directly pulled from the main branch.
5+
#
6+
#
7+
# NOTES:
8+
# - Linux runners are by default non-ephemeral to reduce the amount of CreateInstances calls
9+
# to avoid RequestLimitExceeded issues
10+
# - When updating this file, run the following command to validate the YAML and to generate
11+
# corresponding versions of scale-config for the pytorch/pytorch repo and merge the
12+
# pytorch/pytorch changes before merging these changes.
13+
# `python .github/scripts/validate_scale_config.py --generate`
14+
#
15+
# TODO: Add some documentation on how the auto-scaling works
16+
#
17+
# NOTE: Default values:
18+
#
19+
# runner_types:
20+
# runner_label:
21+
# instance_type: m4.large
22+
# os: linux
23+
# # min_available defaults to the global cfg in the ALI Terraform
24+
# min_available: undefined
25+
# # when the max_available value is not defined, no maximum number of runners is enforced
26+
# max_available: undefined
27+
# disk_size: 50
28+
# is_ephemeral: true
29+
30+
runner_types:
31+
c.linux.8xlarge.amx:
32+
disk_size: 200
33+
instance_type: m7i-flex.8xlarge
34+
is_ephemeral: false
35+
os: linux
36+
c.linux.12xlarge:
37+
disk_size: 200
38+
instance_type: c5.12xlarge
39+
is_ephemeral: false
40+
os: linux
41+
c.linux.10xlarge.avx2:
42+
disk_size: 200
43+
instance_type: m4.10xlarge
44+
is_ephemeral: false
45+
os: linux
46+
c.linux.24xl.spr-metal:
47+
disk_size: 200
48+
instance_type: c7i.metal-24xl
49+
is_ephemeral: false
50+
os: linux
51+
c.linux.16xlarge.spr:
52+
disk_size: 200
53+
instance_type: c7i.16xlarge
54+
is_ephemeral: false
55+
os: linux
56+
c.linux.9xlarge.ephemeral:
57+
disk_size: 200
58+
instance_type: c5.9xlarge
59+
is_ephemeral: true
60+
os: linux
61+
c.linux.12xlarge.ephemeral:
62+
disk_size: 200
63+
instance_type: c5.12xlarge
64+
is_ephemeral: true
65+
os: linux
66+
c.linux.16xlarge.nvidia.gpu:
67+
disk_size: 150
68+
instance_type: g3.16xlarge
69+
is_ephemeral: false
70+
os: linux
71+
c.linux.24xlarge:
72+
disk_size: 150
73+
instance_type: c5.24xlarge
74+
is_ephemeral: false
75+
os: linux
76+
c.linux.24xlarge.ephemeral:
77+
disk_size: 150
78+
instance_type: c5.24xlarge
79+
is_ephemeral: true
80+
os: linux
81+
c.linux.2xlarge:
82+
disk_size: 150
83+
instance_type: c5.2xlarge
84+
is_ephemeral: false
85+
os: linux
86+
c.linux.4xlarge:
87+
disk_size: 150
88+
instance_type: c5.4xlarge
89+
is_ephemeral: false
90+
os: linux
91+
c.linux.4xlarge.nvidia.gpu:
92+
disk_size: 150
93+
instance_type: g3.4xlarge
94+
is_ephemeral: false
95+
os: linux
96+
c.linux.8xlarge.nvidia.gpu:
97+
disk_size: 150
98+
instance_type: g3.8xlarge
99+
is_ephemeral: false
100+
os: linux
101+
c.linux.g4dn.12xlarge.nvidia.gpu:
102+
disk_size: 150
103+
instance_type: g4dn.12xlarge
104+
is_ephemeral: false
105+
os: linux
106+
c.linux.g4dn.metal.nvidia.gpu:
107+
disk_size: 150
108+
instance_type: g4dn.metal
109+
is_ephemeral: false
110+
os: linux
111+
c.linux.g5.48xlarge.nvidia.gpu:
112+
disk_size: 150
113+
instance_type: g5.48xlarge
114+
is_ephemeral: false
115+
os: linux
116+
c.linux.g5.12xlarge.nvidia.gpu:
117+
disk_size: 150
118+
instance_type: g5.12xlarge
119+
is_ephemeral: false
120+
os: linux
121+
c.linux.g5.4xlarge.nvidia.gpu:
122+
disk_size: 150
123+
instance_type: g5.4xlarge
124+
is_ephemeral: false
125+
os: linux
126+
c.linux.g6.4xlarge.experimental.nvidia.gpu:
127+
disk_size: 150
128+
instance_type: g6.4xlarge
129+
is_ephemeral: false
130+
os: linux
131+
c.linux.large:
132+
disk_size: 15
133+
instance_type: c5.large
134+
is_ephemeral: false
135+
os: linux
136+
c.linux.arm64.2xlarge:
137+
disk_size: 256
138+
instance_type: t4g.2xlarge
139+
is_ephemeral: false
140+
os: linux
141+
c.linux.arm64.m7g.4xlarge:
142+
disk_size: 256
143+
instance_type: m7g.4xlarge
144+
is_ephemeral: false
145+
os: linux
146+
c.linux.arm64.2xlarge.ephemeral:
147+
disk_size: 256
148+
instance_type: t4g.2xlarge
149+
is_ephemeral: true
150+
os: linux
151+
c.linux.arm64.m7g.4xlarge.ephemeral:
152+
disk_size: 256
153+
instance_type: m7g.4xlarge
154+
is_ephemeral: true
155+
os: linux
156+
c.linux.arm64.m7g.metal:
157+
disk_size: 256
158+
instance_type: m7g.metal
159+
is_ephemeral: false
160+
os: linux
161+
c.windows.g4dn.xlarge:
162+
disk_size: 256
163+
instance_type: g4dn.xlarge
164+
is_ephemeral: true
165+
os: windows
166+
c.windows.g4dn.xlarge.nonephemeral:
167+
disk_size: 256
168+
instance_type: g4dn.xlarge
169+
is_ephemeral: false
170+
os: windows
171+
c.windows.4xlarge:
172+
disk_size: 256
173+
instance_type: c5d.4xlarge
174+
is_ephemeral: true
175+
os: windows
176+
c.windows.4xlarge.nonephemeral:
177+
disk_size: 256
178+
instance_type: c5d.4xlarge
179+
is_ephemeral: false
180+
os: windows
181+
c.windows.8xlarge.nvidia.gpu:
182+
disk_size: 256
183+
instance_type: p3.2xlarge
184+
is_ephemeral: true
185+
os: windows
186+
c.windows.8xlarge.nvidia.gpu.nonephemeral:
187+
disk_size: 256
188+
instance_type: p3.2xlarge
189+
is_ephemeral: false
190+
os: windows
191+
c.windows.g5.4xlarge.nvidia.gpu:
192+
disk_size: 256
193+
instance_type: g5.4xlarge
194+
is_ephemeral: false
195+
os: windows
196+
c.linux.2xlarge.memory:
197+
disk_size: 200
198+
instance_type: r5.2xlarge
199+
is_ephemeral: false
200+
os: linux
201+
c.linux.4xlarge.memory:
202+
disk_size: 300
203+
instance_type: r5.4xlarge
204+
is_ephemeral: false
205+
os: linux
206+
c.linux.8xlarge.memory:
207+
disk_size: 400
208+
instance_type: r5.8xlarge
209+
is_ephemeral: false
210+
os: linux
211+
c.linux.12xlarge.memory:
212+
disk_size: 600
213+
instance_type: r5.12xlarge
214+
is_ephemeral: false
215+
os: linux

.github/scripts/validate_scale_config.py

+6
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@
2121

2222
# Paths relative to their respective repositories
2323
META_SCALE_CONFIG_PATH = ".github/scale-config.yml"
24+
META_CANARY_SCALE_CONFIG_PATH = ".github/canary-scale-config.yml"
2425
LF_SCALE_CONFIG_PATH = ".github/lf-scale-config.yml"
2526
LF_CANARY_SCALE_CONFIG_PATH = ".github/lf-canary-scale-config.yml"
2627

2728
RUNNER_TYPE_CONFIG_KEY = "runner_types"
2829

2930
PREFIX_META = ""
31+
PREFIX_META_CANARY = "c."
3032
PREFIX_LF = "lf."
3133
PREFIX_LF_CANARY = "lf.c."
3234

@@ -294,6 +296,10 @@ def main() -> None:
294296

295297
# Contains scale configs that are generated from the source scale config
296298
generated_scale_config_infos: List[ScaleConfigInfo] = [
299+
ScaleConfigInfo(
300+
path=repo_root / META_CANARY_SCALE_CONFIG_PATH,
301+
prefix=PREFIX_META_CANARY,
302+
),
297303
ScaleConfigInfo(
298304
path=repo_root / LF_SCALE_CONFIG_PATH,
299305
prefix=PREFIX_LF,

torchci/clickhouse_queries/lf_rollover_health/query.sql

+3-2
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ normalized_jobs AS (
3030
AND j.status = 'completed'
3131
AND l != 'self-hosted'
3232
AND l NOT LIKE 'lf.c.%'
33-
AND l NOT LIKE '%canary%'
33+
AND l NOT LIKE '%.canary'
34+
AND l NOT LIKE 'c.%'
3435
),
3536
lf_jobs AS (
3637
SELECT DISTINCT
@@ -103,4 +104,4 @@ SELECT
103104
FROM
104105
comparison_stats
105106
ORDER BY
106-
bucket DESC, job_name DESC, success_rate_delta, workflow_name
107+
bucket DESC, job_name DESC, success_rate_delta, workflow_name

torchci/clickhouse_queries/lf_rollover_percentage/query.sql

+3-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ WITH
2323
AND j.status = 'completed'
2424
AND l != 'self-hosted'
2525
AND l NOT LIKE 'lf.c.%'
26-
AND l NOT LIKE '%canary%'
26+
AND l NOT LIKE '%.canary'
27+
AND l NOT LIKE 'c.%'
2728
),
2829
lf_jobs AS (
2930
SELECT
@@ -87,4 +88,4 @@ WITH
8788
lf.bucket, lf.lf_fleet, m.lf_fleet
8889
)
8990
SELECT * FROM comparison_stats
90-
ORDER BY bucket DESC, fleet
91+
ORDER BY bucket DESC, fleet

0 commit comments

Comments
 (0)