Move meta canary scale-config.yml file to this repository (#6266)

jeanschmidt · web-flow · commit dbde372cc0bc · 2025-02-10T15:05:11.000+01:00
## Added `.github/canary-scale-config.yml` file. 

This file should be the one used by PyTorch Canary in Meta's
environment.

It's been painful to develop/test things using pytorch/pytorch-canary on
Meta's fleet. The main reason for this is the need to add/update the
scale-config.yml file every time a test needs to be executed. This
always generates merge conflicts with pytorch/pytorch and requires
substantial manual work.

The other advantage of having this file here is the clarity of where all
the configuration is, so it is central in a single place for all 4
environments that we have.

Finally, the validation scripts should then ensure the quality of the
code and that all 4 scale-config files are in sync.

## Using the standard `c.<something>`

Using the standard `c.<something>` prefix instead of the
`<something>.canary` suffix for Meta's canary runners is beneficial: it
gives shorter names, a single standard for all our runners, and greater
compatibility with current implementations. In the future, this should
also let us streamline pytorch/pytorch-canary CI to run
with canary runners automatically, without the need for the boring
change of runner definitions in all workflows when opening a PR.
diff --git a/.github/canary-scale-config.yml b/.github/canary-scale-config.yml
@@ -0,0 +1,215 @@
+# canary-scale-config.yml:
+#   Powers what instance types are available for GHA auto-scaled
+#   runners. Runners listed here will be available as self hosted
+#   runners, configuration is directly pulled from the main branch.
+#
+#
+# NOTES:
+#  - Linux runners are by default non-ephemeral to reduce the amount of CreateInstances calls
+#    to avoid RequestLimitExceeded issues
+#  - When updating this file, run the following command to validate the YAML and to generate
+#    corresponding versions of scale-config for the pytorch/pytorch repo and merge the
+#    pytorch/pytorch changes before merging these changes.
+#    `python .github/scripts/validate_scale_config.py --generate`
+#
+# TODO: Add some documentation on how the auto-scaling works
+#
+# NOTE: Default values,
+#
+# runner_types:
+#   runner_label:
+#     instance_type: m4.large
+#     os: linux
+#     # min_available defaults to the global cfg in the ALI Terraform
+#     min_available: undefined
+#     # when max_available value is not defined, no max runners is enforced
+#     max_available: undefined
+#     disk_size: 50
+#     is_ephemeral: true
+
+runner_types:
+  c.linux.8xlarge.amx:
+    disk_size: 200
+    instance_type: m7i-flex.8xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.12xlarge:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.10xlarge.avx2:
+    disk_size: 200
+    instance_type: m4.10xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.24xl.spr-metal:
+    disk_size: 200
+    instance_type: c7i.metal-24xl
+    is_ephemeral: false
+    os: linux
+  c.linux.16xlarge.spr:
+    disk_size: 200
+    instance_type: c7i.16xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.9xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.9xlarge
+    is_ephemeral: true
+    os: linux
+  c.linux.12xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: true
+    os: linux
+  c.linux.16xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.16xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.24xlarge:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.24xlarge.ephemeral:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: true
+    os: linux
+  c.linux.2xlarge:
+    disk_size: 150
+    instance_type: c5.2xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.4xlarge:
+    disk_size: 150
+    instance_type: c5.4xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.4xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.8xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.8xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.g4dn.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.12xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.g4dn.metal.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.metal
+    is_ephemeral: false
+    os: linux
+  c.linux.g5.48xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.48xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.g5.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.12xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.g5.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.g6.4xlarge.experimental.nvidia.gpu:
+    disk_size: 150
+    instance_type: g6.4xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.large:
+    disk_size: 15
+    instance_type: c5.large
+    is_ephemeral: false
+    os: linux
+  c.linux.arm64.2xlarge:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.arm64.m7g.4xlarge:
+    disk_size: 256
+    instance_type: m7g.4xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.arm64.2xlarge.ephemeral:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: true
+    os: linux
+  c.linux.arm64.m7g.4xlarge.ephemeral:
+    disk_size: 256
+    instance_type: m7g.4xlarge
+    is_ephemeral: true
+    os: linux
+  c.linux.arm64.m7g.metal:
+    disk_size: 256
+    instance_type: m7g.metal
+    is_ephemeral: false
+    os: linux
+  c.windows.g4dn.xlarge:
+    disk_size: 256
+    instance_type: g4dn.xlarge
+    is_ephemeral: true
+    os: windows
+  c.windows.g4dn.xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: g4dn.xlarge
+    is_ephemeral: false
+    os: windows
+  c.windows.4xlarge:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: true
+    os: windows
+  c.windows.4xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: false
+    os: windows
+  c.windows.8xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: true
+    os: windows
+  c.windows.8xlarge.nvidia.gpu.nonephemeral:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: false
+    os: windows
+  c.windows.g5.4xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    os: windows
+  c.linux.2xlarge.memory:
+    disk_size: 200
+    instance_type: r5.2xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.4xlarge.memory:
+    disk_size: 300
+    instance_type: r5.4xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.8xlarge.memory:
+    disk_size: 400
+    instance_type: r5.8xlarge
+    is_ephemeral: false
+    os: linux
+  c.linux.12xlarge.memory:
+    disk_size: 600
+    instance_type: r5.12xlarge
+    is_ephemeral: false
+    os: linux
diff --git a/.github/scripts/validate_scale_config.py b/.github/scripts/validate_scale_config.py
@@ -21,12 +21,14 @@
 
 # Paths relative to their respective repositories
 META_SCALE_CONFIG_PATH = ".github/scale-config.yml"
+META_CANARY_SCALE_CONFIG_PATH = ".github/canary-scale-config.yml"
 LF_SCALE_CONFIG_PATH = ".github/lf-scale-config.yml"
 LF_CANARY_SCALE_CONFIG_PATH = ".github/lf-canary-scale-config.yml"
 
 RUNNER_TYPE_CONFIG_KEY = "runner_types"
 
 PREFIX_META = ""
+PREFIX_META_CANARY = "c."
 PREFIX_LF = "lf."
 PREFIX_LF_CANARY = "lf.c."
 
@@ -294,6 +296,10 @@ def main() -> None:
 
     # Contains scale configs that are generated from the source scale config
     generated_scale_config_infos: List[ScaleConfigInfo] = [
+        ScaleConfigInfo(
+            path=repo_root / META_CANARY_SCALE_CONFIG_PATH,
+            prefix=PREFIX_META_CANARY,
+        ),
         ScaleConfigInfo(
             path=repo_root / LF_SCALE_CONFIG_PATH,
             prefix=PREFIX_LF,
diff --git a/torchci/clickhouse_queries/lf_rollover_health/query.sql b/torchci/clickhouse_queries/lf_rollover_health/query.sql
@@ -30,7 +30,8 @@ normalized_jobs AS (
     AND j.status = 'completed'
     AND l != 'self-hosted'
     AND l NOT LIKE 'lf.c.%'
-    AND l NOT LIKE '%canary%'
+    AND l NOT LIKE '%.canary'
+    AND l NOT LIKE 'c.%'
 ),
 lf_jobs AS (
   SELECT DISTINCT
@@ -103,4 +104,4 @@ SELECT
 FROM
   comparison_stats
 ORDER BY
-  bucket DESC, job_name DESC, success_rate_delta, workflow_name
+  bucket DESC, job_name DESC, success_rate_delta, workflow_name
diff --git a/torchci/clickhouse_queries/lf_rollover_percentage/query.sql b/torchci/clickhouse_queries/lf_rollover_percentage/query.sql
@@ -23,7 +23,8 @@ WITH
             AND j.status = 'completed'
             AND l != 'self-hosted'
             AND l NOT LIKE 'lf.c.%'
-            AND l NOT LIKE '%canary%'
+            AND l NOT LIKE '%.canary'
+            AND l NOT LIKE 'c.%'
     ),
     lf_jobs AS (
         SELECT
@@ -87,4 +88,4 @@ WITH
             lf.bucket, lf.lf_fleet, m.lf_fleet
     )
 SELECT * FROM comparison_stats
-ORDER BY  bucket DESC, fleet
+ORDER BY  bucket DESC, fleet