From 39d9f9bf742adc7c619cc921a1def8b68ff6ac9d Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sat, 17 May 2025 18:54:51 -0700 Subject: [PATCH 1/6] feat: parametrize GPUS_PER_NODE and CPUS_PER_WORKER in ray.sub Signed-off-by: Terry Kong --- docs/cluster.md | 17 +++++++++++++++-- ray.sub | 12 +++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/cluster.md b/docs/cluster.md index cfac258c8..7e2abe935 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -25,8 +25,21 @@ sbatch \ ray.sub ``` -Notes: -* Some clusters may or may not need `--gres=gpu:8` to be added to the `sbatch` command. +```{tip} +Some Slurm clusters may or may not need `--gres=gpu:8` to be added to the `sbatch` command. +``` + +````{tip} +The default number of CPUs assigned to each worker is `16 * GPUS_PER_NODE`. For users with a different +number of CPUs per node, you may control this when launching via: + +```sh +CPUS_PER_WORKER=64 \ +sbatch \ + ... \ + ray.sub +``` +```` Which will print the `SLURM_JOB_ID`: ```text diff --git a/ray.sub b/ray.sub index 28ed72742..84427a146 100644 --- a/ray.sub +++ b/ray.sub @@ -61,8 +61,10 @@ COMMON_SRUN_ARGS+=" -p $SLURM_JOB_PARTITION" COMMON_SRUN_ARGS+=" -A $SLURM_JOB_ACCOUNT" COMMON_SRUN_ARGS+=" --gres=gpu:8" -# Number of GPUs per node -gpus_per_node=8 +# Number of GPUs per worker node +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +# Number of CPUs per worker node +CPUS_PER_WORKER=${CPUS_PER_WORKER:-$((GPUS_PER_NODE * 16))} num_retries=3 @@ -148,7 +150,7 @@ EOF ) srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" & -NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES)) +NUM_ACTORS=$((GPUS_PER_NODE * SLURM_JOB_NUM_NODES)) # Start Ray worker nodes # We want 1 Ray worker node per physical node @@ -183,7 +185,7 @@ monitor-sidecar & cat < Date: Tue, 20 May 2025 23:06:48 -0700 Subject: [PATCH 2/6] cluster.md Signed-off-by: Terry Kong --- docs/cluster.md | 88 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 12 deletions(-) diff --git a/docs/cluster.md b/docs/cluster.md index 7e2abe935..32731604d 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -29,18 +29,6 @@ sbatch \ Some Slurm clusters may or may not need `--gres=gpu:8` to be added to the `sbatch` command. ``` -````{tip} -The default number of CPUs assigned to each worker is `16 * GPUS_PER_NODE`. For users with a different -number of CPUs per node, you may control this when launching via: - -```sh -CPUS_PER_WORKER=64 \ -sbatch \ - ... \ - ray.sub -``` -```` - Which will print the `SLURM_JOB_ID`: ```text Submitted batch job 1980204 @@ -103,6 +91,82 @@ There several choices for `UV_CACHE_DIR` when using `ray.sub`: don't want to persist the cache, you can use (2), which is just as performant as (1) if the `uv.lock` is covered by warmed cache. +### Slurm Environment Variables + +All Slurm environment variables described below can be added to the `sbatch` +invocation of `ray.sub`. For example, `GPUS_PER_NODE=8` can be specified as follows: + +```sh +GPUS_PER_NODE=8 \ +... \ +sbatch ray.sub \ + ... +``` +#### Common Environment Configuration +``````{list-table} +:header-rows: 1 + +* - Environment Variable + - Explanation +* - `CONTAINER` + - (Required) Specifies the container image to be used for the Ray cluster. + Use either a docker image from a registry or a squashfs (if using enroot/pyxis). +* - `MOUNTS` + - (Required) Defines paths to mount into the container. 
Examples: + ```md + * `MOUNTS="$PWD:$PWD"` (mount in current working directory (CWD)) + * `MOUNTS="$PWD:$PWD,/nfs:/nfs:ro"` (mount in CWD and another mount as read-only) + ``` +* - `COMMAND` + - Command to execute after the Ray cluster starts. If empty, cluster idles. + and enters interactive mode. See the [Slurm interactive instructions](#interactive-launching) +`````` +#### Advanced Environment Configuration +``````{list-table} +:header-rows: 1 + +* - Environment Variable + (and default) + - Explanation +* - `CPUS_PER_WORKER=128` + - CPUs each Ray worker node claims. Default is `16 * GPUS_PER_NODE`. +* - `GPUS_PER_NODE=8` + - GPUs each Ray worker node claims. Look up + number using `nvidia-smi` on worker nodes. +* - `BASE_LOG_DIR=$SLURM_SUBMIT_DIR` + - Base directory for storing Ray logs. Defaults to the Slurm submission directory ([SLURM_SUBMIT_DIR](https://slurm.schedmd.com/sbatch.html#OPT_SLURM_SUBMIT_DIR)). +* - `NODE_MANAGER_PORT=53001` + - Port for the Ray node manager on worker nodes. +* - `OBJECT_MANAGER_PORT=53003` + - Port for the Ray object manager on worker nodes. +* - `RUNTIME_ENV_AGENT_PORT=53005` + - Port for the Ray runtime environment agent on worker nodes. +* - `DASHBOARD_AGENT_GRPC_PORT=53007` + - gRPC port for the Ray dashboard agent on worker nodes. +* - `METRICS_EXPORT_PORT=53009` + - Port for exporting metrics from worker nodes. +* - `PORT=6379` + - Main port for the Ray head node. +* - `RAY_CLIENT_SERVER_PORT=10001` + - Port for the Ray client server on the head node. +* - `DASHBOARD_GRPC_PORT=52367` + - gRPC port for the Ray dashboard on the head node. +* - `DASHBOARD_PORT=8265` + - Port for the Ray dashboard UI on the head node. This is also the port + used by the Ray distributed debugger. +* - `DASHBOARD_AGENT_LISTEN_PORT=52365` + - Listening port for the dashboard agent on the head node. +* - `MIN_WORKER_PORT=54001` + - Minimum port in the range for Ray worker processes. +* - `MAX_WORKER_PORT=54257` + - Maximum port in the range for Ray worker processes. + +`````` + +:::{note} +For the most part, you will not need to change ports unless these +are already taken by some other service backgrounded on your cluster. +::: ## Kubernetes From 377956325bc1a5c181ae11110d48eb82d5b58c6c Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 20 May 2025 23:44:10 -0700 Subject: [PATCH 3/6] touchup Signed-off-by: Terry Kong --- docs/cluster.md | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/docs/cluster.md b/docs/cluster.md index 32731604d..f6b8589d9 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -115,12 +115,29 @@ sbatch ray.sub \ - (Required) Defines paths to mount into the container. Examples: ```md * `MOUNTS="$PWD:$PWD"` (mount in current working directory (CWD)) - * `MOUNTS="$PWD:$PWD,/nfs:/nfs:ro"` (mount in CWD and another mount as read-only) + * `MOUNTS="$PWD:$PWD,/nfs:/nfs:ro"` (mounts the current working directory and `/nfs`, with `/nfs` mounted as read-only) ``` * - `COMMAND` - - Command to execute after the Ray cluster starts. If empty, cluster idles. - and enters interactive mode. See the [Slurm interactive instructions](#interactive-launching) + - Command to execute after the Ray cluster starts. If empty, the cluster idles and enters interactive mode (see the [Slurm interactive instructions](#interactive-launching)). +* - `HF_HOME` + - Sets the cache directory for huggingface-hub assets (e.g., models/tokenizers). 
+* - `WANDB_API_KEY` + - Setting this allows you to use the wandb logger without having to run `wandb login`. +* - `HF_TOKEN` + - Setting the token used by huggingface-hub. Avoids having to run the `huggingface-cli login` +* - `HF_DATASETS_CACHE` + - Sets the cache dir for downloaded Huggingface datasets. `````` + +:::{tip} +When `HF_TOKEN`, `WANDB_API_KEY`, `HF_HOME`, and `HF_DATASETS_CACHE` are set in your shell environment using `export`, they are automatically passed to `ray.sub`. For instance, if you set: + +```sh +export HF_TOKEN=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX +``` +this token will be available to your NeMo RL run. Consider adding these exports to your shell configuration file, such as `~/.bashrc`. +::: + #### Advanced Environment Configuration ``````{list-table} :header-rows: 1 @@ -131,8 +148,7 @@ sbatch ray.sub \ * - `CPUS_PER_WORKER=128` - CPUs each Ray worker node claims. Default is `16 * GPUS_PER_NODE`. * - `GPUS_PER_NODE=8` - - GPUs each Ray worker node claims. Look up - number using `nvidia-smi` on worker nodes. + - Number of GPUs each Ray worker node claims. To determine this, run `nvidia-smi` on a worker node. * - `BASE_LOG_DIR=$SLURM_SUBMIT_DIR` - Base directory for storing Ray logs. Defaults to the Slurm submission directory ([SLURM_SUBMIT_DIR](https://slurm.schedmd.com/sbatch.html#OPT_SLURM_SUBMIT_DIR)). * - `NODE_MANAGER_PORT=53001` @@ -160,7 +176,6 @@ sbatch ray.sub \ - Minimum port in the range for Ray worker processes. * - `MAX_WORKER_PORT=54257` - Maximum port in the range for Ray worker processes. - `````` :::{note} From a9f81706ee6f0f42b0c811041e621ca5fcc4114e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 22 May 2025 13:32:29 -0700 Subject: [PATCH 4/6] Update docs/cluster.md Co-authored-by: jgerh <163925524+jgerh@users.noreply.github.com> Signed-off-by: Terry Kong --- docs/cluster.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cluster.md b/docs/cluster.md index f6b8589d9..6cdda932f 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -26,7 +26,7 @@ sbatch \ ``` ```{tip} -Some Slurm clusters may or may not need `--gres=gpu:8` to be added to the `sbatch` command. +Depending on your Slurm cluster configuration, you may or may not need to include the `--gres=gpu:8` option in the `sbatch` command. ``` Which will print the `SLURM_JOB_ID`: From 66c5d0bc90d165bc9898d9e204b33d8f06dccaa6 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 22 May 2025 13:38:28 -0700 Subject: [PATCH 5/6] edit Signed-off-by: Terry Kong --- docs/cluster.md | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/docs/cluster.md b/docs/cluster.md index 6cdda932f..3d03f5fe2 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -2,7 +2,7 @@ This guide explains how to run NeMo RL with Ray on Slurm or Kubernetes. -## Slurm (Batched and Interactive) +## Use Slurm for Batched and Interactive Jobs The following code provides instructions on how to use Slurm to run batched job submissions and run jobs interactively. @@ -29,11 +29,11 @@ sbatch \ Depending on your Slurm cluster configuration, you may or may not need to include the `--gres=gpu:8` option in the `sbatch` command. ``` -Which will print the `SLURM_JOB_ID`: +Upon successful submission, Slurm will print the `SLURM_JOB_ID`: ```text Submitted batch job 1980204 ``` -Make note of the the job submission number. Once the job begins, you can track its process in the driver logs which you can `tail`: +Make a note of the job submission number. 
Once the job begins, you can track its progress in the driver logs which you can `tail`:
 ```sh
 tail -f 1980204-logs/ray-driver.log
 ```
@@ -60,12 +60,11 @@ sbatch \
   --gres=gpu:8 \
   ray.sub
 ```
-Which will print the `SLURM_JOB_ID`:
+Upon successful submission, Slurm will print the `SLURM_JOB_ID`:
 ```text
 Submitted batch job 1980204
 ```
-Once the Ray cluster is up, a script should be created to attach to the Ray head node,
-which you can use to launch experiments.
+Once the Ray cluster is up, a script will be created to attach to the Ray head node. Run this script to launch experiments:
 ```sh
 bash 1980204-attach.sh
 ```
@@ -182,7 +181,3 @@ this token will be available to your NeMo RL run. Consider adding these exports
 For the most part, you will not need to change ports unless these
 are already taken by some other service backgrounded on your cluster.
 :::
-
-## Kubernetes
-
-TBD
\ No newline at end of file

From 1387e0bcab0628402b8f8aae2298a17d3d8de5bf Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 22 May 2025 13:42:32 -0700
Subject: [PATCH 6/6] restore kubernetes line

Signed-off-by: Terry Kong
---
 docs/cluster.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/cluster.md b/docs/cluster.md
index 3d03f5fe2..a0a3503a5 100644
--- a/docs/cluster.md
+++ b/docs/cluster.md
@@ -181,3 +181,7 @@ this token will be available to your NeMo RL run. Consider adding these exports
 For the most part, you will not need to change ports unless these
 are already taken by some other service backgrounded on your cluster.
 :::
+
+## Kubernetes
+
+TBD
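
The variables documented across this series compose into a single `sbatch` submission. The sketch below is illustrative only: the container image, mount paths, account, partition, node count, cache location, and `COMMAND` value are placeholder assumptions rather than values taken from the patches, so substitute values appropriate for your cluster. Because `ray.sub` derives `CPUS_PER_WORKER` as `16 * GPUS_PER_NODE` when it is unset, you only need to pass `CPUS_PER_WORKER` on nodes where that ratio does not hold.

```sh
# Illustrative only: all values below are placeholders.
# CONTAINER and MOUNTS are required by ray.sub; COMMAND is optional
# (omit it to leave the cluster idling for interactive use).
# GPUS_PER_NODE defaults to 8 and CPUS_PER_WORKER to 16 * GPUS_PER_NODE.
CONTAINER=/path/to/your/image.sqsh \
MOUNTS="$PWD:$PWD,/nfs:/nfs:ro" \
GPUS_PER_NODE=4 \
CPUS_PER_WORKER=64 \
HF_HOME=/nfs/cache/huggingface \
COMMAND="uv run ./my_experiment.py" \
sbatch \
    --nodes=2 \
    --account=your_account \
    --partition=your_partition \
    --gres=gpu:4 \
    ray.sub
```

Note that `--gres=gpu:4` mirrors the chosen `GPUS_PER_NODE`; as noted in the tip above, some Slurm clusters do not require the `--gres` option at all.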