Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CI: run MaxText tests on AWS with NGC release candidate images #1237

Merged
merged 1 commit into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions .github/eks-workflow-files/maxtext-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Headless Service selecting the pods of the MaxText test Job. The test
# script builds the JAX coordinator address as <job name>-0.<service name>,
# so the Service name and the Job's `subdomain` have to agree.
apiVersion: v1
kind: Service
metadata:
  # PLACEHOLDER values are substituted by the CI workflow before `kubectl apply`
  name: PLACEHOLDER
spec:
  clusterIP: None  # clusterIP must be None to create a headless service
  selector:
    job-name: PLACEHOLDER  # must match Job name
---
# Indexed Job that runs the MaxText test across 2 pods with 8 GPUs each, plus
# a sidecar container per pod that uploads everything written to /opt/output.
# PLACEHOLDER values are substituted by the CI workflow before `kubectl apply`.
apiVersion: batch/v1
kind: Job
metadata:
  name: PLACEHOLDER
  labels:
    kueue.x-k8s.io/queue-name: p5-queue
spec:
  completions: 2  # number of nodes
  parallelism: 2  # number of nodes
  completionMode: Indexed
  backoffLimitPerIndex: 0  # max failures per index
  maxFailedIndexes: 0  # all indices must succeed
  template:
    spec:
      subdomain: PLACEHOLDER  # has to match Service name
      restartPolicy: Never
      imagePullSecrets:
        - name: PLACEHOLDER
      containers:
        - name: maxtext
          image: PLACEHOLDER
          ports:
            - containerPort: 3389  # same port as JAX_COORDINATOR_PORT below
          # command[3] and command[4] are addressed by index from the CI
          # workflow, so the order of the entries below must not change.
          command:
            - bash
            - -c
            # The logging logic: stream stdout/stderr from the 0th process inside this pod,
            # record all of the processes' stdout/stderr + the INFO-level NCCL logs to file
            - |
              # With `bash -c`, the arguments following the script arrive as $0 and $1
              export SERVICE_NAME=$0
              export JOB_NAME=$1
              # Quoted heredoc delimiter: nothing below is expanded until
              # each-process.sh itself runs.
              cat >each-process.sh <<'EOL'
              export JAX_COORDINATOR_IP=${JOB_NAME}-0.${SERVICE_NAME}
              export JAX_COORDINATOR_PORT=3389
              export NNODES=16 # actually #processes == #GPUs
              # Global rank = 8 processes per pod * pod index + rank within the pod.
              # LOCAL_RANK is presumably exported by parallel-launch -- confirm.
              export NODE_RANK=$((JOB_COMPLETION_INDEX*8 + LOCAL_RANK))
              export JAX_LOCAL_DEVICE_IDS=$LOCAL_RANK
              export NCCL_DEBUG=INFO
              export NCCL_DEBUG_FILE=/opt/output/nccl.$NODE_RANK.log
              # Only local rank 0 streams to the pod log; other ranks log to file only
              [[ $LOCAL_RANK == 0 ]] && console="/dev/stdout" || console="/dev/null"
              nsys-jax \
                --capture-range=cudaProfilerApi \
                --capture-range-end=stop \
                -o /opt/output/profile.$NODE_RANK.zip \
                -- \
                test-maxtext.sh \
                  -n 2 \
                  -b 2 \
                  --model-name=llama2-7b \
                  --attn-type=cudnn_flash_te \
                  --remat-policy=minimal_flash \
                  --steps=20 \
                  --fsdp=16 \
                  -a "scan_layers=false \
                      max_target_length=4096 \
                      use_iota_embed=true \
                      logits_dot_in_fp32=false \
                      profiler=nsys \
                      skip_first_n_steps_for_profiler=3 \
                      profiler_steps=8" \
                |& tee /opt/output/output.$NODE_RANK.log >"${console}"
              # $? would be the exit status of tee, which hides test failures;
              # take the status of the first command in the pipeline instead.
              code=${PIPESTATUS[0]}
              # Should run even on failure
              cat /opt/output/nccl.$NODE_RANK.log >"${console}"
              exit $code
              EOL
              # TODO: upgrade parallel-launch to return a failure code as soon as any
              #       of its children do (it already does this eventually, but it could
              #       be slow)
              parallel-launch LOCAL_RANK 8 bash each-process.sh
              code=$?
              # Should run even on failure
              # The upload container waits for this marker file before copying
              touch /opt/output/.done
              exit $code
            - PLACEHOLDER
            - PLACEHOLDER
          resources:
            limits:
              nvidia.com/gpu: 8
              vpc.amazonaws.com/efa: 32
          volumeMounts:
            - mountPath: /dev/shm
              name: shmem
            - mountPath: /opt/output
              name: output
        - name: upload
          image: amazon/aws-cli
          # command[3] is addressed by index from the CI workflow
          command:
            - bash
            - -c
            - |
              JOB_NAME="$0"
              # Block until the maxtext container signals that it has finished
              while [[ ! -f /opt/output/.done ]]; do
                sleep 1
              done
              # Remove the marker so that it is not uploaded with the results
              rm /opt/output/.done
              aws s3 cp \
                --recursive \
                /opt/output \
                "s3://jax-toolbox-eks-output/${JOB_NAME}/"
            - PLACEHOLDER
          volumeMounts:
            - mountPath: /opt/output
              name: output
      volumes:
        # Shared between the two containers of a pod
        - name: output
          emptyDir: {}
        - name: shmem
          emptyDir:
            medium: Memory
            sizeLimit: 16Gi
107 changes: 107 additions & 0 deletions .github/workflows/_test_maxtext_k8s.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Reusable workflow: it is only triggered via workflow_call, with the MaxText
# container image to test passed in by the calling workflow.
name: ~test MaxText functionality on Kubernetes

on:
  workflow_call:
    inputs:
      MAXTEXT_IMAGE:
        type: string
        description: MaxText container to test
        required: true

permissions:
  contents: read  # to fetch code

jobs:
  # Submits the Job/Service pair from .github/eks-workflow-files/maxtext-job.yaml,
  # streams its logs, turns the Job status into this job's exit code and
  # finally deletes the Kubernetes resources again.
  maxtext:
    runs-on: eks
    env:
      CONTAINER_IMAGE: "${{ inputs.MAXTEXT_IMAGE }}"
      # Unique per run and per re-run attempt
      JOB_NAME: "maxtext-${{ github.run_id }}-${{ github.run_attempt }}"
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Login to NVIDIA Container Registry
        uses: docker/login-action@v3
        with:
          registry: nvcr.io
          username: $oauthtoken
          password: ${{ secrets.NVCR_TOKEN }}
      - name: Store GitHub Container Registry token as Kubernetes secret
        run: |
          # Make this available to later steps
          TOKEN_NAME="${JOB_NAME}-token"
          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
          # The docker config written by the login steps above becomes the
          # image pull secret referenced by the Job
          kubectl create secret generic \
            ${TOKEN_NAME} \
            --from-file=.dockerconfigjson=$HOME/.docker/config.json \
            --type=kubernetes.io/dockerconfigjson
      - name: Configure Kubernetes job
        run: |
          export SERVICE_NAME="${JOB_NAME}-svc"
          # Document 0 of the manifest is the Service, document 1 is the Job.
          # The command[N] entries are the positional arguments passed to the
          # `bash -c` scripts of the two containers.
          yq -i ea 'select(di == 0).metadata.name = strenv(SERVICE_NAME)
            | select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
            | select(di == 1).metadata.name = strenv(JOB_NAME)
            | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
            | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
            | select(di == 1).spec.template.spec.containers[0].image = strenv(CONTAINER_IMAGE)
            | select(di == 1).spec.template.spec.containers[0].command[3] = strenv(SERVICE_NAME)
            | select(di == 1).spec.template.spec.containers[0].command[4] = strenv(JOB_NAME)
            | select(di == 1).spec.template.spec.containers[1].command[3] = strenv(JOB_NAME)' \
            .github/eks-workflow-files/maxtext-job.yaml
          # Show the substituted values in the workflow log
          git diff .github/eks-workflow-files/maxtext-job.yaml
      - name: Submit Kubernetes job
        run: kubectl apply -f .github/eks-workflow-files/maxtext-job.yaml
      - name: Wait for Kubernetes job to start
        run: |
          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
          # resources are available, but that is where there can be a long wait if the
          # cluster is busy executing other jobs.
          kubectl wait --for=create job/${JOB_NAME}
          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${JOB_NAME} --timeout=3600s
      - name: Stream Kubernetes job output
        run: |
          # Streaming logs will fail if the container/pod is still pending
          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
            sleep 1
          done
          kubectl logs --all-containers=true --all-pods=true --follow job/${JOB_NAME}
      - name: Retrieve Kubernetes job status
        shell: bash -exo pipefail {0}
        run: |
          # Poll until failed + succeeded pods add up to the 2 completions
          # requested in the Job manifest, then exit with the failure count.
          while readarray -d : -t status < <(kubectl get job/${JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
            # Either field is empty while its count is still zero
            failure=${status[0]:-0}
            success=${status[1]:-0}
            total=$((failure+success))
            # -lt/-eq compare numerically; inside [[ ]] the < and == operators
            # compare strings, which misorders multi-digit counts
            if [[ ${total} -lt 2 ]]; then
              sleep 1
            elif [[ ${total} -eq 2 ]]; then
              break
            else
              # FIXME
              exit 255
            fi
          done
          exit ${failure}
      # Provide more debug output in case of failure; note that some kinds of launch
      # failure do not produce any log output.
      - name: Debug failed Kubernetes job
        if: failure()
        run: |
          # Provide better debug in case of launch failures that will not produce log output
          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} -o name)
          if [[ -n "${pods}" ]]; then
            kubectl describe ${pods}
          fi
      # Clean up in case of errors as well as success
      - name: Delete Kubernetes job
        if: always()
        run: kubectl delete -f .github/eks-workflow-files/maxtext-job.yaml
      - name: Delete GitHub Container Registry token
        if: always()
        run: kubectl delete secret ${TOKEN_NAME}
13 changes: 10 additions & 3 deletions .github/workflows/ngc-release-testing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
docker run -i --shm-size=1g --gpus all \
${{ inputs.JAX_IMAGE }} \
bash <<"EOF" |& tee test-backend-independent.log
test-jax.sh -b backend-independent
test-jax.sh -b backend-independent
EOF
docker run -i --shm-size=1g --gpus all \
${{ inputs.JAX_IMAGE }} \
Expand Down Expand Up @@ -80,8 +80,15 @@ jobs:
MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
secrets: inherit

test-maxtext-eks:
if: inputs.MAXTEXT_IMAGE != ''
uses: ./.github/workflows/_test_maxtext_k8s.yaml
with:
MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
secrets: inherit

finalize:
needs: [ test-nccl, test-jax, test-rosetta-pax, test-maxtext ]
needs: [ test-nccl, test-jax, test-rosetta-pax, test-maxtext, test-maxtext-eks ]
if: "!cancelled()"
uses: ./.github/workflows/_finalize.yaml
secrets: inherit
secrets: inherit
Loading