From cf0a13929c94aa614be4516474bbc473aa1440e2 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Sat, 28 Sep 2024 14:13:45 +0545 Subject: [PATCH] Add AMD examples with vLLM, Axolotl and Trl (#1693) Add llama31-service-vllm-amd example [Docs] Added vLLM with AMD example Add AMD examples Update AMD-Axolotl example with official example Add build wheel tasks for AMD examples - Minor updates Add necessary comments in AMD Readme Co-authored-by: Bihan Rana --- examples/accelerators/amd/README.md | 204 ++++++++++++++++-- examples/accelerators/tpu/README.md | 1 - examples/deployment/vllm/amd/.dstack.yml | 15 ++ .../deployment/vllm/amd/build.vllm-rocm.yaml | 46 ++++ .../deployment/vllm/amd/service.dstack.yml | 49 +++++ examples/fine-tuning/axolotl/README.md | 28 ++- .../axolotl/amd/build.flash-attention.yaml | 37 ++++ .../axolotl/amd/build.xformers.yaml | 38 ++++ .../fine-tuning/axolotl/amd/train.dstack.yaml | 36 ++++ examples/fine-tuning/trl/README.md | 8 +- .../fine-tuning/trl/amd/train.dstack.yaml | 32 +++ examples/fine-tuning/trl/amd/train.py | 61 ++++++ 12 files changed, 531 insertions(+), 24 deletions(-) create mode 100644 examples/deployment/vllm/amd/.dstack.yml create mode 100644 examples/deployment/vllm/amd/build.vllm-rocm.yaml create mode 100644 examples/deployment/vllm/amd/service.dstack.yml create mode 100644 examples/fine-tuning/axolotl/amd/build.flash-attention.yaml create mode 100644 examples/fine-tuning/axolotl/amd/build.xformers.yaml create mode 100644 examples/fine-tuning/axolotl/amd/train.dstack.yaml create mode 100644 examples/fine-tuning/trl/amd/train.dstack.yaml create mode 100644 examples/fine-tuning/trl/amd/train.py diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index 1821ab68a..5eb24f41e 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -7,11 +7,10 @@ you can specify an AMD GPU under `resources`. Below are a few examples. ## Deployment -### Running as a service +You can use any serving framework, such as TGI and vLLM. Here's an example of a [service](https://dstack.ai/docs/services) that deploys +Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"} and [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html){:target="_blank"}. === "TGI" - Here's an example of a [service](https://dstack.ai/docs/services) that deploys - Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"}.
@@ -19,22 +18,29 @@ you can specify an AMD GPU under `resources`. Below are a few examples. type: service name: amd-service-tgi + # Using the official TGI's ROCm Docker image image: ghcr.io/huggingface/text-generation-inference:sha-a379d55-rocm + + # Required environment variables env: - HUGGING_FACE_HUB_TOKEN - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct - TRUST_REMOTE_CODE=true - ROCM_USE_FLASH_ATTN_V2_TRITON=true + # Commands of the task commands: - text-generation-launcher --port 8000 + # Service port port: 8000 resources: gpu: MI300X disk: 150GB + # Use spot or on-demand instances spot_policy: auto - + + # Register the model model: type: chat name: meta-llama/Meta-Llama-3.1-70B-Instruct @@ -43,26 +49,188 @@ you can specify an AMD GPU under `resources`. Below are a few examples.
+ +=== "vLLM" + +
+ + ```yaml + type: service + name: llama31-service-vllm-amd + + # Using RunPod's ROCm Docker image + image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 + + # Required environment variables + env: + - HUGGING_FACE_HUB_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct + - MAX_MODEL_LEN=126192 + # Commands of the task + commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip + - unzip rocm-6.1.0.zip + - cd hipBLAS-rocm-6.1.0 + - python rmake.py + - cd .. + - git clone https://github.com/vllm-project/vllm.git + - cd vllm + - pip install triton + - pip uninstall torch -y + - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 + - pip install /opt/rocm/share/amd_smi + - pip install --upgrade numba scipy huggingface-hub[cli] + - pip install "numpy<2" + - pip install -r requirements-rocm.txt + - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib + - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* + - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + - wget https://dstack-binaries.s3.amazonaws.com/vllm-0.6.0%2Brocm614-cp310-cp310-linux_x86_64.whl + - pip install vllm-0.6.0+rocm614-cp310-cp310-linux_x86_64.whl + - vllm serve $MODEL_ID --max-model-len $MAX_MODEL_LEN --port 8000 + # Service port + port: 8000 + + # Use spot or on-demand instances + spot_policy: auto + + resources: + gpu: MI300X + disk: 200GB + + # Register the model + model: + format: openai + type: chat + name: meta-llama/Meta-Llama-3.1-70B-Instruct + ``` +
+ + Note that the maximum size of vLLM’s `KV cache` is 126192, so `MAX_MODEL_LEN` must be set to 126192. Adding `/opt/conda/envs/py_3.10/bin` to `PATH` ensures we use the Python 3.10 environment, which the pre-built binaries are compiled for. + + > To speed up the `vLLM-ROCm` installation, we use a pre-built binary from S3. + > You can find the task that builds and uploads the binary in [`examples/deployment/vllm/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/vllm/amd){:target="_blank"}. + !!! info "Docker image" - Please note that if you want to use AMD, specifying `image` is currently required. This must be an image that includes + If you want to use AMD, specifying `image` is currently required. This must be an image that includes ROCm drivers. To request multiple GPUs, specify the quantity after the GPU name, separated by a colon, e.g., `MI300X:4`. -AMD accelerators can also be used with other frameworks like vLLM, Ollama, etc., and we'll be adding more examples soon. +## Fine-tuning + +=== "TRL" + + Below is an example of LoRA fine-tuning Llama 3.1 8B using [TRL :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.html){:target="_blank"} + and the [`mlabonne/guanaco-llama2-1k` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k){:target="_blank"} + dataset. + +
+ + ```yaml + type: task + name: trl-amd-llama31-train + + # Using RunPod's ROCm Docker image + image: runpod/pytorch:2.1.2-py3.10-rocm6.1-ubuntu22.04 + + # Required environment variables + env: + - HUGGING_FACE_HUB_TOKEN + # Commands of the task + commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - git clone https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . + - make + - pip install . + - pip install trl + - pip install peft + - pip install transformers datasets huggingface-hub scipy + - cd .. + - python examples/fine-tuning/trl/amd/train.py + + # Use spot or on-demand instances + spot_policy: auto + + resources: + gpu: MI300X + disk: 150GB + ``` + +
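+    To launch this fine-tuning task, pass its configuration to `dstack apply` (a usage sketch; the path below matches this example, and the Hugging Face token is needed to download Llama 3.1):
+
+    ```shell
+    $ HUGGING_FACE_HUB_TOKEN=...
+    $ dstack apply -f examples/fine-tuning/trl/amd/train.dstack.yaml
+    ```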
+ +=== "Axolotl" + + Below is an example of fine-tuning Llama 3.1 8B using [Axolotl :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/artificial-intelligence/axolotl/README.html){:target="_blank"} + and the [`tatsu-lab/alpaca` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/tatsu-lab/alpaca){:target="_blank"} + dataset. + +
+ + ```yaml + type: task + name: axolotl-amd-llama31-train + + # Using RunPod's ROCm Docker image + image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + # Required environment variables + env: + - HUGGING_FACE_HUB_TOKEN + # Commands of the task + commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - git clone https://github.com/OpenAccess-AI-Collective/axolotl + - cd axolotl + - git checkout d4f6c65 + - pip install -e . + - cd .. + - wget https://dstack-binaries.s3.amazonaws.com/flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - pip install flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - wget https://dstack-binaries.s3.amazonaws.com/xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - pip install xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - git clone --recurse https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . + - make + - pip install . + - cd .. + - accelerate launch -m axolotl.cli.train axolotl/examples/llama-3/fft-8b.yaml + + # Use spot or on-demand instances + spot_policy: auto + + resources: + gpu: MI300X + disk: 150GB + ``` +
+ Note that to support ROCm, we need to check out commit `d4f6c65`. You can find the installation instructions in [rocm-blogs :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/rocm-blogs/blob/release/blogs/artificial-intelligence/axolotl/src/Dockerfile.rocm){:target="_blank"}. + + > To speed up installation of `flash-attention` and `xformers`, we use pre-built binaries uploaded to S3. + > You can find the tasks that build and upload the binaries + > in [`examples/fine-tuning/axolotl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/amd){:target="_blank"}. -### Running a configuration +## Running a configuration Once the configuration is ready, run `dstack apply -f <configuration file>`, and `dstack` will automatically provision the cloud resources and run the configuration. -## Fleets +
-By default, `dstack apply` reuses `idle` instances from one of the existing [fleets](https://dstack.ai/docs/fleets). -If no `idle` instances meet the requirements, it creates a new fleet using one of the configured backends. +```shell +$ HUGGING_FACE_HUB_TOKEN=... +$ dstack apply -f examples/deployment/vllm/amd/service.dstack.yml +``` -Use [fleets](https://dstack.ai/docs/fleets.md) configurations to create fleets manually. This reduces startup time for dev environments, -tasks, and services, and is very convenient if you want to reuse fleets across runs. +
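+
+Once the service is up, you can query the model through the gateway's OpenAI-compatible endpoint. The snippet below is only a sketch: it assumes a gateway is configured, and the domain (`example.com`) and the access token are placeholders for your own setup.
+
+```shell
+$ curl https://gateway.example.com/v1/chat/completions \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: Bearer <dstack token>' \
+    -d '{
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+      "messages": [{"role": "user", "content": "What is Deep Learning?"}],
+      "max_tokens": 128
+    }'
+```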
## Dev environments @@ -73,9 +241,17 @@ allow you to run commands interactively. ## Source code The source-code of this example can be found in -[`examples/deployment/tgi/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/tgi/amd){:target="_blank"}. +[`examples/deployment/tgi/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/tgi/amd){:target="_blank"}, +[`examples/deployment/vllm/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/vllm/amd){:target="_blank"}, +[`examples/fine-tuning/axolotl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/amd){:target="_blank"} and +[`examples/fine-tuning/trl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/trl/amd){:target="_blank"}. ## What's next? -1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and +1. Browse [TGI :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/deploy-your-model.html#serving-using-hugging-face-tgi), + [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm), + [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), + [TRL :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html) and + [ROCm bitsandbytes :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/bitsandbytes). +2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). \ No newline at end of file diff --git a/examples/accelerators/tpu/README.md index 3439ae3e3..471481cae 100644 --- a/examples/accelerators/tpu/README.md +++ b/examples/accelerators/tpu/README.md @@ -10,7 +10,6 @@ Below are a few examples on using TPUs for deployment and fine-tuning. ## Deployment -### Running as a service You can use any serving framework, such as vLLM, TGI.
Here's an example of a [service](https://dstack.ai/docs/services) that deploys Llama 3.1 8B using [Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"} diff --git a/examples/deployment/vllm/amd/.dstack.yml b/examples/deployment/vllm/amd/.dstack.yml new file mode 100644 index 000000000..6aaed21a0 --- /dev/null +++ b/examples/deployment/vllm/amd/.dstack.yml @@ -0,0 +1,15 @@ +type: dev-environment +name: dev-vLLM-amd + +image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 + +env: + - HUGGING_FACE_HUB_TOKEN + +ide: vscode + +resources: + gpu: MI300X + disk: 150GB + +spot_policy: auto \ No newline at end of file diff --git a/examples/deployment/vllm/amd/build.vllm-rocm.yaml b/examples/deployment/vllm/amd/build.vllm-rocm.yaml new file mode 100644 index 000000000..00112df96 --- /dev/null +++ b/examples/deployment/vllm/amd/build.vllm-rocm.yaml @@ -0,0 +1,46 @@ +type: task +name: build-vllm-rocm + +image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 + +env: + - HUGGING_FACE_HUB_TOKEN + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_REGION + - BUCKET_NAME + +commands: + - apt-get update -y + - apt-get install awscli -y + - aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID + - aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + - aws configure set region $AWS_REGION + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip + - unzip rocm-6.1.0.zip + - cd hipBLAS-rocm-6.1.0 + - python rmake.py + - cd .. + - git clone https://github.com/vllm-project/vllm.git + - cd vllm + - pip install triton + - pip uninstall torch -y + - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 + - pip install /opt/rocm/share/amd_smi + - pip install --upgrade numba scipy huggingface-hub[cli] + - pip install "numpy<2" + - pip install -r requirements-rocm.txt + - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib + - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* + - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + - pip install wheel setuptools setuptools_scm ninja + - python setup.py bdist_wheel -d dist/ + - cd dist + - aws s3 cp "$(ls -1 | head -n 1)" s3://$BUCKET_NAME/ --acl public-read + +spot_policy: auto + +resources: + gpu: MI300X + disk: 150GB \ No newline at end of file diff --git a/examples/deployment/vllm/amd/service.dstack.yml b/examples/deployment/vllm/amd/service.dstack.yml new file mode 100644 index 000000000..e91858f28 --- /dev/null +++ b/examples/deployment/vllm/amd/service.dstack.yml @@ -0,0 +1,49 @@ +type: service +name: llama31-service-vllm-amd + +image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 + +env: + - HUGGING_FACE_HUB_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct + - MAX_MODEL_LEN=126192 + +commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip + - unzip rocm-6.1.0.zip + - cd hipBLAS-rocm-6.1.0 + - python rmake.py + - cd ..
+ - git clone https://github.com/vllm-project/vllm.git + - cd vllm + - pip install triton + - pip uninstall torch -y + - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 + - pip install /opt/rocm/share/amd_smi + - pip install --upgrade numba scipy huggingface-hub[cli] + - pip install "numpy<2" + - pip install -r requirements-rocm.txt + - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib + - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* + - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + - wget https://dstack-binaries.s3.amazonaws.com/vllm-0.6.0%2Brocm614-cp310-cp310-linux_x86_64.whl + - pip install vllm-0.6.0+rocm614-cp310-cp310-linux_x86_64.whl + - vllm serve $MODEL_ID + --max-model-len $MAX_MODEL_LEN + --port 8000 + +# Expose the vllm server port +port: 8000 + +spot_policy: auto + +resources: + gpu: MI300X + disk: 200GB + +# (Optional) Enable the OpenAI-compatible endpoint +model: + format: openai + type: chat + name: meta-llama/Meta-Llama-3.1-70B-Instruct diff --git a/examples/fine-tuning/axolotl/README.md b/examples/fine-tuning/axolotl/README.md index 6bbf20cfb..2946594ca 100644 --- a/examples/fine-tuning/axolotl/README.md +++ b/examples/fine-tuning/axolotl/README.md @@ -28,11 +28,12 @@ You can modify it as needed. ## Single-node training The easiest way to run a training script with `dstack` is by creating a task configuration file. -This file can be found at [`examples/fine-tuning/axolotl/train.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/train.dstack.yml){:target="_blank"}. Below is its content: +This file can be found at [`examples/fine-tuning/axolotl/train.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/train.dstack.yml){:target="_blank"}. + +
```yaml type: task -# The name is optional, if not specified, generated randomly name: axolotl-train # Using the official Axolotl's Docker image @@ -46,6 +47,9 @@ env: commands: - accelerate launch -m axolotl.cli.train examples/fine-tuning/axolotl/config.yaml +# Use spot or on-demand instances +spot_policy: auto + resources: gpu: # 24GB or more vRAM @@ -54,16 +58,23 @@ resources: count: 2.. ``` +
+ + The task uses Axolotl's Docker image, where Axolotl is already pre-installed. -To run the task, use `dstack apply`: +!!! info "AMD" + The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](https://dstack.ai/examples/accelerators/amd#axolotl). + +## Running a configuration + +Once the configuration is ready, run `dstack apply -f <configuration file>`, and `dstack` will automatically provision the +cloud resources and run the configuration.
```shell $ HUGGING_FACE_HUB_TOKEN=... $ WANDB_API_KEY=... - $ dstack apply -f examples/fine-tuning/axolotl/train.dstack.yml ``` @@ -75,7 +86,7 @@ $ dstack apply -f examples/fine-tuning/axolotl/train.dstack.yml > If no `idle` instances meet the requirements, it creates a new fleet using one of the configured backends. The example folder includes a fleet configuration: -[ `examples/fine-tuning/axolotl/fleet.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/fleet.dstack.yml) {:target="_blank"} +[ `examples/fine-tuning/axolotl/fleet.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/fleet.dstack.yml){:target="_blank"} (a single node with a `24GB` GPU). You can update the fleet configuration to change the vRAM size, GPU model, number of GPUs per node, or number of nodes. @@ -105,7 +116,9 @@ If you'd like to play with the example using a dev environment, run
```shell -dstack apply -f examples/fine-tuning/axolotl/.dstack.yaml +$ HUGGING_FACE_HUB_TOKEN=... +$ WANDB_API_KEY=... +$ dstack apply -f examples/fine-tuning/axolotl/.dstack.yaml ```
@@ -119,4 +132,5 @@ The source-code of this example can be found in 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), [services](https://dstack.ai/docs/services), and [fleets](https://dstack.ai/docs/concepts/fleets). -2. Browse [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/OpenAccess-AI-Collective/axolotl){:target="_blank"}. +2. See [AMD](https://dstack.ai/examples/accelerators/amd#axolotl). +3. Browse [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/OpenAccess-AI-Collective/axolotl){:target="_blank"}. diff --git a/examples/fine-tuning/axolotl/amd/build.flash-attention.yaml b/examples/fine-tuning/axolotl/amd/build.flash-attention.yaml new file mode 100644 index 000000000..1468bf8dc --- /dev/null +++ b/examples/fine-tuning/axolotl/amd/build.flash-attention.yaml @@ -0,0 +1,37 @@ +type: task +# The name is optional, if not specified, generated randomly +name: build-flash-attention + +image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN + - GPU_ARCHS="gfx90a;gfx942" + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_REGION + - BUCKET_NAME + +# Commands of the task +commands: + - apt-get update -y + - apt-get install awscli -y + - aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID + - aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + - aws configure set region $AWS_REGION + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - pip install ninja + - pip install wheel setuptools + - git clone https://github.com/ROCm/flash-attention.git + - cd flash-attention + - git checkout stride_fix + - python setup.py bdist_wheel -d dist/ + - cd dist + - aws s3 cp "$(ls -1 | head -n 1)" s3://$BUCKET_NAME/ --acl public-read + +resources: + gpu: MI300X + disk: 150GB \ No newline at end of file diff --git a/examples/fine-tuning/axolotl/amd/build.xformers.yaml b/examples/fine-tuning/axolotl/amd/build.xformers.yaml new file mode 100644 index 000000000..a3733ec50 --- /dev/null +++ b/examples/fine-tuning/axolotl/amd/build.xformers.yaml @@ -0,0 +1,38 @@ +type: task +# The name is optional, if not specified, generated randomly +name: build-xformers + +image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN + - GPU_ARCHS="gfx90a;gfx942" + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_REGION + - BUCKET_NAME + +# Commands of the task +commands: + - apt-get update -y + - apt-get install awscli -y + - aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID + - aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + - aws configure set region $AWS_REGION + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - pip install ninja + - pip install wheel setuptools + - git clone https://github.com/ROCm/xformers + - cd xformers + - git checkout dfc196d + - git submodule update --init --recursive + - python setup.py bdist_wheel -d dist/ + - cd dist + - aws s3 cp "$(ls -1 | head -n 1)" s3://$BUCKET_NAME/ --acl public-read + +resources: + gpu: MI300X + disk: 150GB \ No newline at end of file diff --git
a/examples/fine-tuning/axolotl/amd/train.dstack.yaml b/examples/fine-tuning/axolotl/amd/train.dstack.yaml new file mode 100644 index 000000000..5de02b353 --- /dev/null +++ b/examples/fine-tuning/axolotl/amd/train.dstack.yaml @@ -0,0 +1,36 @@ +type: task +# The name is optional, if not specified, generated randomly +name: axolotl-amd-llama31-train + +image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN +# Commands of the task +commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - git clone https://github.com/OpenAccess-AI-Collective/axolotl + - cd axolotl + - git checkout d4f6c65 + - pip install -e . + - cd .. + - wget https://dstack-binaries.s3.amazonaws.com/flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - pip install flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - wget https://dstack-binaries.s3.amazonaws.com/xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - pip install xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - git clone --recurse https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . + - make + - pip install . + - cd .. + - accelerate launch -m axolotl.cli.train axolotl/examples/llama-3/fft-8b.yaml + +resources: + gpu: MI300X + disk: 150GB \ No newline at end of file diff --git a/examples/fine-tuning/trl/README.md b/examples/fine-tuning/trl/README.md index de353586e..5cffec021 100644 --- a/examples/fine-tuning/trl/README.md +++ b/examples/fine-tuning/trl/README.md @@ -78,7 +78,10 @@ shm_size: 24GB
-Change the `resources` property to specify more GPUs. +Change the `resources` property to specify more GPUs. + +!!! info "AMD" + The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](https://dstack.ai/examples/accelerators/amd#trl). ### DeepSpeed @@ -183,5 +186,6 @@ and [`examples/fine-tuning/trl` :material-arrow-top-right-thin:{ .external }](ht 1. Browse the [Axolotl](https://dstack.ai/docs/examples/fine-tuning/axolotl) and [Alignment Handbook](https://dstack.ai/docs/examples/fine-tuning/alignment-handbook) examples -2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), +2. See [AMD](https://dstack.ai/examples/accelerators/amd#trl). +3. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), [services](https://dstack.ai/docs/services), and [fleets](https://dstack.ai/docs/fleets). \ No newline at end of file diff --git a/examples/fine-tuning/trl/amd/train.dstack.yaml b/examples/fine-tuning/trl/amd/train.dstack.yaml new file mode 100644 index 000000000..69b8744c3 --- /dev/null +++ b/examples/fine-tuning/trl/amd/train.dstack.yaml @@ -0,0 +1,32 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-amd-llama31-train + +# If `image` is not specified, dstack uses its default image +image: runpod/pytorch:2.1.2-py3.10-rocm6.1-ubuntu22.04 + +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN + +commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - git clone https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . # Use to target a specific GPU arch + - make + - pip install . + - pip install trl + - pip install peft + - pip install transformers datasets huggingface-hub scipy + - cd .. + - python examples/fine-tuning/trl/amd/train.py + +# Use either spot or on-demand instances +spot_policy: auto + +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/fine-tuning/trl/amd/train.py b/examples/fine-tuning/trl/amd/train.py new file mode 100644 index 000000000..15118fc2a --- /dev/null +++ b/examples/fine-tuning/trl/amd/train.py @@ -0,0 +1,61 @@ +from datasets import load_dataset +from peft import LoraConfig, get_peft_model +from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments +from trl import SFTTrainer + +# Base model name (the tokenizer is loaded from the same checkpoint). +base_model_name = "meta-llama/Meta-Llama-3.1-8B" + +# Load the base model to GPU memory (on ROCm, PyTorch exposes AMD GPUs through the CUDA device API). +device = "cuda:0" +base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True).to( + device +) + +# Load tokenizer. +tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" + +# Dataset for fine-tuning. +training_dataset_name = "mlabonne/guanaco-llama2-1k" +training_dataset = load_dataset(training_dataset_name, split="train") + + +# Training parameters for SFTTrainer.
+training_arguments = TrainingArguments( + output_dir="./results", + num_train_epochs=1, + per_device_train_batch_size=4, + gradient_accumulation_steps=1, + optim="paged_adamw_32bit", + save_steps=50, + logging_steps=50, + learning_rate=4e-5, + weight_decay=0.001, + fp16=False, + bf16=False, + max_grad_norm=0.3, + max_steps=-1, + warmup_ratio=0.03, + group_by_length=True, + lr_scheduler_type="constant", + report_to="tensorboard", +) + +peft_config = LoraConfig(lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM") +peft_model = get_peft_model(base_model, peft_config) +peft_model.print_trainable_parameters() + +# Initialize an SFT trainer. +sft_trainer = SFTTrainer( + model=base_model, + train_dataset=training_dataset, + peft_config=peft_config, + dataset_text_field="text", + tokenizer=tokenizer, + args=training_arguments, +) + +# Run the trainer. +sft_trainer.train()
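+
+# (Optional) Save the final fine-tuned weights and the tokenizer.
+# The output directory below is only an example path.
+sft_trainer.save_model("./results/final")
+tokenizer.save_pretrained("./results/final")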