From cf0a13929c94aa614be4516474bbc473aa1440e2 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Sat, 28 Sep 2024 14:13:45 +0545 Subject: [PATCH] Add AMD examples with vLLM, Axolotl and Trl (#1693) Add llama31-service-vllm-amd example [Docs] Added vLLM with AMD example Add AMD examples Update AMD-Axolotl example with official example Add build wheel tasks for AMD examples - Minor updates Add necessary comments in AMD Readme Co-authored-by: Bihan Rana --- examples/accelerators/amd/README.md | 204 ++++++++++++++++-- examples/accelerators/tpu/README.md | 1 - examples/deployment/vllm/amd/.dstack.yml | 15 ++ .../deployment/vllm/amd/build.vllm-rocm.yaml | 46 ++++ .../deployment/vllm/amd/service.dstack.yml | 49 +++++ examples/fine-tuning/axolotl/README.md | 28 ++- .../axolotl/amd/build.flash-attention.yaml | 37 ++++ .../axolotl/amd/build.xformers.yaml | 38 ++++ .../fine-tuning/axolotl/amd/train.dstack.yaml | 36 ++++ examples/fine-tuning/trl/README.md | 8 +- .../fine-tuning/trl/amd/train.dstack.yaml | 32 +++ examples/fine-tuning/trl/amd/train.py | 61 ++++++ 12 files changed, 531 insertions(+), 24 deletions(-) create mode 100644 examples/deployment/vllm/amd/.dstack.yml create mode 100644 examples/deployment/vllm/amd/build.vllm-rocm.yaml create mode 100644 examples/deployment/vllm/amd/service.dstack.yml create mode 100644 examples/fine-tuning/axolotl/amd/build.flash-attention.yaml create mode 100644 examples/fine-tuning/axolotl/amd/build.xformers.yaml create mode 100644 examples/fine-tuning/axolotl/amd/train.dstack.yaml create mode 100644 examples/fine-tuning/trl/amd/train.dstack.yaml create mode 100644 examples/fine-tuning/trl/amd/train.py diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index 1821ab68a..5eb24f41e 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -7,11 +7,10 @@ you can specify an AMD GPU under `resources`. Below are a few examples. ## Deployment -### Running as a service +You can use any serving framework, such as TGI and vLLM. Here's an example of a [service](https://dstack.ai/docs/services) that deploys +Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"} and [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html){:target="_blank"}. === "TGI" - Here's an example of a [service](https://dstack.ai/docs/services) that deploys - Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"}.
@@ -19,22 +18,29 @@ you can specify an AMD GPU under `resources`. Below are a few examples. type: service name: amd-service-tgi + # Using the official TGI's ROCm Docker image image: ghcr.io/huggingface/text-generation-inference:sha-a379d55-rocm + + # Required environment variables env: - HUGGING_FACE_HUB_TOKEN - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct - TRUST_REMOTE_CODE=true - ROCM_USE_FLASH_ATTN_V2_TRITON=true + # Commands of the task commands: - text-generation-launcher --port 8000 + # Service port port: 8000 resources: gpu: MI300X disk: 150GB + # Use spot or on-demand instances spot_policy: auto - + + # Register the model model: type: chat name: meta-llama/Meta-Llama-3.1-70B-Instruct @@ -43,26 +49,188 @@ you can specify an AMD GPU under `resources`. Below are a few examples.
+ +=== "vLLM" + +
+ + ```yaml + type: service + name: llama31-service-vllm-amd + + # Using RunPod's ROCm Docker image + image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 + + # Required environment variables + env: + - HUGGING_FACE_HUB_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct + - MAX_MODEL_LEN=126192 + # Commands of the task + commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip + - unzip rocm-6.1.0.zip + - cd hipBLAS-rocm-6.1.0 + - python rmake.py + - cd .. + - git clone https://github.com/vllm-project/vllm.git + - cd vllm + - pip install triton + - pip uninstall torch -y + - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 + - pip install /opt/rocm/share/amd_smi + - pip install --upgrade numba scipy huggingface-hub[cli] + - pip install "numpy<2" + - pip install -r requirements-rocm.txt + - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib + - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* + - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + - wget https://dstack-binaries.s3.amazonaws.com/vllm-0.6.0%2Brocm614-cp310-cp310-linux_x86_64.whl + - pip install vllm-0.6.0+rocm614-cp310-cp310-linux_x86_64.whl + - vllm serve $MODEL_ID --max-model-len $MAX_MODEL_LEN --port 8000 + # Service port + port: 8000 + + # Use spot or on-demand instances + spot_policy: auto + + resources: + gpu: MI300X + disk: 200GB + + # Register the model + model: + format: openai + type: chat + name: meta-llama/Meta-Llama-3.1-70B-Instruct + ``` +
+ + Note that the maximum size of vLLM’s `KV cache` is 126192, so `MAX_MODEL_LEN` must be set to 126192. Adding `/opt/conda/envs/py_3.10/bin` to `PATH` ensures we use the Python 3.10 environment, which the pre-built binaries are compiled for. + + > To speed up the `vLLM-ROCm` installation, we use a pre-built binary from S3. + > You can find the task that builds and uploads the binary in [`examples/deployment/vllm/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/vllm/amd){:target="_blank"}. + !!! info "Docker image" - Please note that if you want to use AMD, specifying `image` is currently required. This must be an image that includes + If you want to use AMD, specifying `image` is currently required. This must be an image that includes ROCm drivers. To request multiple GPUs, specify the quantity after the GPU name, separated by a colon, e.g., `MI300X:4`. -AMD accelerators can also be used with other frameworks like vLLM, Ollama, etc., and we'll be adding more examples soon. +## Fine-tuning + +=== "TRL" + + Below is an example of LoRA fine-tuning Llama 3.1 8B using [TRL :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.html){:target="_blank"} + and the [`mlabonne/guanaco-llama2-1k` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/mlabonne/guanaco-llama2-1k){:target="_blank"} + dataset. + +
+ + ```yaml + type: task + name: trl-amd-llama31-train + + # Using RunPod's ROCm Docker image + image: runpod/pytorch:2.1.2-py3.10-rocm6.1-ubuntu22.04 + + # Required environment variables + env: + - HUGGING_FACE_HUB_TOKEN + # Commands of the task + commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - git clone https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . + - make + - pip install . + - pip install trl + - pip install peft + - pip install transformers datasets huggingface-hub scipy + - cd .. + - python examples/fine-tuning/trl/amd/train.py + + # Use spot or on-demand instances + spot_policy: auto + + resources: + gpu: MI300X + disk: 150GB + ``` + +
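+    To launch this fine-tuning task, pass its configuration to `dstack apply` (a usage sketch; the path below matches this example, and the Hugging Face token is needed to download Llama 3.1):
+
+    ```shell
+    $ HUGGING_FACE_HUB_TOKEN=...
+    $ dstack apply -f examples/fine-tuning/trl/amd/train.dstack.yaml
+    ```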
+ +=== "Axolotl" + + Below is an example of fine-tuning Llama 3.1 8B using [Axolotl :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/artificial-intelligence/axolotl/README.html){:target="_blank"} + and the [`tatsu-lab/alpaca` :material-arrow-top-right-thin:{ .external }](https://huggingface.co/datasets/tatsu-lab/alpaca){:target="_blank"} + dataset. + +
+ + ```yaml + type: task + name: axolotl-amd-llama31-train + + # Using RunPod's ROCm Docker image + image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + # Required environment variables + env: + - HUGGING_FACE_HUB_TOKEN + # Commands of the task + commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - git clone https://github.com/OpenAccess-AI-Collective/axolotl + - cd axolotl + - git checkout d4f6c65 + - pip install -e . + - cd .. + - wget https://dstack-binaries.s3.amazonaws.com/flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - pip install flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - wget https://dstack-binaries.s3.amazonaws.com/xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - pip install xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - git clone --recurse https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . + - make + - pip install . + - cd .. + - accelerate launch -m axolotl.cli.train axolotl/examples/llama-3/fft-8b.yaml + + # Use spot or on-demand instances + spot_policy: auto + + resources: + gpu: MI300X + disk: 150GB + ``` +
+ Note that to support ROCm, we need to check out commit `d4f6c65`. You can find the installation instructions in [rocm-blogs :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/rocm-blogs/blob/release/blogs/artificial-intelligence/axolotl/src/Dockerfile.rocm){:target="_blank"}. + + > To speed up installation of `flash-attention` and `xformers`, we use pre-built binaries uploaded to S3. + > You can find the tasks that build and upload the binaries + > in [`examples/fine-tuning/axolotl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/amd){:target="_blank"}. -### Running a configuration +## Running a configuration Once the configuration is ready, run `dstack apply -f <configuration file>`, and `dstack` will automatically provision the cloud resources and run the configuration. -## Fleets +
-By default, `dstack apply` reuses `idle` instances from one of the existing [fleets](https://dstack.ai/docs/fleets). -If no `idle` instances meet the requirements, it creates a new fleet using one of the configured backends. +```shell +$ HUGGING_FACE_HUB_TOKEN=... +$ dstack apply -f examples/deployment/vllm/amd/service.dstack.yml +``` -Use [fleets](https://dstack.ai/docs/fleets.md) configurations to create fleets manually. This reduces startup time for dev environments, -tasks, and services, and is very convenient if you want to reuse fleets across runs. +
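+
+Once the service is up, you can query the model through the gateway's OpenAI-compatible endpoint. The snippet below is only a sketch: it assumes a gateway is configured, and the domain (`example.com`) and the access token are placeholders for your own setup.
+
+```shell
+$ curl https://gateway.example.com/v1/chat/completions \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: Bearer <dstack token>' \
+    -d '{
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+      "messages": [{"role": "user", "content": "What is Deep Learning?"}],
+      "max_tokens": 128
+    }'
+```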
## Dev environments @@ -73,9 +241,17 @@ allow you to run commands interactively. ## Source code The source-code of this example can be found in -[`examples/deployment/tgi/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/tgi/amd){:target="_blank"}. +[`examples/deployment/tgi/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/tgi/amd){:target="_blank"}, +[`examples/deployment/vllm/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/deployment/vllm/amd){:target="_blank"}, +[`examples/fine-tuning/axolotl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/amd){:target="_blank"} and +[`examples/fine-tuning/trl/amd` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/trl/amd){:target="_blank"}. ## What's next? -1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and +1. Browse [TGI :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/en/latest/how-to/rocm-for-ai/deploy-your-model.html#serving-using-hugging-face-tgi), + [vLLM :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm), + [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), + [TRL :material-arrow-top-right-thin:{ .external }](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html) and + [ROCm bitsandbytes :material-arrow-top-right-thin:{ .external }](https://github.com/ROCm/bitsandbytes). +2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). \ No newline at end of file diff --git a/examples/accelerators/tpu/README.md index 3439ae3e3..471481cae 100644 --- a/examples/accelerators/tpu/README.md +++ b/examples/accelerators/tpu/README.md @@ -10,7 +10,6 @@ Below are a few examples on using TPUs for deployment and fine-tuning. ## Deployment -### Running as a service You can use any serving framework, such as vLLM, TGI.
Here's an example of a [service](https://dstack.ai/docs/services) that deploys Llama 3.1 8B using [Optimum TPU :material-arrow-top-right-thin:{ .external }](https://github.com/huggingface/optimum-tpu){:target="_blank"} diff --git a/examples/deployment/vllm/amd/.dstack.yml b/examples/deployment/vllm/amd/.dstack.yml new file mode 100644 index 000000000..6aaed21a0 --- /dev/null +++ b/examples/deployment/vllm/amd/.dstack.yml @@ -0,0 +1,15 @@ +type: dev-environment +name: dev-vLLM-amd + +image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 + +env: + - HUGGING_FACE_HUB_TOKEN + +ide: vscode + +resources: + gpu: MI300X + disk: 150GB + +spot_policy: auto \ No newline at end of file diff --git a/examples/deployment/vllm/amd/build.vllm-rocm.yaml b/examples/deployment/vllm/amd/build.vllm-rocm.yaml new file mode 100644 index 000000000..00112df96 --- /dev/null +++ b/examples/deployment/vllm/amd/build.vllm-rocm.yaml @@ -0,0 +1,46 @@ +type: task +name: build-vllm-rocm + +image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 + +env: + - HUGGING_FACE_HUB_TOKEN + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_REGION + - BUCKET_NAME + +commands: + - apt-get update -y + - apt-get install awscli -y + - aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID + - aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + - aws configure set region $AWS_REGION + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip + - unzip rocm-6.1.0.zip + - cd hipBLAS-rocm-6.1.0 + - python rmake.py + - cd .. + - git clone https://github.com/vllm-project/vllm.git + - cd vllm + - pip install triton + - pip uninstall torch -y + - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 + - pip install /opt/rocm/share/amd_smi + - pip install --upgrade numba scipy huggingface-hub[cli] + - pip install "numpy<2" + - pip install -r requirements-rocm.txt + - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib + - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* + - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + - pip install wheel setuptools setuptools_scm ninja + - python setup.py bdist_wheel -d dist/ + - cd dist + - aws s3 cp "$(ls -1 | head -n 1)" s3://$BUCKET_NAME/ --acl public-read + +spot_policy: auto + +resources: + gpu: MI300X + disk: 150GB \ No newline at end of file diff --git a/examples/deployment/vllm/amd/service.dstack.yml b/examples/deployment/vllm/amd/service.dstack.yml new file mode 100644 index 000000000..e91858f28 --- /dev/null +++ b/examples/deployment/vllm/amd/service.dstack.yml @@ -0,0 +1,49 @@ +type: service +name: llama31-service-vllm-amd + +image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 + +env: + - HUGGING_FACE_HUB_TOKEN + - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct + - MAX_MODEL_LEN=126192 + +commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip + - unzip rocm-6.1.0.zip + - cd hipBLAS-rocm-6.1.0 + - python rmake.py + - cd ..
+ - git clone https://github.com/vllm-project/vllm.git + - cd vllm + - pip install triton + - pip uninstall torch -y + - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 + - pip install /opt/rocm/share/amd_smi + - pip install --upgrade numba scipy huggingface-hub[cli] + - pip install "numpy<2" + - pip install -r requirements-rocm.txt + - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib + - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* + - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + - wget https://dstack-binaries.s3.amazonaws.com/vllm-0.6.0%2Brocm614-cp310-cp310-linux_x86_64.whl + - pip install vllm-0.6.0+rocm614-cp310-cp310-linux_x86_64.whl + - vllm serve $MODEL_ID + --max-model-len $MAX_MODEL_LEN + --port 8000 + +# Expose the vllm server port +port: 8000 + +spot_policy: auto + +resources: + gpu: MI300X + disk: 200GB + +# (Optional) Enable the OpenAI-compatible endpoint +model: + format: openai + type: chat + name: meta-llama/Meta-Llama-3.1-70B-Instruct diff --git a/examples/fine-tuning/axolotl/README.md b/examples/fine-tuning/axolotl/README.md index 6bbf20cfb..2946594ca 100644 --- a/examples/fine-tuning/axolotl/README.md +++ b/examples/fine-tuning/axolotl/README.md @@ -28,11 +28,12 @@ You can modify it as needed. ## Single-node training The easiest way to run a training script with `dstack` is by creating a task configuration file. -This file can be found at [`examples/fine-tuning/axolotl/train.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/train.dstack.yml){:target="_blank"}. Below is its content: +This file can be found at [`examples/fine-tuning/axolotl/train.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/train.dstack.yml){:target="_blank"}. + +
```yaml type: task -# The name is optional, if not specified, generated randomly name: axolotl-train # Using the official Axolotl's Docker image @@ -46,6 +47,9 @@ env: commands: - accelerate launch -m axolotl.cli.train examples/fine-tuning/axolotl/config.yaml +# Use spot or on-demand instances +spot_policy: auto + resources: gpu: # 24GB or more vRAM @@ -54,16 +58,23 @@ resources: count: 2.. ``` +
+ + The task uses Axolotl's Docker image, where Axolotl is already pre-installed. -To run the task, use `dstack apply`: +!!! info "AMD" + The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](https://dstack.ai/examples/accelerators/amd#axolotl). + +## Running a configuration + +Once the configuration is ready, run `dstack apply -f <configuration file>`, and `dstack` will automatically provision the +cloud resources and run the configuration.
```shell $ HUGGING_FACE_HUB_TOKEN=... $ WANDB_API_KEY=... - $ dstack apply -f examples/fine-tuning/axolotl/train.dstack.yml ``` @@ -75,7 +86,7 @@ $ dstack apply -f examples/fine-tuning/axolotl/train.dstack.yml > If no `idle` instances meet the requirements, it creates a new fleet using one of the configured backends. The example folder includes a fleet configuration: -[ `examples/fine-tuning/axolotl/fleet.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/fleet.dstack.yml) {:target="_blank"} +[ `examples/fine-tuning/axolotl/fleet.dstack.yml` :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/blob/master/examples/fine-tuning/axolotl/fleet.dstack.yml){:target="_blank"} (a single node with a `24GB` GPU). You can update the fleet configuration to change the vRAM size, GPU model, number of GPUs per node, or number of nodes. @@ -105,7 +116,9 @@ If you'd like to play with the example using a dev environment, run
```shell -dstack apply -f examples/fine-tuning/axolotl/.dstack.yaml +$ HUGGING_FACE_HUB_TOKEN=... +$ WANDB_API_KEY=... +$ dstack apply -f examples/fine-tuning/axolotl/.dstack.yaml ```
@@ -119,4 +132,5 @@ The source-code of this example can be found in 1. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), [services](https://dstack.ai/docs/services), and [fleets](https://dstack.ai/docs/concepts/fleets). -2. Browse [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/OpenAccess-AI-Collective/axolotl){:target="_blank"}. +2. See [AMD](https://dstack.ai/examples/accelerators/amd#axolotl). +3. Browse [Axolotl :material-arrow-top-right-thin:{ .external }](https://github.com/OpenAccess-AI-Collective/axolotl){:target="_blank"}. diff --git a/examples/fine-tuning/axolotl/amd/build.flash-attention.yaml b/examples/fine-tuning/axolotl/amd/build.flash-attention.yaml new file mode 100644 index 000000000..1468bf8dc --- /dev/null +++ b/examples/fine-tuning/axolotl/amd/build.flash-attention.yaml @@ -0,0 +1,37 @@ +type: task +# The name is optional, if not specified, generated randomly +name: build-flash-attention + +image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN + - GPU_ARCHS="gfx90a;gfx942" + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_REGION + - BUCKET_NAME + +# Commands of the task +commands: + - apt-get update -y + - apt-get install awscli -y + - aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID + - aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + - aws configure set region $AWS_REGION + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - pip install ninja + - pip install wheel setuptools + - git clone https://github.com/ROCm/flash-attention.git + - cd flash-attention + - git checkout stride_fix + - python setup.py bdist_wheel -d dist/ + - cd dist + - aws s3 cp "$(ls -1 | head -n 1)" s3://$BUCKET_NAME/ --acl public-read + +resources: + gpu: MI300X + disk: 150GB \ No newline at end of file diff --git a/examples/fine-tuning/axolotl/amd/build.xformers.yaml b/examples/fine-tuning/axolotl/amd/build.xformers.yaml new file mode 100644 index 000000000..a3733ec50 --- /dev/null +++ b/examples/fine-tuning/axolotl/amd/build.xformers.yaml @@ -0,0 +1,38 @@ +type: task +# The name is optional, if not specified, generated randomly +name: build-xformers + +image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN + - GPU_ARCHS="gfx90a;gfx942" + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_REGION + - BUCKET_NAME + +# Commands of the task +commands: + - apt-get update -y + - apt-get install awscli -y + - aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID + - aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY + - aws configure set region $AWS_REGION + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - pip install ninja + - pip install wheel setuptools + - git clone https://github.com/ROCm/xformers + - cd xformers + - git checkout dfc196d + - git submodule update --init --recursive + - python setup.py bdist_wheel -d dist/ + - cd dist + - aws s3 cp "$(ls -1 | head -n 1)" s3://$BUCKET_NAME/ --acl public-read + +resources: + gpu: MI300X + disk: 150GB \ No newline at end of file diff --git
a/examples/fine-tuning/axolotl/amd/train.dstack.yaml b/examples/fine-tuning/axolotl/amd/train.dstack.yaml new file mode 100644 index 000000000..5de02b353 --- /dev/null +++ b/examples/fine-tuning/axolotl/amd/train.dstack.yaml @@ -0,0 +1,36 @@ +type: task +# The name is optional, if not specified, generated randomly +name: axolotl-amd-llama31-train + +image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 + +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN +# Commands of the task +commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - pip uninstall torch torchvision torchaudio -y + - python3 -m pip install --pre torch==2.3.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0/ + - git clone https://github.com/OpenAccess-AI-Collective/axolotl + - cd axolotl + - git checkout d4f6c65 + - pip install -e . + - cd .. + - wget https://dstack-binaries.s3.amazonaws.com/flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - pip install flash_attn-2.0.4-cp310-cp310-linux_x86_64.whl + - wget https://dstack-binaries.s3.amazonaws.com/xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - pip install xformers-0.0.26-cp310-cp310-linux_x86_64.whl + - git clone --recurse https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . + - make + - pip install . + - cd .. + - accelerate launch -m axolotl.cli.train axolotl/examples/llama-3/fft-8b.yaml + +resources: + gpu: MI300X + disk: 150GB \ No newline at end of file diff --git a/examples/fine-tuning/trl/README.md b/examples/fine-tuning/trl/README.md index de353586e..5cffec021 100644 --- a/examples/fine-tuning/trl/README.md +++ b/examples/fine-tuning/trl/README.md @@ -78,7 +78,10 @@ shm_size: 24GB
-Change the `resources` property to specify more GPUs. +Change the `resources` property to specify more GPUs. + +!!! info "AMD" + The example above uses NVIDIA accelerators. To use it with AMD, check out [AMD](https://dstack.ai/examples/accelerators/amd#trl). ### DeepSpeed @@ -183,5 +186,6 @@ and [`examples/fine-tuning/trl` :material-arrow-top-right-thin:{ .external }](ht 1. Browse the [Axolotl](https://dstack.ai/docs/examples/fine-tuning/axolotl) and [Alignment Handbook](https://dstack.ai/docs/examples/fine-tuning/alignment-handbook) examples -2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), +2. See [AMD](https://dstack.ai/examples/accelerators/amd#trl). +3. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), [services](https://dstack.ai/docs/services), and [fleets](https://dstack.ai/docs/fleets). \ No newline at end of file diff --git a/examples/fine-tuning/trl/amd/train.dstack.yaml b/examples/fine-tuning/trl/amd/train.dstack.yaml new file mode 100644 index 000000000..69b8744c3 --- /dev/null +++ b/examples/fine-tuning/trl/amd/train.dstack.yaml @@ -0,0 +1,32 @@ +type: task +# The name is optional, if not specified, generated randomly +name: trl-amd-llama31-train + +# If `image` is not specified, dstack uses its default image +image: runpod/pytorch:2.1.2-py3.10-rocm6.1-ubuntu22.04 + +# Required environment variables +env: + - HUGGING_FACE_HUB_TOKEN + +commands: + - export PATH=/opt/conda/envs/py_3.10/bin:$PATH + - git clone https://github.com/ROCm/bitsandbytes + - cd bitsandbytes + - git checkout rocm_enabled + - pip install -r requirements-dev.txt + - cmake -DBNB_ROCM_ARCH="gfx942" -DCOMPUTE_BACKEND=hip -S . # Use to target a specific GPU arch + - make + - pip install . + - pip install trl + - pip install peft + - pip install transformers datasets huggingface-hub scipy + - cd .. + - python examples/fine-tuning/trl/amd/train.py + +# Use either spot or on-demand instances +spot_policy: auto + +resources: + gpu: MI300X + disk: 150GB diff --git a/examples/fine-tuning/trl/amd/train.py b/examples/fine-tuning/trl/amd/train.py new file mode 100644 index 000000000..15118fc2a --- /dev/null +++ b/examples/fine-tuning/trl/amd/train.py @@ -0,0 +1,61 @@ +from datasets import load_dataset +from peft import LoraConfig, get_peft_model +from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments +from trl import SFTTrainer + +# Base model name (the tokenizer is loaded from the same checkpoint). +base_model_name = "meta-llama/Meta-Llama-3.1-8B" + +# Load the base model to GPU memory (on ROCm, PyTorch exposes AMD GPUs through the CUDA device API). +device = "cuda:0" +base_model = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True).to( + device +) + +# Load tokenizer. +tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" + +# Dataset for fine-tuning. +training_dataset_name = "mlabonne/guanaco-llama2-1k" +training_dataset = load_dataset(training_dataset_name, split="train") + + +# Training parameters for SFTTrainer.
+training_arguments = TrainingArguments( + output_dir="./results", + num_train_epochs=1, + per_device_train_batch_size=4, + gradient_accumulation_steps=1, + optim="paged_adamw_32bit", + save_steps=50, + logging_steps=50, + learning_rate=4e-5, + weight_decay=0.001, + fp16=False, + bf16=False, + max_grad_norm=0.3, + max_steps=-1, + warmup_ratio=0.03, + group_by_length=True, + lr_scheduler_type="constant", + report_to="tensorboard", +) + +peft_config = LoraConfig(lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM") +peft_model = get_peft_model(base_model, peft_config) +peft_model.print_trainable_parameters() + +# Initialize an SFT trainer. +sft_trainer = SFTTrainer( + model=base_model, + train_dataset=training_dataset, + peft_config=peft_config, + dataset_text_field="text", + tokenizer=tokenizer, + args=training_arguments, +) + +# Run the trainer. +sft_trainer.train()
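+
+# (Optional) Save the final fine-tuned weights and the tokenizer.
+# The output directory below is only an example path.
+sft_trainer.save_model("./results/final")
+tokenizer.save_pretrained("./results/final")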