
Commit c3b2697

Switch to vLLM 0.8.0 and torch 2.6.0+cu126

- Only targeting Hopper for now
- Add vLLM/CUDA versions to image tags for easier tracking

1 parent b6b0137

2 files changed (+29, -33 lines)
.github/workflows/build-vllm.yaml (+21, -18)
@@ -2,18 +2,16 @@ name: Build vLLM Docker Image Matrix
 
 env:
   PARALLELISM: 1
-  MAX_JOBS: 32 # Limit for building multiple archs
-  NVCC_THREADS: 2
-  TORCH_CUDA_ARCH_LIST: 9.0a;10.0a
-  VLLM_FA_CMAKE_GPU_ARCHES: 90a-real;100a-real
-  TRITON_REF: release/3.3.x
-  TRITON_BUILD_VERSION: 3.3.0
-  XFORMERS_REF: v0.0.29.post3
-  XFORMERS_BUILD_VERSION: 0.0.29.post3
+  TORCH_CUDA_ARCH_LIST: 9.0a
+  VLLM_FA_CMAKE_GPU_ARCHES: 90a-real
+  TRITON_REF: release/3.2.x
+  TRITON_BUILD_VERSION: 3.2.0
+  XFORMERS_REF: v0.0.29.post2
+  XFORMERS_BUILD_VERSION: 0.0.29.post2
   FLASHINFER_REF: v0.2.2.post1
   FLASHINFER_BUILD_VERSION: 0.2.2.post1
-  VLLM_REF: d47807ba
-  VLLM_BUILD_VERSION: 0.7.4
+  VLLM_REF: v0.8.0
+  VLLM_BUILD_VERSION: 0.8.0
 
 on:
   push:
@@ -25,13 +23,16 @@ jobs:
     strategy:
       matrix:
         arch: [amd64, arm64]
-        cuda_version: [12.8.0]
+        cuda_version: [12.6.3]
         image_distro: [ubuntu24.04]
     runs-on: [self-hosted, "${{ matrix.arch }}"]
     steps:
-      - name: Generate image name
+      - name: Prepare some env vars
         run: |
           echo "GHCR_IMAGE=ghcr.io/${GITHUB_REPOSITORY@L}" >> ${GITHUB_ENV}
+          CUDA_VERSION=${{ matrix.cuda_version }}
+          CUDA_SHORT=${CUDA_VERSION%.*}
+          echo "CUDA_TAG=${CUDA_SHORT//./}" >> ${GITHUB_ENV}
 
       - name: Login to GHCR
         uses: docker/login-action@v3
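The new "Prepare some env vars" step relies on Bash parameter expansion to turn the matrix CUDA version into a short tag. A minimal sketch of the same expansions run by hand, using the 12.6.3 value from the matrix above:

    CUDA_VERSION=12.6.3
    CUDA_SHORT=${CUDA_VERSION%.*}   # strip the shortest ".*" suffix -> 12.6
    CUDA_TAG=${CUDA_SHORT//./}      # delete the remaining dots      -> 126
    echo "${CUDA_TAG}"              # prints 126, later used as the cu126 tag suffix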
@@ -65,21 +66,23 @@ jobs:
           FLASHINFER_BUILD_VERSION=${{ env.FLASHINFER_BUILD_VERSION }}
           VLLM_REF=${{ env.VLLM_REF }}
           VLLM_BUILD_VERSION=${{ env.VLLM_BUILD_VERSION }}
-        cache-from: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-${{ matrix.arch }}
-        cache-to: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-${{ matrix.arch }},mode=max
+        cache-from: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.arch }}
+        cache-to: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.arch }},mode=max
         context: .
         file: Dockerfile
         platforms: linux/${{ matrix.arch }}
         push: true
-        tags: ${{ env.GHCR_IMAGE }}:${{ matrix.arch }}
+        tags: ${{ env.GHCR_IMAGE }}:${{ env.VLLM_BUILD_VERSION }}-cu${{ env.CUDA_TAG }}-${{ matrix.arch }}
 
+  # Fix this to use matrix and handle imagetools create --append
   ghcr:
     needs: build
     runs-on: self-hosted
     steps:
-      - name: Generate image name
+      - name: Prepare some env vars
        run: |
          echo "GHCR_IMAGE=ghcr.io/${GITHUB_REPOSITORY@L}" >> ${GITHUB_ENV}
+          echo "CUDA_TAG=126" >> ${GITHUB_ENV}
 
      - name: Login to GHCR
        uses: docker/login-action@v3
@@ -90,5 +93,5 @@ jobs:
 
       - name: Append images
         run: |
-          ARCHS=(amd64 arm64)
-          docker buildx imagetools create -t ${GHCR_IMAGE}:latest ${ARCHS[@]/#/${GHCR_IMAGE}:}
+          TAGS=(${VLLM_BUILD_VERSION}-cu${CUDA_TAG}-{amd,arm}64)
+          docker buildx imagetools create -t ${GHCR_IMAGE}:latest ${TAGS[@]/#/${GHCR_IMAGE}:}
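The "Append images" step now derives the per-arch tags from the version variables instead of a bare arch list. A rough shell sketch of how those expansions resolve; ghcr.io/owner/repo is a placeholder for the lowercased ${GITHUB_REPOSITORY} value set earlier:

    GHCR_IMAGE=ghcr.io/owner/repo        # placeholder; the workflow uses ${GITHUB_REPOSITORY@L}
    VLLM_BUILD_VERSION=0.8.0
    CUDA_TAG=126
    TAGS=(${VLLM_BUILD_VERSION}-cu${CUDA_TAG}-{amd,arm}64)
    # brace expansion yields two elements: 0.8.0-cu126-amd64 and 0.8.0-cu126-arm64
    echo "${TAGS[@]/#/${GHCR_IMAGE}:}"
    # prints: ghcr.io/owner/repo:0.8.0-cu126-amd64 ghcr.io/owner/repo:0.8.0-cu126-arm64

The imagetools create call then stitches those two per-arch images into the multi-arch :latest manifest.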

Dockerfile (+8, -15)
@@ -1,16 +1,10 @@
-ARG CUDA_VERSION=12.8.0
+ARG CUDA_VERSION=12.6.3
 ARG IMAGE_DISTRO=ubuntu24.04
 ARG PYTHON_VERSION=3.12
 
 # ---------- Builder Base ----------
 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base
 
-# Set build scaling
-ARG MAX_JOBS=32
-ENV MAX_JOBS=${MAX_JOBS}
-ARG NVCC_THREADS=2
-ENV NVCC_THREADS=${NVCC_THREADS}
-
 # Set arch lists for all targets
 # 'a' suffix is not forward compatible but enables all optimizations
 ARG TORCH_CUDA_ARCH_LIST="9.0a"
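CUDA_VERSION, IMAGE_DISTRO, PYTHON_VERSION and TORCH_CUDA_ARCH_LIST remain ordinary build args, so the image can still be built outside CI. A hypothetical local invocation mirroring the new defaults; the tag name simply follows the workflow's version-cu-arch scheme and is not produced by the Dockerfile itself:

    docker build \
      --build-arg CUDA_VERSION=12.6.3 \
      --build-arg TORCH_CUDA_ARCH_LIST="9.0a" \
      -t vllm:0.8.0-cu126-amd64 \
      -f Dockerfile .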
@@ -52,7 +46,7 @@ ENV CUDA_HOME=/usr/local/cuda
 ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
 
 # Install pytorch nightly
-RUN uv pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu128
+RUN uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu126
 
 FROM base AS build-base
 RUN mkdir /wheels
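Per the commit title, the pinned cu126 index should resolve to a torch 2.6.0+cu126 wheel. A quick sanity check to run inside the built image; the exact version string is an expectation from the commit message, not something the Dockerfile asserts:

    python -c "import torch; print(torch.__version__, torch.version.cuda)"
    # expected output along the lines of: 2.6.0+cu126 12.6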
@@ -62,8 +56,8 @@ RUN mkdir /wheels
 RUN uv pip install -U build cmake ninja pybind11 setuptools wheel
 
 FROM build-base AS build-triton
-ARG TRITON_REF=release/3.3.x
-ARG TRITON_BUILD_VERSION=3.3.0
+ARG TRITON_REF=release/3.2.x
+ARG TRITON_BUILD_VERSION=3.2.0
 ENV BUILD_VERSION=${TRITON_BUILD_VERSION:-${TRITON_REF#v}}
 RUN git clone https://github.com/triton-lang/triton.git
 RUN cd triton && \
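Each builder stage keeps the same BUILD_VERSION fallback: use the explicit build version if set, otherwise derive it from the ref by stripping a leading "v". A small shell sketch of both paths; the SOME_* variables in the second case are hypothetical, only to show the fallback:

    # explicit version set (the Triton case above): the :- fallback is skipped
    TRITON_REF=release/3.2.x
    TRITON_BUILD_VERSION=3.2.0
    echo "${TRITON_BUILD_VERSION:-${TRITON_REF#v}}"   # 3.2.0

    # hypothetical empty override: fall back to the ref with its leading "v" stripped
    SOME_REF=v0.8.0
    SOME_BUILD_VERSION=""
    echo "${SOME_BUILD_VERSION:-${SOME_REF#v}}"       # 0.8.0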
@@ -73,8 +67,8 @@ RUN cd triton && \
   uv build python --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-xformers
-ARG XFORMERS_REF=v0.0.29.post3
-ARG XFORMERS_BUILD_VERSION=0.0.29.post3
+ARG XFORMERS_REF=v0.0.29.post2
+ARG XFORMERS_BUILD_VERSION=0.0.29.post2
 ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
 RUN git clone https://github.com/facebookresearch/xformers.git
 RUN cd xformers && \
@@ -96,15 +90,14 @@ RUN cd flashinfer && \
   uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-vllm
-ARG VLLM_REF=53be4a86
-ARG VLLM_BUILD_VERSION=0.7.4
+ARG VLLM_REF=v0.8.0
+ARG VLLM_BUILD_VERSION=0.8.0
 ENV BUILD_VERSION=${VLLM_BUILD_VERSION:-${VLLM_REF#v}}
 RUN git clone https://github.com/vllm-project/vllm.git
 RUN cd vllm && \
   git checkout ${VLLM_REF} && \
   git submodule sync && \
   git submodule update --init --recursive -j 8 && \
-  python use_existing_torch.py && \
   uv pip install -r requirements/build.txt && \
   uv build --wheel --no-build-isolation -o /wheels
 