Merge pull request #7 from ROCm/integration_no_fp8
Features integration:
Custom ops and kernels from private v0.2.7_dllehr
Triton attention kernel from jpvillam/v0.3.3_triton
Option to run multi-GPU using torchrun instead of Ray
gshtras authored Mar 22, 2024
2 parents 54be8a0 + 5e3ec52 commit 629f74b
Showing 20 changed files with 1,515 additions and 88 deletions.
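
For orientation, the torchrun option from the commit message surfaces as a new worker_use_ray argument that the benchmark diffs below pass through to the LLM constructor. A minimal sketch of how a caller might use it, assuming this fork's LLM accepts the keyword exactly as wired up in the benchmarks; the model name and GPU count are placeholders:

from vllm import LLM, SamplingParams

# Placeholder model and GPU count; worker_use_ray mirrors the new --worker-use-ray flag below.
# On ROCm this commit makes torchrun the multi-GPU default; set worker_use_ray=True to keep Ray.
llm = LLM(model="facebook/opt-125m",
          tensor_parallel_size=2,
          worker_use_ray=False)

print(llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16)))
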
1 change: 1 addition & 0 deletions .gitignore
@@ -181,6 +181,7 @@ _build/
# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h

# Benchmark dataset
*.json
36 changes: 18 additions & 18 deletions Dockerfile.rocm
@@ -3,8 +3,6 @@ ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

FROM $BASE_IMAGE

ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

RUN echo "Base image is $BASE_IMAGE"

# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
@@ -26,22 +24,12 @@ ARG BUILD_FA="1"
# whether to build cupy on rocm
ARG BUILD_CUPY="1"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
# whether to build triton on rocm
ARG BUILD_TRITON="1"

# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
sudo \
git \
bzip2 \
libx11-6 \
build-essential \
wget \
unzip \
nvidia-cuda-toolkit \
tmux \
sqlite3 libsqlite3-dev libfmt-dev \
&& rm -rf /var/lib/apt/lists/*

### Mount Point ###
@@ -95,6 +83,17 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \
&& cd ..; \
fi

# build triton
RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& pip uninstall -y triton \
&& git clone https://github.com/ROCm/triton.git \
&& cd triton/python \
&& pip3 install . \
&& cd ../..; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
@@ -104,12 +103,13 @@ RUN cd /app
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
&& if [ "$BUILD_FA" = "1" ]; then \
bash patch_xformers.rocm.sh; fi \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
bash patch_xformers.rocm.sh; fi \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch; fi \
&& python3 setup.py install \
&& cd ..

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]
RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3

CMD ["/bin/bash"]
28 changes: 16 additions & 12 deletions benchmarks/benchmark_latency.py
@@ -16,18 +16,17 @@ def main(args: argparse.Namespace):

# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(
model=args.model,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
)
llm = LLM(model=args.model,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
worker_use_ray=args.worker_use_ray)

sampling_params = SamplingParams(
n=args.n,
@@ -151,5 +150,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help="If specified, use nsight to profile ray workers",
)
parser.add_argument('--worker-use-ray',
action='store_true',
help='use Ray for distributed serving, will be '
'automatically set when using more than 1 GPU '
'unless on ROCm where the default is torchrun')
args = parser.parse_args()
main(args)
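
A rough illustration of the default described in the new flag's help text, written as a hypothetical helper that is not part of this commit: more than one GPU normally implies Ray, except on ROCm where torchrun is the default unless --worker-use-ray is passed.

import torch

def default_worker_use_ray(tensor_parallel_size: int) -> bool:
    # torch.version.hip is non-None on ROCm builds of PyTorch
    on_rocm = getattr(torch.version, "hip", None) is not None
    # Multi-GPU runs imply Ray, except on ROCm where torchrun is the default
    return tensor_parallel_size > 1 and not on_rocm
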
38 changes: 24 additions & 14 deletions benchmarks/benchmark_throughput.py
@@ -75,21 +75,25 @@ def run_vllm(
device: str,
enable_prefix_caching: bool,
gpu_memory_utilization: float = 0.9,
worker_use_ray: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
device=device,
enable_prefix_caching=enable_prefix_caching)
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
device=device,
enable_prefix_caching=enable_prefix_caching,
worker_use_ray=worker_use_ray,
)

# Add the requests to the engine.
for prompt, _, output_len in requests:
@@ -213,7 +217,8 @@ def main(args: argparse.Namespace):
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype, args.device,
args.enable_prefix_caching, args.gpu_memory_utilization)
args.enable_prefix_caching, args.gpu_memory_utilization,
args.worker_use_ray)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -314,6 +319,11 @@ def main(args: argparse.Namespace):
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument('--worker-use-ray',
action='store_true',
help='use Ray for distributed serving, will be '
'automatically set when using more than 1 GPU '
'unless on ROCm where the default is torchrun')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
9 changes: 9 additions & 0 deletions csrc/attention/attention_kernels.cu
@@ -602,7 +602,11 @@ template<
typename CACHE_T,
int BLOCK_SIZE,
bool IS_FP8_E5M2_KV_CACHE,
#ifdef USE_ROCM
int NUM_THREADS = 1024>
#else
int NUM_THREADS = 128>
#endif
void paged_attention_v1_launcher(
torch::Tensor& out,
torch::Tensor& query,
@@ -779,8 +783,13 @@ template<
typename CACHE_T,
int BLOCK_SIZE,
bool IS_FP8_E5M2_KV_CACHE,
#ifdef USE_ROCM
int NUM_THREADS = 1024,
int PARTITION_SIZE = 1024>
#else
int NUM_THREADS = 128,
int PARTITION_SIZE = 512>
#endif
void paged_attention_v2_launcher(
torch::Tensor& out,
torch::Tensor& exp_sums,
74 changes: 74 additions & 0 deletions csrc/custom/custom.cpp
@@ -0,0 +1,74 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <pybind11/pybind11.h>
#include <cuda_runtime.h>

namespace py = pybind11;

// declare templates for front (cpp) and back (cuda) sides of function:
//template <typename T>

void LLGemm_Silu(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int rows_per_block);
void LLMM_Silu(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int rows_per_block) {
int M = in_a.size(0);
int K = in_a.size(1);
LLGemm_Silu(in_a.data_ptr(), in_b.data_ptr(),
out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),rows_per_block);
}

void LLGemm1(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream,const int rows_per_block);

//template <typename T>
void LLMM1(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int rows_per_block=4) {
int M = in_a.size(0);
int K = in_a.size(1);
//if (N != in_b.numel())
// throw std::invalid_argument("Size mismatch A.numel(): " + std::to_string(in_a.numel())
// + ", B.numel(): " + std::to_string(in_b.numel()));

//out_c.resize_({N});

// call the kernel function...
LLGemm1(in_a.data_ptr(), in_b.data_ptr(),
out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),rows_per_block);
}

void LLGemmZZ(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int solidx);

void LLZZ(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int solidx=0) {
int M = in_a.size(0);
int K = in_a.size(1);

LLGemmZZ(in_a.data_ptr(), in_b.data_ptr(),
out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),solidx);
}
// instantiate the CPP template for T=float:
//template void AddGPU<float>(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c);


void MMGPUKernel(float *in_a, float *in_b, float *out_c,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns,
cudaStream_t stream);


void MMCustomGPU(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c) {
auto matA_sizes { in_a.sizes() };
auto matB_sizes { in_b.sizes() };
auto matO_sizes { out_c.sizes() };
MMGPUKernel(in_a.data_ptr<float>(), in_b.data_ptr<float>(), out_c.data_ptr<float>(),
matA_sizes[0], matA_sizes[1],
matB_sizes[0], matB_sizes[1],
matO_sizes[0], matO_sizes[1],
at::cuda::getCurrentCUDAStream());
}

// declare the extension module with the AddGPU function:
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
m.doc() = "pybind11 example plugin";
m.def("LLMM1", &LLMM1);
m.def("LLMM_Silu", &LLMM_Silu);
m.def("LLZZ", &LLZZ);
//m.def("MMCustomGPU", &MMCustomGPU);
}
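
The new module exposes the custom low-latency GEMM kernels to Python. A rough usage sketch, assuming fp16 GEMV-style shapes (an M x K operand times a length-K vector into a preallocated length-M output) and a placeholder import name; neither the operand shapes nor the final name given to TORCH_EXTENSION_NAME is shown in this diff:

import torch
import vllm_custom_ops as custom  # placeholder for however TORCH_EXTENSION_NAME is built and exposed

M, K = 4096, 8192
a = torch.randn(M, K, dtype=torch.float16, device="cuda")  # M x K operand (assumed layout)
b = torch.randn(K, dtype=torch.float16, device="cuda")     # assumed length-K vector operand
c = torch.empty(M, dtype=torch.float16, device="cuda")     # preallocated output

custom.LLMM1(a, b, c, 4)       # rows_per_block tuning knob; the C++ wrapper's default is 4
custom.LLMM_Silu(a, b, c, 8)   # fused SiLU variant of the same GEMM
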