Merge pull request #7 from ROCm/integration_no_fp8
Features integration:
Custom ops and kernels from private v0.2.7_dllehr
Triton attention kernel from jpvillam/v0.3.3_triton
Option to run multi-GPU using torchrun instead of Ray
gshtras authored Mar 22, 2024
2 parents 54be8a0 + 5e3ec52 commit 629f74b
Showing 20 changed files with 1,515 additions and 88 deletions.
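
For orientation, the torchrun option from the commit message surfaces as a new worker_use_ray argument that the benchmark diffs below pass through to the LLM constructor. A minimal sketch of how a caller might use it, assuming this fork's LLM accepts the keyword exactly as wired up in the benchmarks; the model name and GPU count are placeholders:

from vllm import LLM, SamplingParams

# Placeholder model and GPU count; worker_use_ray mirrors the new --worker-use-ray flag below.
# On ROCm this commit makes torchrun the multi-GPU default; set worker_use_ray=True to keep Ray.
llm = LLM(model="facebook/opt-125m",
          tensor_parallel_size=2,
          worker_use_ray=False)

print(llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16)))
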
1 change: 1 addition & 0 deletions .gitignore
@@ -181,6 +181,7 @@ _build/
# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h

# Benchmark dataset
*.json
36 changes: 18 additions & 18 deletions Dockerfile.rocm
@@ -3,8 +3,6 @@ ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

FROM $BASE_IMAGE

ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

RUN echo "Base image is $BASE_IMAGE"

# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
@@ -26,22 +24,12 @@ ARG BUILD_FA="1"
# whether to build cupy on rocm
ARG BUILD_CUPY="1"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
# whether to build triton on rocm
ARG BUILD_TRITON="1"

# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
sudo \
git \
bzip2 \
libx11-6 \
build-essential \
wget \
unzip \
nvidia-cuda-toolkit \
tmux \
sqlite3 libsqlite3-dev libfmt-dev \
&& rm -rf /var/lib/apt/lists/*

### Mount Point ###
@@ -95,6 +83,17 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \
&& cd ..; \
fi

# build triton
RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& pip uninstall -y triton \
&& git clone https://github.com/ROCm/triton.git \
&& cd triton/python \
&& pip3 install . \
&& cd ../..; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
@@ -104,12 +103,13 @@ RUN cd /app
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
&& if [ "$BUILD_FA" = "1" ]; then \
bash patch_xformers.rocm.sh; fi \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
bash patch_xformers.rocm.sh; fi \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch; fi \
&& python3 setup.py install \
&& cd ..

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]
RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3

CMD ["/bin/bash"]
28 changes: 16 additions & 12 deletions benchmarks/benchmark_latency.py
@@ -16,18 +16,17 @@ def main(args: argparse.Namespace):

# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(
model=args.model,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
)
llm = LLM(model=args.model,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
worker_use_ray=args.worker_use_ray)

sampling_params = SamplingParams(
n=args.n,
@@ -151,5 +150,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help="If specified, use nsight to profile ray workers",
)
parser.add_argument('--worker-use-ray',
action='store_true',
help='use Ray for distributed serving, will be '
'automatically set when using more than 1 GPU '
'unless on ROCm where the default is torchrun')
args = parser.parse_args()
main(args)
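
A rough illustration of the default described in the new flag's help text, written as a hypothetical helper that is not part of this commit: more than one GPU normally implies Ray, except on ROCm where torchrun is the default unless --worker-use-ray is passed.

import torch

def default_worker_use_ray(tensor_parallel_size: int) -> bool:
    # torch.version.hip is non-None on ROCm builds of PyTorch
    on_rocm = getattr(torch.version, "hip", None) is not None
    # Multi-GPU runs imply Ray, except on ROCm where torchrun is the default
    return tensor_parallel_size > 1 and not on_rocm
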
38 changes: 24 additions & 14 deletions benchmarks/benchmark_throughput.py
@@ -75,21 +75,25 @@ def run_vllm(
device: str,
enable_prefix_caching: bool,
gpu_memory_utilization: float = 0.9,
worker_use_ray: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
device=device,
enable_prefix_caching=enable_prefix_caching)
llm = LLM(
model=model,
tokenizer=tokenizer,
quantization=quantization,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
trust_remote_code=trust_remote_code,
dtype=dtype,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
device=device,
enable_prefix_caching=enable_prefix_caching,
worker_use_ray=worker_use_ray,
)

# Add the requests to the engine.
for prompt, _, output_len in requests:
@@ -213,7 +217,8 @@ def main(args: argparse.Namespace):
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype, args.device,
args.enable_prefix_caching, args.gpu_memory_utilization)
args.enable_prefix_caching, args.gpu_memory_utilization,
args.worker_use_ray)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -314,6 +319,11 @@ def main(args: argparse.Namespace):
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument('--worker-use-ray',
action='store_true',
help='use Ray for distributed serving, will be '
'automatically set when using more than 1 GPU '
'unless on ROCm where the default is torchrun')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
9 changes: 9 additions & 0 deletions csrc/attention/attention_kernels.cu
@@ -602,7 +602,11 @@ template<
typename CACHE_T,
int BLOCK_SIZE,
bool IS_FP8_E5M2_KV_CACHE,
#ifdef USE_ROCM
int NUM_THREADS = 1024>
#else
int NUM_THREADS = 128>
#endif
void paged_attention_v1_launcher(
torch::Tensor& out,
torch::Tensor& query,
@@ -779,8 +783,13 @@ template<
typename CACHE_T,
int BLOCK_SIZE,
bool IS_FP8_E5M2_KV_CACHE,
#ifdef USE_ROCM
int NUM_THREADS = 1024,
int PARTITION_SIZE = 1024>
#else
int NUM_THREADS = 128,
int PARTITION_SIZE = 512>
#endif
void paged_attention_v2_launcher(
torch::Tensor& out,
torch::Tensor& exp_sums,
74 changes: 74 additions & 0 deletions csrc/custom/custom.cpp
@@ -0,0 +1,74 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <pybind11/pybind11.h>
#include <cuda_runtime.h>

namespace py = pybind11;

// declare templates for front (cpp) and back (cuda) sides of function:
//template <typename T>

void LLGemm_Silu(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int rows_per_block);
void LLMM_Silu(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int rows_per_block) {
int M = in_a.size(0);
int K = in_a.size(1);
LLGemm_Silu(in_a.data_ptr(), in_b.data_ptr(),
out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),rows_per_block);
}

void LLGemm1(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream,const int rows_per_block);

//template <typename T>
void LLMM1(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int rows_per_block=4) {
int M = in_a.size(0);
int K = in_a.size(1);
//if (N != in_b.numel())
// throw std::invalid_argument("Size mismatch A.numel(): " + std::to_string(in_a.numel())
// + ", B.numel(): " + std::to_string(in_b.numel()));

//out_c.resize_({N});

// call the kernel function...
LLGemm1(in_a.data_ptr(), in_b.data_ptr(),
out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),rows_per_block);
}

void LLGemmZZ(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int solidx);

void LLZZ(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int solidx=0) {
int M = in_a.size(0);
int K = in_a.size(1);

LLGemmZZ(in_a.data_ptr(), in_b.data_ptr(),
out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),solidx);
}
// instantiate the CPP template for T=float:
//template void AddGPU<float>(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c);


void MMGPUKernel(float *in_a, float *in_b, float *out_c,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns,
cudaStream_t stream);


void MMCustomGPU(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c) {
auto matA_sizes { in_a.sizes() };
auto matB_sizes { in_b.sizes() };
auto matO_sizes { out_c.sizes() };
MMGPUKernel(in_a.data_ptr<float>(), in_b.data_ptr<float>(), out_c.data_ptr<float>(),
matA_sizes[0], matA_sizes[1],
matB_sizes[0], matB_sizes[1],
matO_sizes[0], matO_sizes[1],
at::cuda::getCurrentCUDAStream());
}

// declare the extension module with the AddGPU function:
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
m.doc() = "pybind11 example plugin";
m.def("LLMM1", &LLMM1);
m.def("LLMM_Silu", &LLMM_Silu);
m.def("LLZZ", &LLZZ);
//m.def("MMCustomGPU", &MMCustomGPU);
}
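
The new module exposes the custom low-latency GEMM kernels to Python. A rough usage sketch, assuming fp16 GEMV-style shapes (an M x K operand times a length-K vector into a preallocated length-M output) and a placeholder import name; neither the operand shapes nor the final name given to TORCH_EXTENSION_NAME is shown in this diff:

import torch
import vllm_custom_ops as custom  # placeholder for however TORCH_EXTENSION_NAME is built and exposed

M, K = 4096, 8192
a = torch.randn(M, K, dtype=torch.float16, device="cuda")  # M x K operand (assumed layout)
b = torch.randn(K, dtype=torch.float16, device="cuda")     # assumed length-K vector operand
c = torch.empty(M, dtype=torch.float16, device="cuda")     # preallocated output

custom.LLMM1(a, b, c, 4)       # rows_per_block tuning knob; the C++ wrapper's default is 4
custom.LLMM_Silu(a, b, c, 8)   # fused SiLU variant of the same GEMM
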