Commit

Removed obsolete parts. Made ROCm attention defaults define-guarded.
gshtras committed Mar 19, 2024
1 parent 1510843 commit 3d82eea
Showing 7 changed files with 92 additions and 618 deletions.
62 changes: 29 additions & 33 deletions Dockerfile.rocm
@@ -1,9 +1,14 @@
FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
ENV WORKSPACE_DIR=/workspace
RUN mkdir -p $WORKSPACE_DIR
WORKDIR $WORKSPACE_DIR
# Limit arch's so composable kernel doesn't take days to finish
ENV PYTORCH_ROCM_ARCH=gfx90a;gfx942
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"


ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"

@@ -22,27 +27,9 @@ ARG BUILD_CUPY="1"
# whether to build triton on rocm
ARG BUILD_TRITON="1"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
sudo \
git \
bzip2 \
libx11-6 \
build-essential \
wget \
unzip \
nvidia-cuda-toolkit \
tmux \
sqlite3 libsqlite3-dev libfmt-dev \
&& rm -rf /var/lib/apt/lists/*

### Mount Point ###
@@ -60,6 +52,8 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& export GPU_ARCHS=${FA_GFX_ARCHS} \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
@@ -94,27 +88,29 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& pip uninstall -y triton \
&& git clone https://github.com/ROCmSoftwarePlatform/triton.git
&& git clone https://github.com/ROCm/triton.git \
&& cd triton/python \
&& pip3 install -e . \
&& cd ../..; \
&& cd ../.. \
&& rm -r triton; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps

RUN cd vllm \
&& pip install -r requirements-rocm.txt \
&& pip install typing-extensions==4.8.0 \
&& bash patch_xformers.rocm.sh \
&& cd gradlib && python setup.py develop && cd ../ \
&& python setup.py build && python setup.py develop; exit 0

RUN pip install pyarrow Ray pandas==2.0 numpy==1.20.3
RUN cd /app \
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
&& if [ "$BUILD_FA" = "1" ]; then \
bash patch_xformers.rocm.sh; fi \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch; fi \
&& python3 setup.py install \
&& cd ..

RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \
&& cd rocmProfileData && make; make install
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]

WORKDIR /workspace/vllm
CMD ["/bin/bash"]
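
The reworked Dockerfile.rocm now takes its base image and optional components as build arguments (BASE_IMAGE, FA_GFX_ARCHS, BUILD_FA, BUILD_CUPY, BUILD_TRITON). A minimal build sketch under those assumptions follows; the tag name "vllm-rocm" and the choice of the ROCm 5.7 base image are illustrative, not mandated by this commit.

    # Sketch: build from the repository root, overriding the build args defined above.
    # "vllm-rocm" is an arbitrary tag; either documented BASE_IMAGE value can be used.
    docker build -f Dockerfile.rocm \
        --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
        --build-arg FA_GFX_ARCHS="gfx90a;gfx942" \
        --build-arg BUILD_FA="1" \
        --build-arg BUILD_TRITON="1" \
        -t vllm-rocm .
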
168 changes: 54 additions & 114 deletions benchmarks/benchmark_latency.py
@@ -3,23 +3,17 @@
import time
from pathlib import Path
from typing import Optional
import pandas as pd

import numpy as np
import torch
from tqdm import tqdm

from vllm import LLM, SamplingParams
from torch.profiler import profile, record_function, ProfilerActivity

def list_of_ints(arg):
return list(map(int, arg.split(',')))

def main(args: argparse.Namespace):
print(args)

print(f'>>>Loading LLM')
if args.report:
results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency'])
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(
@@ -36,101 +30,60 @@ def main(args: argparse.Namespace):
ray_workers_use_nsight=args.ray_workers_use_nsight,
)

for batch_size in args.batch_size:
for output_len in args.output_len:
for input_len in args.input_len:
print(f'>>>RUNNING {args.model} Batch_size:{batch_size} Input_len:{input_len} Output_len:{output_len}')
sampling_params = SamplingParams(
n=args.n,
temperature=0.0 if args.use_beam_search else 1.0,
top_p=1.0,
use_beam_search=args.use_beam_search,
ignore_eos=True,
max_tokens=output_len,
)
print(sampling_params)
dummy_prompt_token_ids = [[0] * input_len] * batch_size
dummy_prompts = []
dummy_prompts.append('DeepSpeed is a machine learning library that deep learning practitioners should use for what purpose')

def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
elif args.accuracy:
start_time = time.perf_counter()
rsp = llm.generate(
#prompt_token_ids=dummy_prompt_token_ids,
prompts=dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
latency = end_time - start_time
print('>>Rsp', rsp[0].outputs)
return latency
else:
start_time = time.perf_counter()
rsp = llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
latency = end_time - start_time
print('>>Rsp', rsp[0].outputs)
return latency

print("Warming up...")
run_to_completion(profile_dir=None)

if (args.warmup_only):

print(">>> Warmup only specified, exiting")
continue
sampling_params = SamplingParams(
n=args.n,
temperature=0.0 if args.use_beam_search else 1.0,
top_p=1.0,
use_beam_search=args.use_beam_search,
ignore_eos=True,
max_tokens=args.output_len,
)
print(sampling_params)
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()

if args.profile:
profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = Path(
"."
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=args.profile_result_dir)
return
if args.rpd:
from rpdTracerControl import rpdTracerControl
rpdTracerControl.setFilename(name = "/workspace/trace.rpd", append=True)
profile_rpd = rpdTracerControl()
profile_rpd.start()
print(f"RPD Profiling'...")
with torch.autograd.profiler.emit_nvtx():
run_to_completion(profile_dir=None)
profile_rpd.stop()
return
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
latency = end_time - start_time
return latency

# Benchmark.
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
print("Warming up...")
run_to_completion(profile_dir=None)

if torch.distributed.get_rank() == 0:
#results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency'])
latency=np.mean(latencies)
print(f'Avg latency: {latency} seconds')
if args.report:
entry = {'model':[args.model], 'tp':[args.tensor_parallel_size],'batch':[batch_size], 'input':[input_len], 'output':[output_len], 'latency':[latency]}
results_df = pd.concat([results_df, pd.DataFrame(entry)], ignore_index=True)
if torch.distributed.get_rank() == 0 and args.report:
print(results_df)
results_df.to_csv(args.report_file, index=False)
if args.profile:
profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = Path(
"."
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir)
return

# Benchmark.
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
print(f'Avg latency: {np.mean(latencies)} seconds')


if __name__ == '__main__':
@@ -144,9 +97,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
choices=['awq', 'gptq', 'squeezellm', None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=list_of_ints, default=32)
parser.add_argument('--output-len', type=list_of_ints, default=128)
parser.add_argument('--batch-size', type=list_of_ints, default=8)
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8)
parser.add_argument('--n',
type=int,
default=1,
@@ -159,7 +112,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')

parser.add_argument(
'--dtype',
type=str,
@@ -172,9 +124,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument('--enforce-eager',
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument('--accuracy',
action='store_true',
help='Run an Actual query through vllm')
parser.add_argument(
"--kv-cache-dtype",
type=str,
@@ -216,14 +165,5 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help="If specified, use nsight to profile ray workers",
)
parser.add_argument(
'--rpd',
action='store_true',
help='profile the generation process of a single batch using the rpd tracer')
parser.add_argument('--warmup-only', action='store_true',
help='only run warmup, useful for tuning')
parser.add_argument('--report', action='store_true',
help='turn on dataframe reporting')
parser.add_argument('--report-file', type=str, default=None)
args = parser.parse_args()
main(args)
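
With the report, accuracy, RPD, and warmup-only paths removed, --input-len, --output-len, and --batch-size take plain integers again. A possible invocation is sketched below; the model name is illustrative, and --model and --num-iters are pre-existing flags inferred from args.model and args.num_iters in the script.

    # Sketch: single-configuration latency run after this change (model name is illustrative).
    python3 benchmarks/benchmark_latency.py \
        --model facebook/opt-125m \
        --input-len 32 \
        --output-len 128 \
        --batch-size 8 \
        --num-iters 3
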
9 changes: 9 additions & 0 deletions csrc/attention/attention_kernels.cu
@@ -629,7 +629,11 @@ template<
typename CACHE_T,
int BLOCK_SIZE,
bool IS_FP8_KV_CACHE,
#ifdef USE_ROCM
int NUM_THREADS = 1024>
#else
int NUM_THREADS = 128>
#endif
void paged_attention_v1_launcher(
torch::Tensor& out,
torch::Tensor& query,
@@ -810,8 +814,13 @@ template<
typename CACHE_T,
int BLOCK_SIZE,
bool IS_FP8_KV_CACHE,
#ifdef USE_ROCM
int NUM_THREADS = 128,
int PARTITION_SIZE = 512>
#else
int NUM_THREADS = 1024,
int PARTITION_SIZE = 1024>
#endif
void paged_attention_v2_launcher(
torch::Tensor& out,
torch::Tensor& exp_sums,
7 changes: 0 additions & 7 deletions csrc/cache.h
@@ -24,13 +24,6 @@ void reshape_and_cache(
const std::string& kv_cache_dtype,
const float kv_scale);

void gather_cached_kv(
torch::Tensor& key,
torch::Tensor& value,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& slot_mapping);

// Just for unittest
void convert_fp8(
torch::Tensor& src_cache,