Commit

Improve profiling setup and documentation, sync benchmarks with main (#218)

* Automatically set rpd env var with profile flag

* Add readme

* Fix lint errors

---------

Co-authored-by: AdrianAbeyta <[email protected]>
AdrianAbeyta authored Oct 3, 2024
1 parent 47d6392 commit 4cb422f
Showing 4 changed files with 97 additions and 25 deletions.
59 changes: 59 additions & 0 deletions benchmarks/profiling/README.md
@@ -0,0 +1,59 @@
# vLLM Benchmark Profiling

This directory provides a way to profile the vLLM throughput and latency benchmarks using the ROCm profiling utilities.

## 1. Dependencies

Before using the profiling feature, you need to install the required dependencies:

### Install ROCm Profile Data

```bash
git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git
cd rocmProfileData && make && sudo make install
```

### Install hipMarker

```bash
cd rocmProfileData/hipMarker && python3 setup.py install
```

## 2. Profiling Benchmarks

Profiling lets you monitor the performance of the vLLM benchmarks under ROCm. The key flags are:

- `--profile-rpd`: Profiles the generation process of a single batch, as sketched below.
- `--profile-dir PROFILE_DIR`: Specifies the path where the profiler output is saved; the result can later be visualized with [ui.perfetto.dev](https://ui.perfetto.dev/) or [chrome://tracing](chrome://tracing/).
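
Under the hood, `--profile-rpd` drives the Python tracer that rocmProfileData installs. For orientation, here is a minimal standalone sketch of that tracer; the method names come from the rocmProfileData sources and the scripts in this directory, so treat the exact API as an assumption rather than a guarantee:

```python
from rpdTracerControl import rpdTracerControl

# The output file must be set before the first tracer instance is created.
rpdTracerControl.setFilename(name="trace.rpd", append=False)

rpd = rpdTracerControl()
rpd.start()
# ... run the GPU work you want traced, e.g. one generation batch ...
rpd.stop()
rpd.top_totals()  # print a summary of the hottest kernels/ops
```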

### Profiling Using Default Directory

By default, profiling results are saved to `./vllm_benchmark_latency_result` or `./vllm_benchmark_throughput_result`, depending on whether you run the latency or the throughput benchmark. To run a benchmark and profile it using the default directory, execute:

```bash
python3 benchmark_throughput.py --input-len {len} --output-len {len} --model {model} --profile-rpd
```

### Profiling With a Custom Directory

You can specify a custom directory for saving profiler outputs by using the `--profile-dir` flag:

```bash
python3 benchmark_throughput.py --input-len {len} --output-len {len} --model {model} --profile-rpd --profile-dir {/path/to/custom/dir}
```

After profiling is complete, an `.rpd` file containing the trace data will be saved to the specified directory.
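
The `.rpd` file is a SQLite database, so you can sanity-check a trace before converting it. A minimal sketch using only the Python standard library (the `rocpd_*` table names in the comment are an assumption about the rocmProfileData schema):

```python
import sqlite3

con = sqlite3.connect("trace.rpd")
# List the tables the tracer wrote; rocmProfileData typically uses
# names like rocpd_api and rocpd_op (assumed, schema-version dependent).
for (table,) in con.execute(
        "SELECT name FROM sqlite_master WHERE type='table'"):
    print(table)
con.close()
```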

## 3. Convert Trace Data to JSON Format

To view the trace data, convert it into a format compatible with Chrome tracing or Perfetto.

You can use the `rpd2tracing.py` script in rocmProfileData to convert the `.rpd` file into a JSON file:

```bash
python3 rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
```

Once the trace is converted, open the `.json` file in [Chrome](chrome://tracing/) or [Perfetto](https://ui.perfetto.dev/) for visualization.
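
If you want a quick textual summary without opening a UI, you can read the converted file directly. This sketch assumes `rpd2tracing.py` emits the standard Chrome trace event format, i.e. either a bare list of events or an object with a `traceEvents` key:

```python
import json
from collections import Counter

with open("trace.json") as f:
    trace = json.load(f)

events = trace["traceEvents"] if isinstance(trace, dict) else trace
# Count the ten most frequent event names in the trace.
counts = Counter(e.get("name", "<unnamed>") for e in events)
for event_name, n in counts.most_common(10):
    print(f"{n:8d}  {event_name}")
```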


36 changes: 21 additions & 15 deletions benchmarks/profiling/benchmark_latency.py
@@ -13,7 +13,7 @@
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
-from vllm.inputs import PromptType
+from vllm.inputs import PromptInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser
 
@@ -30,8 +30,7 @@ def rpd_profiler_context():
         rpd.top_totals()
 
     @contextmanager
-    def torch_profiler_context(profile_dir: Optional[str] = None,
-                               trace_file_name=None):
+    def torch_profiler_context(profile_dir: Optional[str] = None):
         p = torch.profiler.profile(
             activities=[
                 torch.profiler.ProfilerActivity.CPU,
@@ -48,15 +47,27 @@ def torch_profiler_context(profile_dir: Optional[str] = None):
         print(p.key_averages().table(sort_by="self_cuda_time_total",
                                      row_limit=-1))
 
-    def get_profiling_context(profile_dir: Optional[str] = None,
-                              trace_file_name=None):
+    def get_profiling_context(profile_dir: Optional[str] = None):
         if args.profile_torch:
-            return torch_profiler_context(profile_dir, trace_file_name)
+            return torch_profiler_context(profile_dir)
         elif args.profile_rpd:
             return rpd_profiler_context()
         else:
             return nullcontext()
 
+    if args.profile_torch or args.profile_rpd:
+        profile_dir = Path(args.profile_dir
+                           or "./vllm_benchmark_latency_result")
+        profile_dir.mkdir(parents=True, exist_ok=True)
+        name = os.path.basename(os.path.normpath(args.model))
+        model_trace_name = (
+            f"{name}_in_{args.input_len}_out_{args.output_len}_"
+            f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}")
+        print(f"Profiling (results will be saved to '{profile_dir}')...")
+        if args.profile_rpd:
+            profile_dir /= f"{model_trace_name}.rpd"
+            os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_dir)
+
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
     llm = LLM(
@@ -100,19 +111,19 @@ def get_profiling_context(profile_dir: Optional[str] = None,
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_prompts: List[PromptType] = [{
+    dummy_inputs: List[PromptInputs] = [{
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with get_profiling_context():
-                llm.generate(dummy_prompts,
+            with get_profiling_context(profile_dir):
+                llm.generate(dummy_inputs,
                              sampling_params=sampling_params,
                              use_tqdm=False)
         else:
             start_time = time.perf_counter()
-            llm.generate(dummy_prompts,
+            llm.generate(dummy_inputs,
                          sampling_params=sampling_params,
                          use_tqdm=False)
             end_time = time.perf_counter()
@@ -124,11 +135,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
         run_to_completion(profile_dir=None)
 
     if args.profile_torch or args.profile_rpd:
-        profile_dir = args.profile_dir
-        if not profile_dir:
-            profile_dir = Path(".") / "vllm_benchmark_latency_result"
-        os.makedirs(profile_dir, exist_ok=True)
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
 
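
The block added to `benchmark_latency.py` above (mirrored in `benchmark_throughput.py` below) derives a trace file name from the benchmark configuration and exports it through `VLLM_RPD_PROFILER_DIR`, so the rpd tracer inside vLLM writes to the path the benchmark chose. A condensed sketch of that pattern; the helper name and signature are hypothetical:

```python
import os
from pathlib import Path


def rpd_trace_path(profile_dir, model, input_len, output_len,
                   batch_size, tp_size):
    # Build a descriptive trace name from the benchmark configuration,
    # create the output directory, and export the path for the tracer.
    name = os.path.basename(os.path.normpath(model))
    trace_name = (f"{name}_in_{input_len}_out_{output_len}_"
                  f"batch_{batch_size}_tp_{tp_size}.rpd")
    out_dir = Path(profile_dir or "./vllm_benchmark_latency_result")
    out_dir.mkdir(parents=True, exist_ok=True)
    trace_path = out_dir / trace_name
    os.environ["VLLM_RPD_PROFILER_DIR"] = str(trace_path)
    return trace_path


# Example: /tmp/prof/Llama-2-7b-hf_in_128_out_128_batch_8_tp_1.rpd
print(rpd_trace_path("/tmp/prof", "meta-llama/Llama-2-7b-hf", 128, 128, 8, 1))
```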
24 changes: 17 additions & 7 deletions benchmarks/profiling/benchmark_throughput.py
@@ -5,6 +5,7 @@
 import random
 import time
 from contextlib import contextmanager, nullcontext
+from pathlib import Path
 from typing import List, Optional, Tuple
 
 import torch
@@ -121,15 +122,27 @@ def torch_profiler_context(profile_dir: Optional[str] = None):
         print(p.key_averages().table(sort_by="self_cuda_time_total",
                                      row_limit=-1))
 
-    def get_profiling_context(profile_dir: Optional[str] = None,
-                              trace_file_name=None):
+    def get_profiling_context(profile_dir: Optional[str] = None):
         if args.profile_torch:
-            return torch_profiler_context(profile_dir, trace_file_name)
+            return torch_profiler_context(profile_dir)
         elif args.profile_rpd:
             return rpd_profiler_context()
         else:
             return nullcontext()
 
+    if args.profile_torch or args.profile_rpd:
+        profile_dir = Path(args.profile_dir
+                           or "./vllm_benchmark_throughput_result")
+        profile_dir.mkdir(parents=True, exist_ok=True)
+        name = os.path.basename(os.path.normpath(args.model))
+        model_trace_name = (
+            f"{name}_in_{args.input_len}_out_{args.output_len}_"
+            f"tp_{args.tensor_parallel_size}")
+        print(f"Profiling (results will be saved to '{profile_dir}')...")
+        if args.profile_rpd:
+            profile_dir /= f"{model_trace_name}.rpd"
+            os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_dir)
+
     llm = LLM(
         model=model,
         tokenizer=tokenizer,
@@ -171,10 +184,7 @@ def get_profiling_context(profile_dir: Optional[str] = None,
         ))
 
     if args.profile_torch or args.profile_rpd:
-        profile_dir = args.profile_dir
-        name = os.path.basename(os.path.normpath(args.model))
-        model_trace_name = f"{name}_in_{args.input_len}_out_{args.output_len}"
-        with get_profiling_context(profile_dir, model_trace_name):
+        with get_profiling_context(profile_dir):
             llm.generate(prompts, sampling_params, use_tqdm=True)
         return
     else:
3 changes: 0 additions & 3 deletions vllm/utils.py
@@ -163,11 +163,8 @@ def __init__(self,
                  skip=False):
         self.skip = skip
         if not self.skip:
-            if 'RANK' in os.environ or int(os.getenv('WORLD_SIZE', 1)) > 1:
-                filename = f"{filename}_pid{os.getpid()}"
             self.name = name
             self.args = args if args else ""
-            print(f"filename type {type(filename)}")
             self.rpd = self.initialize_rpd_tracer(filename, nvtx)
 
     def _recreate_cm(self):
