From 379db5f99f62c5a7227bfed72aaf8a966220e84d Mon Sep 17 00:00:00 2001
From: aakbarza
Date: Fri, 17 Jan 2025 10:41:20 -0800
Subject: [PATCH] Profile with kineto and warmup for more accurate benchmarking (#3585)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/3585

X-link: https://github.com/facebookresearch/FBGEMM/pull/667

**Summary**

This PR introduces:
- A new warm-up method to ensure sufficient GPU preparation before benchmarking.
- Benchmark time calculation using the Kineto profiler to measure the time and bandwidth of the inference forward kernels.

**Motivation**

In small benchmark cases, kernel launch and synchronization overheads can be significant compared to the actual kernel runtime. By leveraging the Kineto profiler:
- These overheads are eliminated.
- Users get a more accurate estimate of the execution time and bandwidth of the forward kernel.

For small kernels, iteration-based warm-up might not be sufficient. By leveraging the time-based warm-up:
- Users can be confident that the GPU has done enough warm-up.

**Test instruction**

The script below shows how to use these features:

python bench/split_table_batched_embeddings_benchmark.py nbit-device-with-spec --export-trace --warmup-ms 50

Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/3580

Reviewed By: leitian

Differential Revision: D68292871

Pulled By: q10

fbshipit-source-id: 0a90cddcf07780164e38ac1b945717d8456947c0
---
 fbgemm_gpu/bench/bench_utils.py | 53 +++++-
 ...plit_table_batched_embeddings_benchmark.py | 177 ++++++++++++++++--
 2 files changed, 203 insertions(+), 27 deletions(-)

diff --git a/fbgemm_gpu/bench/bench_utils.py b/fbgemm_gpu/bench/bench_utils.py
index 830415945c..2f9dc8702c 100644
--- a/fbgemm_gpu/bench/bench_utils.py
+++ b/fbgemm_gpu/bench/bench_utils.py
@@ -30,6 +30,28 @@
 logging.basicConfig(level=logging.DEBUG)
 
 
+def warmup(
+    request: TBERequest,
+    warmup_ms: int,
+    warmup_runs: int,
+    func: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], torch.Tensor],
+    bwd_only: bool = False,
+    grad: Optional[torch.Tensor] = None,
+) -> None:
+    indices, offsets, weights = request.unpack_3()
+    if warmup_ms:
+        start_time_ms = time.time() * 1000
+        while time.time() * 1000 - start_time_ms < warmup_ms:
+            out = func(indices, offsets, weights)
+            if bwd_only:
+                out.backward(grad)
+    else:
+        for _ in range(warmup_runs):
+            out = func(indices, offsets, weights)
+            if bwd_only:
+                out.backward(grad)
+
+
 def benchmark_torch_function(  # noqa: C901
     # pyre-fixme[2]: Parameter must be annotated.
     f,
@@ -159,19 +181,30 @@ def benchmark_requests(
     # Can be used to clear model's stats after warmup for example.
     callback_after_warmup: Optional[Callable[[], None]] = None,
     periodic_logs: bool = False,
+    warmup_ms: Optional[int] = None,
 ) -> float:
     times = []
-    # Run at least one warmup iteration to avoid the long cudaLaunchKernel time
-    # for the first kernel
-    num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
-
-    if num_warmups > 0:
-        indices, offsets, weights = requests[0].unpack_3()
-        for _ in range(num_warmups):
-            out = func(indices, offsets, weights)
-            if bwd_only:
-                out.backward(grad)
+    # for the first kernel if warmup_ms > 0
+    # warmup_ms is prioritized over num_warmups
+
+    if warmup_ms is None:
+        num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
+
+    # warm-up the GPU before profiling
+    warmup(
+        requests[0],
+        # pyre-ignore[6]
+        warmup_ms,
+        num_warmups,
+        lambda indices, offsets, per_sample_weights: func(
+            indices,
+            offsets,
+            per_sample_weights,
+        ),
+        bwd_only=bwd_only,
+        grad=grad,
+    )
 
     if callback_after_warmup is not None:
         callback_after_warmup()
 
diff --git a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
index f439ed6780..9d0a5c8be4 100644
--- a/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py
@@ -77,6 +77,7 @@
         benchmark_torch_function,
         benchmark_vbe,
         fill_random_scale_bias,
+        warmup,
     )
 else:
     from fbgemm_gpu.bench.bench_utils import (
@@ -87,12 +88,28 @@
         benchmark_torch_function,
         benchmark_vbe,
         fill_random_scale_bias,
+        warmup,
     )
 
 
 logging.basicConfig(level=logging.DEBUG)
 
 
+def kineto_trace_profiler(p: profile, trace_info: tuple[str, str, str, str]) -> float:
+    phase, trace_url, tbe_type, kern_name = trace_info
+    p.export_chrome_trace(
+        trace_url.format(tbe_type=tbe_type, phase=phase, ospid=os.getpid())
+    )
+    kernel_time = 0
+    for event in p.key_averages():
+        # Sum the total time of forward kernel runs
+        if kern_name in event.key:
+            kernel_time += event.device_time
+    assert kernel_time > 0
+    print(f"Total CUDA time: {kernel_time:.2f} ")
+    return kernel_time
+
+
 @click.group()
 def cli() -> None:
     pass
@@ -323,7 +340,6 @@ def device(  # noqa C901
     logging.info(
         f"Accessed weights per batch: {B * sum(Ds) * L * param_size_multiplier / 1.0e9: .2f} GB"
     )
-
     requests = generate_requests(
         iters,
         B,
@@ -1135,6 +1151,7 @@ def nbit_cpu(  # noqa C901
 @click.option("--iters", default=100)
 @click.option("--runs-of-iters", default=5)
 @click.option("--warmup-runs", default=2)
+@click.option("--warmup-ms", type=int, default=None)
 @click.option("--output-dtype", type=SparseType, default=SparseType.FP16)
 @click.option("--report-aibench", is_flag=True)
 @click.option("--run-reference", is_flag=True, default=False)
@@ -1148,6 +1165,17 @@ def nbit_cpu(  # noqa C901
     type=str,
     default="{tbe_type}_tbe_{phase}_trace_{ospid}.json",
 )
+@click.option(
+    "--warmup-runs",
+    default=2,
+    help="Number of warmup runs. Ignored if --warmup-ms is set.",
+)
+@click.option(
+    "--warmup-ms",
+    type=int,
+    default=None,
+    help="Warmup duration in milliseconds. Disables the --warmup-runs option.",
+)
 def nbit_device(  # noqa C901
     alpha: float,
     bag_size: int,
@@ -1168,7 +1196,6 @@ def nbit_device(  # noqa C901
     check_median: bool,
     iters: int,
     runs_of_iters: int,
-    warmup_runs: int,
     output_dtype: SparseType,
     report_aibench: bool,
     run_reference: bool,
@@ -1178,6 +1205,8 @@ def nbit_device(  # noqa C901
     fp8_exponent_bias: Optional[int],
     export_trace: bool,
     trace_url: str,
+    warmup_runs: int,
+    warmup_ms: Optional[int],
 ) -> None:
     np.random.seed(42)
     torch.manual_seed(42)
@@ -1295,6 +1324,7 @@ def nbit_device(  # noqa C901
                 per_sample_weights,
             ),
             check_median=check_median,
+            warmup_ms=warmup_ms,
         )
 
         # free up GPU memory
@@ -1324,18 +1354,6 @@ def nbit_device(  # noqa C901
             f"Memory Usage For Pruning: {mem_for_pruning / 1.0e9:.0f} GB"
         )
 
-    # Get trace for one run of iter
-    tbe_type: str = "split"
-
-    def _kineto_trace_handler(p: profile, phase: str) -> None:
-        p.export_chrome_trace(
-            trace_url.format(tbe_type=tbe_type, phase=phase, ospid=os.getpid())
-        )
-
-    # pyre-ignore[3]
-    def context_factory(on_trace_ready: Callable[[profile], None]):
-        return profile(on_trace_ready=on_trace_ready) if export_trace else nullcontext()
-
     requests = generate_requests(
         iters,
         B,
@@ -1353,7 +1371,35 @@ def context_factory(on_trace_ready: Callable[[profile], None]):
         for req in requests
     ]
 
-    with context_factory(lambda p: _kineto_trace_handler(p, "fwd")):
+    # pyre-ignore[3]
+    def context_factory(on_trace_ready: Callable[[profile], None]):
+        return profile(on_trace_ready=on_trace_ready) if export_trace else nullcontext()
+
+    # Get trace for one run of iter
+    tbe_type: str = "split"
+    # input of the kineto_trace_profiler
+    trace_info = ("fwd", trace_url, tbe_type, "embedding_codegen_forward")
+    time_dict = {"kernel_time": None}  # dict to hold the kernel time
+
+    # warm-up right before profiling
+    # warmup_ms prioritized over warmup_runs
+    if warmup_ms or warmup_runs:
+        warmup(
+            requests[0],
+            # pyre-ignore[6]
+            warmup_ms,
+            warmup_runs,
+            lambda indices, offsets, per_sample_weights: emb.forward(
+                indices.int(),
+                offsets.int(),
+                per_sample_weights,
+            ),
+        )
+
+    with context_factory(
+        # pyre-ignore[6]
+        lambda p: time_dict.update(kernel_time=kineto_trace_profiler(p, trace_info))
+    ):
         # forward
         time_per_iter = benchmark_requests(
             requests,
@@ -1364,6 +1410,21 @@ def context_factory(on_trace_ready: Callable[[profile], None]):
                 per_sample_weights,
             ),
             check_median=check_median,
         )
+
+    if export_trace:
+        kernel_time = time_dict["kernel_time"]
+        # pyre-ignore[58]
+        bandwidth = read_write_bytes / kernel_time / 1.0e3
+
+        logging.info(
+            f"kineto profiled stats: "
+            f"{weights_precision} Forward, B: {B}, "
+            f"E: {E}, T: {T}, D: {D}, L: {L}, W: {weighted}, "
+            f"BW: {bandwidth: .2f} GB/s, "  # noqa: B950
+            f"Time: {kernel_time:.0f}us, "
+            f"Memory Usage For Pruning: {mem_for_pruning / 1.0e9:.0f} GB"
+        )
+
     # free up GPU memory
     del requests
@@ -1465,12 +1526,28 @@ def context_factory(on_trace_ready: Callable[[profile], None]):
 @click.option("--check-median", is_flag=True, default=True)
 @click.option("--iters", default=100)
 @click.option("--runs-of-iters", default=5)
-@click.option("--warmup-runs", default=2)
 @click.option("--output-dtype", type=SparseType, default=SparseType.FP16)
 @click.option("--report-aibench", is_flag=True)
 @click.option("--fp8-exponent-bits", type=int, default=None)
 @click.option("--fp8-exponent-bias", type=int, default=None)
 @click.option("--use-cpu", is_flag=True, default=False)
+@click.option("--export-trace", is_flag=True, default=False)
+@click.option(
+    "--trace-url",
+    type=str,
default="{tbe_type}_tbe_spec_{phase}_trace_{ospid}.json", +) +@click.option( + "--warmup-runs", + default=2, + help="Number of warmup runs. Ignored if --warmup-ms is set.", +) +@click.option( + "--warmup-ms", + type=int, + default=None, + help="Warmup duration in milliseconds. Disables the --run-nums option.", +) def nbit_device_with_spec( # noqa C901 alpha: float, bag_size_list: str, @@ -1490,12 +1567,15 @@ def nbit_device_with_spec( # noqa C901 check_median: bool, iters: int, runs_of_iters: int, - warmup_runs: int, output_dtype: SparseType, report_aibench: bool, fp8_exponent_bits: Optional[int], fp8_exponent_bias: Optional[int], use_cpu: bool, + export_trace: bool, + trace_url: str, + warmup_runs: int, + warmup_ms: Optional[int], ) -> None: np.random.seed(42) torch.manual_seed(42) @@ -1607,6 +1687,7 @@ def nbit_device_with_spec( # noqa C901 ) times = [] + kineto_request = [] for i in range(runs_of_iters): # Generate a request for each table then combine all_requests = { @@ -1683,8 +1764,13 @@ def nbit_device_with_spec( # noqa C901 per_sample_weights, ), check_median=check_median, + warmup_ms=warmup_ms, ) + # copy the request of last iteration for kineto profile benchmark + if i == runs_of_iters - 1: + kineto_request = requests + # free up memory del requests @@ -1712,6 +1798,63 @@ def nbit_device_with_spec( # noqa C901 f"Memory Usage For Pruning: {mem_for_pruning / 1.0e9:.0f} GB" ) + # pyre-ignore[3] + def context_factory(on_trace_ready: Callable[[profile], None]): + return profile(on_trace_ready=on_trace_ready) if export_trace else nullcontext() + + if not use_cpu: + # profile with kineto + tbe_type: str = "split" + time_dict = {"kernel_time": None} # Shared variable to hold the kernel time + trace_info = ("fwd", trace_url, tbe_type, "embedding_codegen_forward") + + # warm-up right before profiling + # warmup_ms prioritized over warmup_runs + if warmup_ms or warmup_runs: + warmup( + kineto_request[0], + # pyre-ignore[6] + warmup_ms, + warmup_runs, + lambda indices, offsets, per_sample_weights: emb.forward( + indices.int(), + offsets.int(), + per_sample_weights, + ), + ) + + with context_factory( + # pyre-ignore[6] + lambda p: time_dict.update(kernel_time=kineto_trace_profiler(p, trace_info)) + ): + # forward + time_per_iter = benchmark_requests( + kineto_request, + lambda indices, offsets, per_sample_weights: emb.forward( + indices.int(), + offsets.int(), + per_sample_weights, + ), + check_median=check_median, + ) + + if export_trace: + kernel_time = time_dict["kernel_time"] + # pyre-ignore[6] + bandwidth = read_write_bytes / kernel_time / 1.0e3 + + logging.info( + f"kineto profiled stats: " + f"{weights_precision} Forward, B: {B}, " + f"E: {E}, T: {T}, D: {D}, L: {L}, W: {weighted}, " + f"BW: {bandwidth: .2f} GB/s, " # noqa: B950 + f"Time: {kernel_time:.0f}us, " + f"Memory Usage For Pruning: {mem_for_pruning / 1.0e9:.0f} GB" + ) + + # free up memory + del kineto_request + if report_aibench and haveAIBench: print( emitMetric(