From 66d713c86b1d8790008233fe26674d0972c4ae62 Mon Sep 17 00:00:00 2001
From: Huanyu He
Date: Sat, 22 Jun 2024 09:29:30 -0700
Subject: [PATCH] Add GPU trace for KT.regroup benchmark (#2157)

Summary:
# context
* we are adding fbgemm operators for the KT.regroup function.
* we want a good way to measure the performance besides the runtime.
* **the trace is very important for evaluating the actual performance impact.**
* for example, judging only from the GPU runtime readings, the native-pytorch implementation (`_regroup_keyed_tensors`) seems to perform better than the fbgemm_gpu implementation (`KeyedTensor.regroup`).
* but looking at the CPU/GPU traces, we find that the native-pytorch implementation is actually CPU-bound and has a very negative impact on overall performance.
* a standalone `torch.profiler` sketch for capturing a comparable trace is included at the end of this summary for reference.

# usage
* to generate the trace files in a given path (`.`):
```
buck2 run fbcode//mode/opt fbcode//torchrec/sparse/tests:jagged_tensor_benchmark -- --profile=.
```
```
$ ll *.json
-rw-rw-r-- 1 hhy hhy 8062963 Jun 21 22:21 trace-KeyedTensor.regroup_dup.json
-rw-rw-r-- 1 hhy hhy  943675 Jun 21 22:21 trace-KeyedTensor.regroup.json
-rw-rw-r-- 1 hhy hhy 5140105 Jun 21 22:21 trace-KTRegroupAsDict_dup.json
-rw-rw-r-- 1 hhy hhy  350349 Jun 21 22:21 trace-KTRegroupAsDict.json
-rw-rw-r-- 1 hhy hhy 8025287 Jun 21 22:21 trace-_regroup_keyed_tenors_dup.json
-rw-rw-r-- 1 hhy hhy 8041473 Jun 21 22:21 trace-_regroup_keyed_tenors.json
```

# performance
```
INFO:2024-06-21 22:22:51 1102779:1102779 CuptiCallbackApi.cpp:78] Callback: domain = 3, cbid = 1
INFO:2024-06-21 22:22:51 1102779:1102779 CuptiActivityProfiler.cpp:241] CUDA versions. CUPTI: 18; Runtime: 12000; Driver: 12000
INFO:2024-06-21 22:22:51 1102779:1102779 NcclProfiler.cpp:150] NCCL Profiler Instantiated
_regroup_keyed_tenors     | B: 1024 | F: 1020 | device: cuda | Runtime (P90): 2.8 ms | Memory (P90): 1011.0
KeyedTensor.regroup       | B: 1024 | F: 1020 | device: cuda | Runtime (P90): 5.0 ms | Memory (P90): 1517.0
KTRegroupAsDict           | B: 1024 | F: 1020 | device: cuda | Runtime (P90): 4.9 ms | Memory (P90): 1517.0
_regroup_keyed_tenors_dup | B: 1024 | F: 1020 | device: cuda | Runtime (P90): 2.5 ms | Memory (P90): 1011.0
KeyedTensor.regroup_dup   | B: 1024 | F: 1020 | device: cuda | Runtime (P90): 2.5 ms | Memory (P90): 1011.0
KTRegroupAsDict_dup       | B: 1024 | F: 1020 | device: cuda | Runtime (P90): 2.5 ms | Memory (P90): 1011.0
```

# traces
* _regroup_keyed_tenors {F1712147044}
* KeyedTensor.regroup {F1712148863}
* KTRegroupAsDict {F1712150411}
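# reference
* a minimal, standalone sketch (not part of this diff) of how a comparable Chrome trace could be captured with `torch.profiler`; the benchmark itself goes through `benchmark_utils.benchmark`, which writes the `trace-*.json` files shown above into the `--profile` directory, so the keys, shapes, and output file name below are illustrative only:
```
# standalone sketch, assuming a CUDA device is available;
# keys, shapes, and the output path are made up for illustration
import torch
from torch.profiler import ProfilerActivity, profile

from torchrec.sparse.jagged_tensor import KeyedTensor

device = torch.device("cuda")

# two toy KeyedTensors with two keys each (batch 1024, dense dim 8)
kts = [
    KeyedTensor.from_tensor_list(
        keys=[f"f_{i}_{j}" for j in range(2)],
        tensors=[torch.randn(1024, 8, device=device) for _ in range(2)],
    )
    for i in range(2)
]
# regroup across the two KeyedTensors
groups = [["f_0_0", "f_1_1"], ["f_0_1", "f_1_0"]]

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    tensors = KeyedTensor.regroup(kts, groups)
    torch.cuda.synchronize()

prof.export_chrome_trace("trace-KeyedTensor.regroup-manual.json")
```
* the exported json can be opened in chrome://tracing or Perfetto to compare the CPU-side launch overhead against the actual GPU kernel time, which is what makes the CPU-bound behavior of `_regroup_keyed_tensors` visible.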
Differential Revision: D58906521
---
 .../distributed/benchmark/benchmark_utils.py  |   6 +-
 .../sparse/tests/jagged_tensor_benchmark.py   | 123 ++++++++++--------
 2 files changed, 71 insertions(+), 58 deletions(-)

diff --git a/torchrec/distributed/benchmark/benchmark_utils.py b/torchrec/distributed/benchmark/benchmark_utils.py
index 1236a8a13..cb2f99828 100644
--- a/torchrec/distributed/benchmark/benchmark_utils.py
+++ b/torchrec/distributed/benchmark/benchmark_utils.py
@@ -492,9 +492,9 @@ def fx_script_module(eager_module: torch.nn.Module) -> torch.nn.Module:
 def benchmark(
     name: str,
     model: torch.nn.Module,
-    warmup_inputs: List[KeyedJaggedTensor],
-    bench_inputs: List[KeyedJaggedTensor],
-    prof_inputs: List[KeyedJaggedTensor],
+    warmup_inputs: Union[List[KeyedJaggedTensor], List[Dict[str, Any]]],
+    bench_inputs: Union[List[KeyedJaggedTensor], List[Dict[str, Any]]],
+    prof_inputs: Union[List[KeyedJaggedTensor], List[Dict[str, Any]]],
     world_size: int,
     output_dir: str,
     num_benchmarks: int,
diff --git a/torchrec/sparse/tests/jagged_tensor_benchmark.py b/torchrec/sparse/tests/jagged_tensor_benchmark.py
index 1745910ea..5b02bb618 100644
--- a/torchrec/sparse/tests/jagged_tensor_benchmark.py
+++ b/torchrec/sparse/tests/jagged_tensor_benchmark.py
@@ -40,6 +40,7 @@ def bench(
     run_backward: bool,
     fn: Callable[..., List[torch.Tensor]],
     fn_kwargs: Dict[str, Any],
+    output_dir: str = "",
 ) -> None:

     # initial call
@@ -49,8 +50,8 @@ def wrapped_func(
         model: torch.nn.Module,  # not used
         bench_inputs: List[KeyedJaggedTensor],  # not used
         fn: Callable[..., List[torch.Tensor]],
-        fn_kwargs: Dict[str, Any],
         run_backward: bool,
+        **kwargs: Dict[str, Any],
     ) -> None:
         result = fn(**fn_kwargs)
         if run_backward:
@@ -64,26 +65,27 @@ def wrapped_func(
             loss = torch.nn.functional.l1_loss(pred, labels)
             loss.sum().backward()

+    model = DummyModel()
+    setattr(model, "forward", lambda kwargs: fn(**kwargs))
     if device_type == "cuda":
         result = benchmark(
             name=name,
-            model=DummyModel(),
-            warmup_inputs=[],
+            model=model,
+            warmup_inputs=[fn_kwargs] * 10,
             bench_inputs=[],
-            prof_inputs=[],
+            prof_inputs=[fn_kwargs] * 10,
             world_size=1,
-            output_dir="",
+            output_dir=output_dir,
             num_benchmarks=20,
             func_to_benchmark=functools.partial(
                 wrapped_func, fn=fn, run_backward=run_backward, fn_kwargs=fn_kwargs
             ),
             benchmark_func_kwargs={},
             rank=0,
-            enable_logging=False,
+            enable_logging=True,
         )

     else:  # cpu
-        model = DummyModel()
         times = timeit.repeat(
             lambda: wrapped_func(
                 model=model,
@@ -160,6 +162,12 @@ def wrapped_func(
     default=2,
     help="Total num of regrouping",
 )
+@click.option(
+    "--profile",
+    type=str,
+    default="",
+    help="profile output directory",
+)
 def main(
     cuda_matrix: bool,
     run_backward: bool,
@@ -170,6 +178,7 @@ def main(
     dim_sparse: int,
     batch_size: int,
     n_groups: int,
+    profile: str,
 ) -> None:
     if cuda_matrix:
         n_denses = [64, 128, 256, 512, 1024]
@@ -184,54 +193,58 @@ def main(

     for device_type in device_types:
         for batch_size in batch_sizes:
-            for n_dense, n_sparse in zip(n_denses, n_sparses):
-
-                device = torch.device(device_type)
-                kts = build_kts(
-                    n_dense,
-                    n_sparse,
-                    dim_dense,
-                    dim_sparse,
-                    batch_size,
-                    device,
-                    run_backward,
-                )
-                labels = torch.randint(
-                    0, 1, (batch_size,), device=torch.device(device_type)
-                ).float()
-                groups = build_groups(kts, n_groups)
-                bench(
-                    "[fallback] _regroup_keyed_tenors",
-                    labels,
-                    batch_size,
-                    n_dense + n_sparse,
-                    device_type,
-                    run_backward,
-                    _regroup_keyed_tensors,
-                    {"keyed_tensors": kts, "groups": groups},
-                )
-                bench(
-                    "[prod] KeyedTensor.regroup",
-                    labels,
-                    batch_size,
-                    n_dense + n_sparse,
-                    device_type,
-                    run_backward,
-                    KeyedTensor.regroup,
-                    {"keyed_tensors": kts, "groups": groups},
-                )
-                bench(
-                    "[prod] KTRegroupAsDict",
-                    labels,
-                    batch_size,
-                    n_dense + n_sparse,
-                    device_type,
-                    run_backward,
-                    KTRegroupAsDict(
-                        groups=groups, keys=[str(i) for i in range(n_groups)]
-                    ),
-                    {"keyed_tensors": kts},
-                )
+            for duplicates in [False, True]:
+                for n_dense, n_sparse in zip(n_denses, n_sparses):
+                    dup = "_dup" if duplicates else ""
+                    device = torch.device(device_type)
+                    kts = build_kts(
+                        n_dense,
+                        n_sparse,
+                        dim_dense,
+                        dim_sparse,
+                        batch_size,
+                        device,
+                        run_backward,
+                    )
+                    labels = torch.randint(
+                        0, 1, (batch_size,), device=torch.device(device_type)
+                    ).float()
+                    groups = build_groups(kts, n_groups, duplicates=duplicates)
+                    bench(
+                        "_regroup_keyed_tenors" + dup,
+                        labels,
+                        batch_size,
+                        n_dense + n_sparse,
+                        device_type,
+                        run_backward,
+                        _regroup_keyed_tensors,
+                        {"keyed_tensors": kts, "groups": groups},
+                        profile,
+                    )
+                    bench(
+                        "KeyedTensor.regroup" + dup,
+                        labels,
+                        batch_size,
+                        n_dense + n_sparse,
+                        device_type,
+                        run_backward,
+                        KeyedTensor.regroup,
+                        {"keyed_tensors": kts, "groups": groups},
+                        profile,
+                    )
+                    bench(
+                        "KTRegroupAsDict" + dup,
+                        labels,
+                        batch_size,
+                        n_dense + n_sparse,
+                        device_type,
+                        run_backward,
+                        KTRegroupAsDict(
+                            groups=groups, keys=[str(i) for i in range(n_groups)]
+                        ),
+                        {"keyed_tensors": kts},
+                        profile,
+                    )


 if __name__ == "__main__":