def benchmark_func(
    name: str,
    bench_inputs: List[Dict[str, Any]],
    prof_inputs: List[Dict[str, Any]],
    world_size: int,
    profile_dir: str,
    num_benchmarks: int,
    num_profiles: int,
    # pyre-ignore[2]
    func_to_benchmark: Any,
    benchmark_func_kwargs: Optional[Dict[str, Any]],
    rank: int,
    device_type: str = "cuda",
) -> BenchmarkResult:
    """
    Time, and optionally profile, repeated calls of ``func_to_benchmark``.

    Every timed run calls
    ``func_to_benchmark(bench_inputs, **benchmark_func_kwargs)``. On ``"cuda"``
    the runs are timed with CUDA events and the first run is left out of the
    reported timings; on ``"cpu"`` they are timed with ``timeit.repeat`` and
    every run is reported.

    Args:
        name: label stored as ``short_name`` in the result and used in the
            profiler log lines and the trace file name.
        bench_inputs: first positional argument of every timed run.
        prof_inputs: first positional argument of every profiled run.
        world_size: number of CUDA devices that are reset, synchronized and
            measured when ``rank == -1``.
        profile_dir: directory the chrome trace is written to. An empty
            string disables profiling. Profiling only runs on ``"cuda"``.
        num_benchmarks: number of timed runs.
        num_profiles: number of profiled runs.
        func_to_benchmark: the callable under test.
        benchmark_func_kwargs: extra keyword arguments for
            ``func_to_benchmark``; ``None`` means no extra arguments.
        rank: CUDA device index used by this process, or ``-1`` when there is
            no process per rank and this process covers all devices.
        device_type: ``"cuda"`` or ``"cpu"``. Any other value performs no
            timed runs and produces an empty ``elapsed_time``.

    Returns:
        A ``BenchmarkResult`` whose ``elapsed_time`` holds one entry per
        reported run in milliseconds and whose ``max_mem_allocated`` holds the
        peak allocation per measured device in MiB (empty unless on CUDA).
    """
    # The kwargs are unwrapped with ** below, so None must become a mapping.
    call_kwargs: Dict[str, Any] = (
        {} if benchmark_func_kwargs is None else benchmark_func_kwargs
    )
    on_cuda = device_type == "cuda"
    # rank == -1: no process per rank, so this process handles every device
    # in [0, world_size). Otherwise it only handles its own device.
    devices: List[int] = list(range(world_size)) if rank == -1 else [rank]

    max_mem_allocated: List[int] = []
    if on_cuda:
        # Reset the peak counters so the reading taken after the timed runs
        # is not polluted by earlier allocations.
        for di in devices:
            torch.cuda.reset_peak_memory_stats(di)

        start = [torch.cuda.Event(enable_timing=True) for _ in range(num_benchmarks)]
        end = [torch.cuda.Event(enable_timing=True) for _ in range(num_benchmarks)]
        for start_event, end_event in zip(start, end):
            start_event.record()
            func_to_benchmark(bench_inputs, **call_kwargs)
            end_event.record()

        # Events are only readable once the recorded work has finished.
        for di in devices:
            torch.cuda.synchronize(di)

        # TODO: First Benchmark Run for Eager Mode produces outlier
        # Start counting after first as workaround for standard deviation
        elapsed_time = torch.tensor(
            [
                start_event.elapsed_time(end_event)
                for start_event, end_event in zip(start[1:], end[1:])
            ]
        )

        # Peak bytes -> MiB. With rank == -1 there is one entry per device,
        # otherwise a single entry for this rank's device.
        max_mem_allocated = [
            torch.cuda.max_memory_allocated(di) // 1024 // 1024 for di in devices
        ]
    else:
        times: List[float] = []
        if device_type == "cpu":
            times = timeit.repeat(
                lambda: func_to_benchmark(bench_inputs, **call_kwargs),
                number=1,
                repeat=num_benchmarks,
            )
        # timeit reports seconds; scale to milliseconds to match CUDA events.
        elapsed_time = torch.tensor(times) * 1e3

    # Only do profiling if an output directory is set (CUDA only).
    if profile_dir != "" and on_cuda:

        # pyre-ignore[2]
        def trace_handler(prof) -> None:
            total_average = prof.profiler.total_average()
            logger.info(f" TOTAL_AVERAGE:\n{name}\n{total_average}")
            if rank != 0:
                # only 1 rank should output in pg case, rank = 0
                # NOTE(review): this also suppresses the trace when
                # rank == -1 -- confirm that is intended.
                return
            trace_file: str = f"{profile_dir}/trace-{name}.json"
            logger.info(f" PROFILE[{name}].chrome_trace:{trace_file}")
            prof.export_chrome_trace(trace_file)

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            record_shapes=True,
            profile_memory=True,
            with_flops=True,
            with_modules=True,
            on_trace_ready=trace_handler,
        ) as p:
            for i in range(num_profiles):
                with record_function(f"## profile {i} ##"):
                    func_to_benchmark(prof_inputs, **call_kwargs)
                    p.step()

        # Drain outstanding work before returning. Sync this rank's own
        # device, like the timing section above, rather than whichever
        # device happens to be current.
        sync_devices = (
            list(range(torch.cuda.device_count())) if rank == -1 else [rank]
        )
        for di in sync_devices:
            torch.cuda.synchronize(di)

    return BenchmarkResult(
        short_name=name,
        elapsed_time=elapsed_time,
        max_mem_allocated=max_mem_allocated,
        rank=rank,
    )
# Maps each value accepted by the --pipeline option (other than "all") to the
# train pipeline class it selects.
_pipeline_cls: Dict[str, Type[Union[TrainPipelineBase, TrainPipelineSparseDist]]] = {
    "base": TrainPipelineBase,
    "sparse": TrainPipelineSparseDist,
    "semi": TrainPipelineSemiSync,
    "prefetch": PrefetchTrainPipelineSparseDist,
}


def _gen_pipelines(
    pipelines: str,
) -> List[Type[Union[TrainPipelineBase, TrainPipelineSparseDist]]]:
    """
    Resolve a pipeline selector into the list of pipeline classes to run.

    Args:
        pipelines: ``"all"`` for every registered pipeline, or one of the
            keys of ``_pipeline_cls``.

    Returns:
        All registered classes, in registration order, for ``"all"``;
        otherwise a one-element list with the selected class.

    Raises:
        KeyError: if ``pipelines`` is neither ``"all"`` nor a registered key.
            The message lists the accepted values.
    """
    if pipelines == "all":
        return list(_pipeline_cls.values())
    try:
        return [_pipeline_cls[pipelines]]
    except KeyError:
        choices = ", ".join(["all", *_pipeline_cls])
        # Same exception type as the plain dict lookup, but with a message
        # that tells the user what would have been valid.
        raise KeyError(
            f"unknown pipeline {pipelines!r}; expected one of: {choices}"
        ) from None
@@ -125,18 +170,34 @@ def main( batch_size=batch_size, world_size=world_size, pooling_factor=pooling_factor, + input_type=input_type, ) - _run_multi_process_test( - callable=runner, - tables=tables, - weighted_tables=weighted_tables, - sharding_type=ShardingType.TABLE_WISE.value, - kernel_type=EmbeddingComputeKernel.FUSED.value, - batches=batches, - fused_params={}, - world_size=world_size, - ) + if multi_process: + _run_multi_process_test( + callable=runner, + tables=tables, + weighted_tables=weighted_tables, + sharding_type=ShardingType.TABLE_WISE.value, + kernel_type=EmbeddingComputeKernel.FUSED.value, + batches=batches, + fused_params={}, + world_size=world_size, + pipelines=pipeline, + profile=profile, + ) + else: + single_runner( + tables=tables, + weighted_tables=weighted_tables, + sharding_type=ShardingType.TABLE_WISE.value, + kernel_type=EmbeddingComputeKernel.FUSED.value, + batches=batches, + fused_params={}, + world_size=1, + pipelines=pipeline, + profile=profile, + ) def _run_multi_process_test( @@ -179,6 +240,7 @@ def _generate_data( batch_size: int = 4096, world_size: int = 1, pooling_factor: int = 10, + input_type: str = "kjt", ) -> List[List[ModelInput]]: return [ ModelInput.generate( @@ -238,6 +300,8 @@ def runner( fused_params: Dict[str, Any], world_size: int, batches: List[List[ModelInput]], + pipelines: str, + profile: str, ) -> None: torch.autograd.set_detect_anomaly(True) @@ -269,12 +333,7 @@ def runner( }, ) bench_inputs = [batch[rank] for batch in batches] - for pipeline_clazz in [ - TrainPipelineBase, - TrainPipelineSparseDist, - TrainPipelineSemiSync, - PrefetchTrainPipelineSparseDist, - ]: + for pipeline_clazz in _gen_pipelines(pipelines=pipelines): if pipeline_clazz == TrainPipelineSemiSync: # pyre-ignore [28] pipeline = pipeline_clazz( @@ -292,8 +351,8 @@ def runner( pipeline.progress(iter(bench_inputs)) def _func_to_benchmark( - model: nn.Module, bench_inputs: List[ModelInput], + model: nn.Module, pipeline: TrainPipeline, ) -> None: 
def single_runner(
    tables: List[EmbeddingBagConfig],
    weighted_tables: List[EmbeddingBagConfig],
    sharding_type: str,
    kernel_type: str,
    fused_params: Dict[str, Any],
    world_size: int,
    batches: List[List[ModelInput]],
    pipelines: str,
    profile: str,
) -> None:
    """
    Benchmark the selected train pipelines inside the current process.

    A ``TestSparseNN`` is built directly on the CUDA device (it is not wrapped
    or sharded here), each selected pipeline is warmed up with one
    ``progress`` call, then handed to ``benchmark_func`` with ``rank=0``. One
    summary line per pipeline is printed.

    Args:
        tables: unweighted embedding table configs for the model.
        weighted_tables: weighted embedding table configs for the model.
        sharding_type: not read by this function.
        kernel_type: not read by this function.
        fused_params: not read by this function.
        world_size: forwarded unchanged to ``benchmark_func``.
        batches: per-step lists of inputs; only element ``0`` of each step
            is used.
        pipelines: selector understood by ``_gen_pipelines``.
        profile: forwarded to ``benchmark_func`` as ``profile_dir``.
    """

    def _drain(
        bench_inputs: List[ModelInput],
        model: nn.Module,
        pipeline: TrainPipeline,
    ) -> None:
        # Keep stepping the pipeline until its data iterator runs dry.
        batch_iter = iter(bench_inputs)
        try:
            while True:
                pipeline.progress(batch_iter)
        except StopIteration:
            pass

    torch.autograd.set_detect_anomaly(True)
    device = torch.device("cuda")
    model = TestSparseNN(
        tables=tables,
        weighted_tables=weighted_tables,
        dense_device=device,
        sparse_device=device,
        over_arch_clazz=TestOverArchLarge,
    ).to(device)

    # The SGD optimizer only receives parameters without "sparse" in the name.
    dense_params = []
    for param_name, param in model.named_parameters():
        if "sparse" not in param_name:
            dense_params.append(param)
    optimizer = optim.SGD(dense_params, lr=0.1)

    first_slot_inputs = [per_step[0] for per_step in batches]

    for pipeline_type in _gen_pipelines(pipelines=pipelines):
        # TrainPipelineSemiSync is the only pipeline given start_batch here.
        extra_kwargs: Dict[str, Any] = {}
        if pipeline_type == TrainPipelineSemiSync:
            extra_kwargs["start_batch"] = 0
        # pyre-ignore [28]
        pipeline = pipeline_type(
            model=model,
            optimizer=optimizer,
            device=device,
            **extra_kwargs,
        )

        # One un-timed step before measuring.
        pipeline.progress(iter(first_slot_inputs))

        result = benchmark_func(
            name=pipeline_type.__name__,
            bench_inputs=first_slot_inputs,  # pyre-ignore
            prof_inputs=first_slot_inputs,  # pyre-ignore
            num_benchmarks=5,
            num_profiles=2,
            profile_dir=profile,
            world_size=world_size,
            func_to_benchmark=_drain,
            benchmark_func_kwargs={"model": model, "pipeline": pipeline},
            rank=0,
        )

        print(
            f" {pipeline_type.__name__: <{35}} | Runtime (P90): {result.runtime_percentile(90)/1000:5.3f} s | Memory (P90): {result.max_mem_percentile(90)/1000:5.3f} GB"
        )