From 554ecc7567a758685a7604594006d4a71ae5337c Mon Sep 17 00:00:00 2001
From: Vishal Agarwal
Date: Thu, 19 Sep 2024 09:38:43 +0530
Subject: [PATCH 1/3] fix memory usage print

---
 benchmark/python/benchmark_e2e.py | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/benchmark/python/benchmark_e2e.py b/benchmark/python/benchmark_e2e.py
index f5c3ace8c..e0513411a 100644
--- a/benchmark/python/benchmark_e2e.py
+++ b/benchmark/python/benchmark_e2e.py
@@ -44,7 +44,7 @@ def monitor_gpu_memory():
 
         memory_usage = result.stdout.splitlines()
 
-        if len(memory_usage) > 1:
+        if len(memory_usage) >= 1:
             gpu_memory = [float(line) for line in memory_usage]
             current_peak = round(max(gpu_memory) / 1024, 2)
             with peak_memory_lock:
@@ -137,7 +137,7 @@ def save_results(args, results, filename, print_memory_usage=False):
         if IS_NVIDIA_SYSTEM:
             columns.append("peak_gpu_memory (GiB)")
         else:
-            columns.append("peak_cpu_memory(GiB)")
+            columns.append("peak_cpu_memory (GiB)")
 
     df = pd.DataFrame(
         results,
@@ -165,6 +165,12 @@
         record.metrics.customized["sampling_latency_ms"] = row["Sampling Latency (ms)"]
         record.metrics.customized["wall_clock_throughput_tps"] = row["Wall Clock Throughput (tps)"]
         record.metrics.customized["wall_clock_time_s"] = row["Wall Clock Time (s)"]
+
+        if print_memory_usage:
+            if IS_NVIDIA_SYSTEM:
+                record.metrics.customized["peak_gpu_memory_gb"] = row["peak_gpu_memory (GiB)"]
+            else:
+                record.metrics.customized["peak_cpu_memory_gb"] = row["peak_cpu_memory (GiB)"]
 
         records.append(record)
 
@@ -178,6 +184,13 @@ def run_benchmark_memory(args, batch_size, prompt_length, generation_length, max
     This function is to run benchmark and print the momory usage
     """
     global stop_monitoring
+    global peak_gpu_memory
+    global peak_cpu_memory
+
+    # Reset the peak memory variables and the monitoring flag
+    stop_monitoring = False
+    peak_gpu_memory = 0.0
+    peak_cpu_memory = 0.0
 
     if IS_NVIDIA_SYSTEM:
         monitor_thread = threading.Thread(target=monitor_gpu_memory)
@@ -240,7 +253,7 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
     if args.verbose: print(f"Running benchmark for batch size = {batch_size}, prompt length = {prompt_length}")
     for _ in tqdm(range(num_repetitions)):
         wall_clock_start_time = time.time()
-        
+
         # Prepare run
         generator = og.Generator(model, params)
 
@@ -329,6 +342,12 @@
 
     print(f"Average Wall Clock Time: {avg_wall_clock_time} s")
     print(f"Average Wall Clock Throughput: {avg_wall_clock_thrpt} tps")
+    if args.print_memory_usage:
+        if IS_NVIDIA_SYSTEM:
+            print(f"Peak GPU Memory Usage: {peak_gpu_memory} GiB")
+        else:
+            print(f"Peak CPU Memory Usage: {peak_cpu_memory} GiB")
+
     metrics = [
         batch_size,
         prompt_length,
@@ -359,7 +378,7 @@ def main(args):
                 max_length = args.max_lengths[0] if len(args.max_lengths) == 1 else args.max_lengths[m]
             else:
                 max_length = prompt_length + gen_length
-            print(f"Args: batch_size = {batch_size}, prompt_length = {prompt_length}, tokens = {gen_length}, max_length = {max_length}")
+            print(f"\nArgs: batch_size = {batch_size}, prompt_length = {prompt_length}, tokens = {gen_length}, max_length = {max_length}")
            if args.print_memory_usage:
                metrics = run_benchmark_memory(args, batch_size, prompt_length, gen_length, max_length)
            else:
@@ -370,10 +389,6 @@ def main(args):
 
     filename = args.output
 
     if args.print_memory_usage:
-        if IS_NVIDIA_SYSTEM:
-            print(f"-------------------* Peak GPU Memory Usage: {peak_gpu_memory} GiB *-------------------")
-        else:
-            print(f"-------------------* Peak CPU Memory Usage: {peak_cpu_memory} GiB *-------------------")
         save_results(args, all_csv_metrics, filename, print_memory_usage=True)
     else:
         save_results(args, all_csv_metrics, filename)
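
Background on PATCH 1/3: nvidia-smi --query-gpu=memory.used prints one line per GPU, so the old "len(memory_usage) > 1" guard never recorded a peak on single-GPU machines, and the module-level peak variables carried stale values from one benchmark run into the next. Below is a minimal standalone sketch of the fixed polling pattern, assuming nvidia-smi is on PATH; monitor_gpu_memory mirrors the script, while measured_run, workload, and poll_interval are illustrative names that are not part of the patch.

# Minimal sketch of the monitoring pattern PATCH 1/3 fixes; illustrative, not the script itself.
import subprocess
import threading
import time

peak_gpu_memory = 0.0
peak_memory_lock = threading.Lock()
stop_monitoring = False

def monitor_gpu_memory(poll_interval=0.1):
    """Poll nvidia-smi in a background thread and track peak GPU memory in GiB."""
    global peak_gpu_memory
    while not stop_monitoring:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
            capture_output=True, text=True)
        memory_usage = result.stdout.splitlines()
        # ">= 1" is the fix: a single-GPU machine returns exactly one line,
        # which the old "> 1" check silently discarded.
        if len(memory_usage) >= 1:
            gpu_memory = [float(line) for line in memory_usage]
            current_peak = round(max(gpu_memory) / 1024, 2)  # MiB -> GiB
            with peak_memory_lock:
                peak_gpu_memory = max(current_peak, peak_gpu_memory)
        time.sleep(poll_interval)

def measured_run(workload):
    """Run a callable with fresh peak state, as run_benchmark_memory now does."""
    global stop_monitoring, peak_gpu_memory
    stop_monitoring = False  # reset the flag so a second run can monitor again
    peak_gpu_memory = 0.0    # reset the peak so values do not leak across runs
    monitor_thread = threading.Thread(target=monitor_gpu_memory)
    monitor_thread.start()
    try:
        workload()
    finally:
        stop_monitoring = True
        monitor_thread.join()
    return peak_gpu_memory

Without the resets at the top of measured_run, a second run would see stop_monitoring still set to True and report the previous run's peak, which is exactly what the globals added to run_benchmark_memory guard against.
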
From c9c9373ab5df36f50d3d49e7e65b19c49979b552 Mon Sep 17 00:00:00 2001
From: Vishal Agarwal
Date: Thu, 19 Sep 2024 09:40:41 +0530
Subject: [PATCH 2/3] remove redundant generator object init

---
 benchmark/python/benchmark_e2e.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/benchmark/python/benchmark_e2e.py b/benchmark/python/benchmark_e2e.py
index e0513411a..8e84c02ba 100644
--- a/benchmark/python/benchmark_e2e.py
+++ b/benchmark/python/benchmark_e2e.py
@@ -253,9 +253,6 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
     if args.verbose: print(f"Running benchmark for batch size = {batch_size}, prompt length = {prompt_length}")
     for _ in tqdm(range(num_repetitions)):
         wall_clock_start_time = time.time()
-
-        # Prepare run
-        generator = og.Generator(model, params)
 
         # Measure tokenization
         tokenize_start_time = time.perf_counter()

From 0e2d42ddbee0fd4acdd19327f569c5953f282c1f Mon Sep 17 00:00:00 2001
From: Vishal Agarwal
Date: Thu, 19 Sep 2024 09:41:51 +0530
Subject: [PATCH 3/3] fix warmup generation loop

---
 benchmark/python/benchmark_e2e.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/benchmark/python/benchmark_e2e.py b/benchmark/python/benchmark_e2e.py
index 8e84c02ba..8cd99f20d 100644
--- a/benchmark/python/benchmark_e2e.py
+++ b/benchmark/python/benchmark_e2e.py
@@ -239,8 +239,9 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
     if args.verbose: print("Running warmup runs...")
     for _ in tqdm(range(args.warmup)):
         generator = og.Generator(model, params)
-        generator.compute_logits()
-        generator.generate_next_token()
+        while not generator.is_done():
+            generator.compute_logits()
+            generator.generate_next_token()
         if args.print_model_output: print(tokenizer.decode(generator.get_sequence(0)))
         # Delete the generator to free the captured graph for the next generator, if graph capture is enabled
         del generator
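
Background on PATCH 3/3: a single compute_logits()/generate_next_token() pair emits exactly one token, so the old warmup never drove a generation to completion and never exercised the full decode loop (or the captured graph) that the timed repetitions use. A minimal sketch of the corrected warmup follows; the generator stepping calls are taken from the diff itself, while the warmup wrapper and its parameters are illustrative.

# Sketch of the warmup loop PATCH 3/3 restores; the og.* calls mirror the diff,
# the wrapper function and its arguments are illustrative.
import onnxruntime_genai as og  # package name assumed from the og alias in the script

def warmup(model, params, tokenizer, runs, print_model_output=False):
    """Run full untimed generations so the timed repetitions start warm."""
    for _ in range(runs):
        generator = og.Generator(model, params)
        # Step until the generator reports completion (EOS or max_length),
        # instead of stopping after a single token as the old loop did.
        while not generator.is_done():
            generator.compute_logits()
            generator.generate_next_token()
        if print_model_output:
            print(tokenizer.decode(generator.get_sequence(0)))
        # Delete the generator to free the captured graph for the next one,
        # if graph capture is enabled.
        del generator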