From 49e72b4a378bae925f179c923d763352e0821cbf Mon Sep 17 00:00:00 2001
From: Yan Wang
Date: Tue, 21 Jan 2025 22:12:28 +0100
Subject: [PATCH] Add command line options; add fwd/bwd

---
 thunder/dynamo/compiler.py   |  2 +-
 thunder/dynamo/report.py     | 98 ++++++++++++++++++++++++------------
 thunder/dynamo/utils.py      | 97 ++++++++++++++++++++++++-----------
 thunder/tests/test_dynamo.py |  6 ++-
 4 files changed, 140 insertions(+), 63 deletions(-)

diff --git a/thunder/dynamo/compiler.py b/thunder/dynamo/compiler.py
index c01a5d1f02..abe98982fa 100644
--- a/thunder/dynamo/compiler.py
+++ b/thunder/dynamo/compiler.py
@@ -121,7 +121,7 @@ def save_reproducer_to_folder(
                 reproducer_folder,
                 f"graph{graph_idx}_{cur_name}",
                 use_pytest_benchmark,
-                check_consistency=check_consistency,
+                # check_consistency=check_consistency,
                 save_input_tensor=save_input_tensor,
             )
 
diff --git a/thunder/dynamo/report.py b/thunder/dynamo/report.py
index 2d20f152c2..a5194ab95e 100644
--- a/thunder/dynamo/report.py
+++ b/thunder/dynamo/report.py
@@ -4,8 +4,12 @@
 import sys
 from pathlib import Path
 import json
+import argparse
 
+import torch
 from thunder.dynamo.compiler import thunderfx
+from thunder.benchmarks.targets import ComputeType, backward_only
+from thunder.dynamo.utils import run_backward
 
 if TYPE_CHECKING:
     from thunder.dynamo.utils import SubgraphInfo
@@ -13,6 +17,32 @@
     from collections.abc import Callable
 
 
+def run_repro(ex_dict, ex_name, model, compute_type, *inputs):  # CLI options
+    if ex_name == "eager":
+        compiled_fn = model
+    elif ex_name == "torch_inductor":
+        compiled_fn = ex_dict[ex_name](model, inputs)
+    else:
+        compiled_fn = ex_dict[ex_name](model)
+
+    results = {}
+    match compute_type:
+        case "forward":
+            try:
+                result = compiled_fn(*inputs)
+            except Exception as e:
+                raise e
+            results["forward"] = result
+        case "forward+backward":
+            try:
+                forward_result, grads = run_backward(compiled_fn, *inputs)
+            except Exception as e:
+                raise e
+            results["forward"] = forward_result
+            results["backward"] = grads
+    return results
+
+
 def get_thunder_graph_names(subgraph_infos: list[SubgraphInfo]):
     thunder_graph_names = []
     for graph_idx, subgraph_info in enumerate(subgraph_infos):
@@ -30,6 +60,7 @@ def thunderfx_save_report(
     folder_path: str | PathLike = "/tmp/thunderfx_report",
     check_consistency: bool = True,
     check_benchmark: bool = True,
+    save_benchmark_inputs: bool = True,
     **kwargs,
 ):
     try:
@@ -44,7 +75,7 @@ def thunderfx_save_report(
             return
         print(f"The reproducer file is saved in {folder_path}")
         return
-
+    print("The input callable can be run by thunderfx successfully.")
     if not check_benchmark and not check_consistency:
         return
 
@@ -62,21 +93,23 @@ def thunderfx_save_report(
     folder.mkdir(exist_ok=True)
     # Checks consistency with Torch eager
    if check_consistency:
+        print("Verifying consistency between Thunder and Torch eager ...")
         consistency_folder = folder / "consistency"
         consistency_folder.mkdir(exist_ok=True)
-        compiled._backend.save_reproducer_to_folder(consistency_folder, check_consistency=True)
+        compiled._backend.save_reproducer_to_folder(consistency_folder)
         for file in consistency_folder.glob("*.py"):
-            # The consistency results generated by the script are passed here via stdout
-            consistency_result = eval(
-                subprocess.run([sys.executable, folder / file], capture_output=True, text=True).stdout
-            )
-            for g_name, consistency in consistency_result.items():
-                g_ex_name = f"{file.name.rstrip('.py')}[{g_name}]"
-                assert g_ex_name in report_result
-                report_result[g_ex_name] = ["yes" if consistency is None else str(consistency)]
+            g_name = file.name.removesuffix(".py")
+            cmd = [sys.executable, folder / file, "--check_consistency=True", "--compute_type=forward+backward"]
+            consistency_result = subprocess.run(cmd, capture_output=True, text=True)
+            if consistency_result.returncode:
+                error = consistency_result.stderr
+                print(f"[{g_name}] Consistency check failed: {error}")
+            else:
+                print(f"[{g_name}] Consistency check succeeded")
 
     # Benchmark
     if check_benchmark:
+        print("Analyzing performance through benchmarking; this might take a moment...")
         benchmark_folder = folder / "benchmark"
         benchmark_folder.mkdir(exist_ok=True)
         compiled._backend.save_reproducer_to_folder(benchmark_folder, save_input_tensor=True, use_pytest_benchmark=True)
@@ -84,7 +117,7 @@ def thunderfx_save_report(
         benchmark_json_files = []
         for file in benchmark_folder.glob("*.py"):
             benchmark_json_files.append(str(benchmark_folder / f"{file.name.replace('.py', '.json')}"))
-            subprocess.run(
+            benchmark_result = subprocess.run(
                 [
                     sys.executable,
                     "-m",
@@ -92,32 +125,33 @@ def thunderfx_save_report(
                     benchmark_folder / file,
                     "--benchmark-timer=torch.utils.benchmark.utils.timer.timer",
                     "--benchmark-warmup=on",
+                    "--benchmark-group-by=param:compute_type",
                     f"--benchmark-json={benchmark_json_files[-1]}",
+                    "--disable-warnings",
+                    "-q",
                 ],
                 capture_output=True,
                 text=True,
             )
-
+            print(benchmark_result.stdout)
+        print("Max allocated memory usage:")
         for tmp_json in benchmark_json_files:
             with open(tmp_json) as file:
                 data = json.load(file)
-                for bk in data["benchmarks"]:
-                    cur_name = bk["name"].lstrip("test_")
-                    if cur_name in report_result:
-                        report_result[cur_name].append(bk["stats"]["mean"])
-                        report_result[cur_name].append(bk["extra_info"]["max_allocated_memory_MB"])
-
-    list_data: list[dict] = []
-    for g_name, values in report_result.items():
-        list_data.append({})
-        list_data[-1]["name"] = g_name
-        if check_consistency:
-            list_data[-1]["consistency"] = values[0]
-        if check_benchmark:
-            base = check_benchmark + check_consistency - 1
-            list_data[-1]["performance_mean"] = values[base]
-            list_data[-1]["max_allocated_memory_MB"] = values[base + 1]
-    json_data = {"report": list_data}
-
-    with open(folder / "report.json", "w") as f:
-        json.dump(json_data, f, indent=4)
+            benchs = data["benchmarks"]
+            forward_benchs = [bench for bench in benchs if "forward" in bench["param"]]
+            backward_benchs = [bench for bench in benchs if "backward" in bench["param"]]
+
+            forward_benchs_sorted = sorted(
+                forward_benchs, key=lambda x: x["extra_info"]["max_allocated_memory_MB"], reverse=True
+            )
+            backward_benchs_sorted = sorted(
+                backward_benchs, key=lambda x: x["extra_info"]["max_allocated_memory_MB"], reverse=True
+            )
+
+            for bk in forward_benchs_sorted:
+                print(f"{bk['name'].removeprefix('test_')}: {bk['extra_info']['max_allocated_memory_MB']/1000} GB")
+            print("\n")
+            for bk in backward_benchs_sorted:
+                print(f"{bk['name'].removeprefix('test_')}: {bk['extra_info']['max_allocated_memory_MB']/1000} GB")
+            print("\n")
diff --git a/thunder/dynamo/utils.py b/thunder/dynamo/utils.py
index 475f3c4f1e..b9486e7bff 100644
--- a/thunder/dynamo/utils.py
+++ b/thunder/dynamo/utils.py
@@ -15,7 +15,8 @@
 from thunder.torch.default_torch_ops import torch_auto_registered_ops
 from thunder.torch import _torch_to_thunder_function_map
 from thunder.torch.langctx import torchctx
-from thunder.core.utils import check
+from thunder.core.utils import check, sequencify
+from thunder.core.pytree import tree_flatten
 
 if TYPE_CHECKING:
     from thunder.core.symbol import Symbol
@@ -724,7 +725,6 @@ def reproducer(
     folder: str | os.PathLike,
     graph_name: str,
     use_pytest_benchmark: bool = False,
-    check_consistency: bool = False,
     save_input_tensor: bool = False,
 ):
     folder = Path(folder)
@@ -753,6 +753,38 @@ def torch_inductor(fn, inputs):
 bench_executors_dict["thunder"]=partial(thunder.jit, {thunder_options_str})
 bench_executors_dict["torch_inductor"]=torch_inductor
 bench_executors_dict["eager"]=None
+"""
+
+    COMMAND_LINE_ARGS = f"""
+import argparse
+
+parser = argparse.ArgumentParser(description="Run the generated reproducer script.")
+
+parser.add_argument(
+    "--check_consistency",
+    type=lambda v: v.lower() in ("1", "true", "yes"),
+    default=False,
+    help="Whether to check consistency against Torch eager (default: False)"
+)
+parser.add_argument(
+    "--compute_type",
+    type=str,
+    choices=["forward", "forward+backward"],
+    default="forward",
+    help="Type of computation to perform (forward, forward+backward)"
+)
+parser.add_argument(
+    "--executor",
+    type=str,
+    choices=["torch_inductor", "thunder", "eager"],
+    default="thunder",
+    help="Executor used to run the graph (thunder, torch_inductor, or eager)"
+)
+
+args = parser.parse_args()
+ex_name = args.executor
+compute_type = args.compute_type
+check_acc = args.check_consistency
 """
 
     # split reason
@@ -791,6 +823,7 @@ def torch_inductor(fn, inputs):
         code_str += "from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform\n"
     if use_pytest_benchmark:
         code_str += f"""import pytest
+from thunder.benchmarks.targets import parametrize_compute_type_only_training, benchmark_for_compute_type
 {EXECUTOR_DICT_CODE_STR}
 """
     if has_cuda_args:
@@ -803,9 +836,11 @@ def torch_inductor(fn, inputs):
     "executor,",
     executors,
     ids=executor_ids,
-)"""
-        func_str = f"def test_{graph_name}(benchmark, executor):\n{readable}\n"
+)
+@parametrize_compute_type_only_training"""
+        func_str = f"def test_{graph_name}(benchmark, executor, compute_type):\n{readable}\n"
     else:
+        code_str += f"\n{EXECUTOR_DICT_CODE_STR}{COMMAND_LINE_ARGS}"
         func_str = f"def test_{graph_name}():\n{readable}\n"
 
     if any(arg is None for arg in args):
@@ -820,24 +855,15 @@ def torch_inductor(fn, inputs):
     func_str += f"{_addindent(input_str, 4)}\n]\n"
 
     if not use_pytest_benchmark:
-        func_str += f"compiled = thunder.jit(DynamoModule(), {thunder_options_str})\n"
-        func_str += "thunder_result = compiled(*inputs)"
-        if check_consistency:
-            func_str += f"""
-{TORCH_INDUCTOR_FUNCTION_STR}
-eager_result = DynamoModule()(*inputs)\n
-inductor_result = torch_inductor(DynamoModule(), inputs)(*inputs)
-def check_assertion(expected, actual):
-    try:
-        torch.testing.assert_close(expected, actual)
-        return None # No exception, return None
-    except AssertionError as e:
-        return e # Return the caught exception
-result = {{}}
-result["thunder"] = check_assertion(eager_result, thunder_result)
-result["torch_inductor"] = check_assertion(eager_result, inductor_result)
-
-print(result)
+        func_str += f"""
+mod = DynamoModule()
+from thunder.dynamo.report import run_repro
+
+result = run_repro(bench_executors_dict, ex_name, mod, compute_type, *inputs)
+if check_acc:
+    eager_result = run_repro(bench_executors_dict, "eager", mod, compute_type, *inputs)
+    for (compute_t, eager_v), (_, cur_v) in zip(eager_result.items(), result.items()):
+        torch.testing.assert_close(eager_v, cur_v, msg=lambda e: f'{{compute_t}}: {{e}}')
 """
     else:
         func_str = f"""{func_str}
@@ -848,14 +874,8 @@ def check_assertion(expected, actual):
     compiled = executor(mod, inputs)
 else:
     compiled = executor(mod)
-"""
-
-        if not has_cuda_args:
-            func_str += f"""benchmark(compiled, *inputs)"""
-        else:
-            func_str += f"""from thunder.benchmarks import record_peak_allocated_memory
-with record_peak_allocated_memory(benchmark):
-    benchmark(compiled, *inputs)
+
+benchmark_for_compute_type(compute_type, benchmark, compiled, inputs, {{}})
 """
     print(comment_str, file=f)
     print(code_str, file=f)
@@ -863,3 +883,22 @@ def check_assertion(expected, actual):
 
     if not use_pytest_benchmark:
         print(f"\ntest_{graph_name}()", file=f)
+
+
+def run_backward(fn, *args, **kwargs):
+    result = fn(*args, **kwargs)
+    result = sequencify(result)
+
+    forward_inputs = tree_flatten((args, kwargs))[0]
+    forward_inputs = list(filter(lambda x: isinstance(x, torch.Tensor) and x.requires_grad, forward_inputs))
+    differentiable_tensor_result = list(filter(lambda x: isinstance(x, torch.Tensor) and x.requires_grad, result))
+
+    output_grads = []
+    for diff_result in differentiable_tensor_result:
+        output_grads.append(torch.ones_like(diff_result))
+
+    for i in forward_inputs:
+        i.grad = None
+
+    torch.autograd.backward(differentiable_tensor_result, output_grads, inputs=forward_inputs)
+    return result, [t.grad for t in forward_inputs]
diff --git a/thunder/tests/test_dynamo.py b/thunder/tests/test_dynamo.py
index 8a23b04368..9659934bf3 100644
--- a/thunder/tests/test_dynamo.py
+++ b/thunder/tests/test_dynamo.py
@@ -1022,7 +1022,7 @@ def foo(x):
     out = thfoo(t0)
     assert out.device.type == "meta"
 
-
+
 @requiresCUDA
 def test_report(tmp_path):
     def foo(x):
@@ -1031,6 +1031,10 @@ def foo(x):
         return y + x.cos()
 
     x = torch.randn(4, 4, device="cuda", requires_grad=True)
+    # cf = thunderfx(foo)
+    # cf(x)
+    # cf._backend.save_reproducer_to_folder(tmp_path)
+
     thunderfx_save_report(foo, x, folder_path=tmp_path)
 
     from unittest.mock import patch
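
Usage sketch (not part of the patch): the lines below show one way the report flow extended here might be driven end to end. `thunderfx_save_report` and the `--executor`, `--compute_type`, and `--check_consistency` options come from the diff above; the folder path, the generated script name `graph0_thunder_0.py`, and the availability of a CUDA device are illustrative assumptions.

    # Hypothetical driver mirroring thunder/tests/test_dynamo.py::test_report
    import torch
    from thunder.dynamo.report import thunderfx_save_report

    def foo(x):
        return x.sin() + x.cos()

    x = torch.randn(4, 4, device="cuda", requires_grad=True)
    thunderfx_save_report(foo, x, folder_path="/tmp/thunderfx_report")

A reproducer saved under the consistency folder can then be rerun standalone with the new command-line options, for example:

    python /tmp/thunderfx_report/consistency/graph0_thunder_0.py --executor=thunder --compute_type=forward+backward --check_consistency=True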