From 49e72b4a378bae925f179c923d763352e0821cbf Mon Sep 17 00:00:00 2001
From: Yan Wang
Date: Tue, 21 Jan 2025 22:12:28 +0100
Subject: [PATCH] Add command line options; add fwd/bwd

---
 thunder/dynamo/compiler.py   |  2 +-
 thunder/dynamo/report.py     | 98 ++++++++++++++++++++++++------------
 thunder/dynamo/utils.py      | 97 ++++++++++++++++++++++++-----------
 thunder/tests/test_dynamo.py |  6 ++-
 4 files changed, 140 insertions(+), 63 deletions(-)

diff --git a/thunder/dynamo/compiler.py b/thunder/dynamo/compiler.py
index c01a5d1f02..abe98982fa 100644
--- a/thunder/dynamo/compiler.py
+++ b/thunder/dynamo/compiler.py
@@ -121,7 +121,7 @@ def save_reproducer_to_folder(
                 reproducer_folder,
                 f"graph{graph_idx}_{cur_name}",
                 use_pytest_benchmark,
-                check_consistency=check_consistency,
+                # check_consistency=check_consistency,
                 save_input_tensor=save_input_tensor,
             )
 
diff --git a/thunder/dynamo/report.py b/thunder/dynamo/report.py
index 2d20f152c2..a5194ab95e 100644
--- a/thunder/dynamo/report.py
+++ b/thunder/dynamo/report.py
@@ -4,8 +4,12 @@
 import sys
 from pathlib import Path
 import json
+import argparse
 
+import torch
 from thunder.dynamo.compiler import thunderfx
+from thunder.benchmarks.targets import ComputeType, backward_only
+from thunder.dynamo.utils import run_backward
 
 if TYPE_CHECKING:
     from thunder.dynamo.utils import SubgraphInfo
@@ -13,6 +17,32 @@
     from collections.abc import Callable
 
 
+def run_repro(ex_dict, ex_name, model, compute_type, *inputs):  # CLI options
+    if ex_name == "eager":
+        compiled_fn = model
+    elif ex_name == "torch_inductor":
+        compiled_fn = ex_dict[ex_name](model, inputs)
+    else:
+        compiled_fn = ex_dict[ex_name](model)
+
+    results = {}
+    match compute_type:
+        case "forward":
+            try:
+                result = compiled_fn(*inputs)
+            except Exception as e:
+                raise e
+            results["forward"] = result
+        case "forward+backward":
+            try:
+                forward_result, grads = run_backward(compiled_fn, *inputs)
+            except Exception as e:
+                raise e
+            results["forward"] = forward_result
+            results["backward"] = grads
+    return results
+
+
 def get_thunder_graph_names(subgraph_infos: list[SubgraphInfo]):
     thunder_graph_names = []
     for graph_idx, subgraph_info in enumerate(subgraph_infos):
@@ -30,6 +60,7 @@ def thunderfx_save_report(
     folder_path: str | PathLike = "/tmp/thunderfx_report",
     check_consistency: bool = True,
     check_benchmark: bool = True,
+    save_benchmark_inputs: bool = True,
     **kwargs,
 ):
     try:
@@ -44,7 +75,7 @@ def thunderfx_save_report(
             return
         print(f"The reproducer file is saved in {folder_path}")
         return
-
+    print("The input callable can be run by thunderfx successfully.")
     if not check_benchmark and not check_consistency:
         return
 
@@ -62,21 +93,23 @@ def thunderfx_save_report(
     folder.mkdir(exist_ok=True)
     # Checks consistency with Torch eager
    if check_consistency:
+        print("Verifying consistency between Thunder and Torch eager ...")
         consistency_folder = folder / "consistency"
         consistency_folder.mkdir(exist_ok=True)
-        compiled._backend.save_reproducer_to_folder(consistency_folder, check_consistency=True)
+        compiled._backend.save_reproducer_to_folder(consistency_folder)
         for file in consistency_folder.glob("*.py"):
-            # The consistency results generated by the script are passed here via stdout
-            consistency_result = eval(
-                subprocess.run([sys.executable, folder / file], capture_output=True, text=True).stdout
-            )
-            for g_name, consistency in consistency_result.items():
-                g_ex_name = f"{file.name.rstrip('.py')}[{g_name}]"
-                assert g_ex_name in report_result
-                report_result[g_ex_name] = ["yes" if consistency is None else str(consistency)]
+            g_name = file.name.removesuffix(".py")
+            cmd = [sys.executable, folder / file, "--check_consistency=True", "--compute_type=forward+backward"]
+            consistency_result = subprocess.run(cmd, capture_output=True, text=True)
+            if consistency_result.returncode:
+                error = consistency_result.stderr
+                print(f"[{g_name}] Consistency check failed: {error}")
+            else:
+                print(f"[{g_name}] Consistency check succeeded")
 
     # Benchmark
     if check_benchmark:
+        print("Analyzing performance through benchmarking; this might take a moment...")
         benchmark_folder = folder / "benchmark"
         benchmark_folder.mkdir(exist_ok=True)
         compiled._backend.save_reproducer_to_folder(benchmark_folder, save_input_tensor=True, use_pytest_benchmark=True)
@@ -84,7 +117,7 @@ def thunderfx_save_report(
         benchmark_json_files = []
         for file in benchmark_folder.glob("*.py"):
             benchmark_json_files.append(str(benchmark_folder / f"{file.name.replace('.py', '.json')}"))
-            subprocess.run(
+            benchmark_result = subprocess.run(
                 [
                     sys.executable,
                     "-m",
@@ -92,32 +125,33 @@ def thunderfx_save_report(
                     benchmark_folder / file,
                     "--benchmark-timer=torch.utils.benchmark.utils.timer.timer",
                     "--benchmark-warmup=on",
+                    "--benchmark-group-by=param:compute_type",
                     f"--benchmark-json={benchmark_json_files[-1]}",
+                    "--disable-warnings",
+                    "-q",
                 ],
                 capture_output=True,
                 text=True,
             )
-
+            print(benchmark_result.stdout)
+        print("Max allocated memory usage:")
         for tmp_json in benchmark_json_files:
             with open(tmp_json) as file:
                 data = json.load(file)
-                for bk in data["benchmarks"]:
-                    cur_name = bk["name"].lstrip("test_")
-                    if cur_name in report_result:
-                        report_result[cur_name].append(bk["stats"]["mean"])
-                        report_result[cur_name].append(bk["extra_info"]["max_allocated_memory_MB"])
-
-    list_data: list[dict] = []
-    for g_name, values in report_result.items():
-        list_data.append({})
-        list_data[-1]["name"] = g_name
-        if check_consistency:
-            list_data[-1]["consistency"] = values[0]
-        if check_benchmark:
-            base = check_benchmark + check_consistency - 1
-            list_data[-1]["performance_mean"] = values[base]
-            list_data[-1]["max_allocated_memory_MB"] = values[base + 1]
-    json_data = {"report": list_data}
-
-    with open(folder / "report.json", "w") as f:
-        json.dump(json_data, f, indent=4)
+            benchs = data["benchmarks"]
+            forward_benchs = [bench for bench in benchs if "forward" in bench["param"]]
+            backward_benchs = [bench for bench in benchs if "backward" in bench["param"]]
+
+            forward_benchs_sorted = sorted(
+                forward_benchs, key=lambda x: x["extra_info"]["max_allocated_memory_MB"], reverse=True
+            )
+            backward_benchs_sorted = sorted(
+                backward_benchs, key=lambda x: x["extra_info"]["max_allocated_memory_MB"], reverse=True
+            )
+
+            for bk in forward_benchs_sorted:
+                print(f"{bk['name'].removeprefix('test_')}: {bk['extra_info']['max_allocated_memory_MB']/1000} GB")
+            print("\n")
+            for bk in backward_benchs_sorted:
+                print(f"{bk['name'].removeprefix('test_')}: {bk['extra_info']['max_allocated_memory_MB']/1000} GB")
+            print("\n")
diff --git a/thunder/dynamo/utils.py b/thunder/dynamo/utils.py
index 475f3c4f1e..b9486e7bff 100644
--- a/thunder/dynamo/utils.py
+++ b/thunder/dynamo/utils.py
@@ -15,7 +15,8 @@
 from thunder.torch.default_torch_ops import torch_auto_registered_ops
 from thunder.torch import _torch_to_thunder_function_map
 from thunder.torch.langctx import torchctx
-from thunder.core.utils import check
+from thunder.core.utils import check, sequencify
+from thunder.core.pytree import tree_flatten
 
 if TYPE_CHECKING:
     from thunder.core.symbol import Symbol
@@ -724,7 +725,6 @@ def reproducer(
     folder: str | os.PathLike,
     graph_name: str,
     use_pytest_benchmark: bool = False,
-    check_consistency: bool = False,
     save_input_tensor: bool = False,
 ):
     folder = Path(folder)
@@ -753,6 +753,38 @@ def torch_inductor(fn, inputs):
 bench_executors_dict["thunder"]=partial(thunder.jit, {thunder_options_str})
 bench_executors_dict["torch_inductor"]=torch_inductor
 bench_executors_dict["eager"]=None
+"""
+
+    COMMAND_LINE_ARGS = f"""
+import argparse
+
+parser = argparse.ArgumentParser(description="Run the generated reproducer script.")
+
+parser.add_argument(
+    "--check_consistency",
+    type=lambda v: v.lower() in ("1", "true", "yes"),
+    default=False,
+    help="Whether to check consistency against Torch eager (default: False)"
+)
+parser.add_argument(
+    "--compute_type",
+    type=str,
+    choices=["forward", "forward+backward"],
+    default="forward",
+    help="Type of computation to perform (forward, forward+backward)"
+)
+parser.add_argument(
+    "--executor",
+    type=str,
+    choices=["torch_inductor", "thunder", "eager"],
+    default="thunder",
+    help="Executor used to run the graph (thunder, torch_inductor, or eager)"
+)
+
+args = parser.parse_args()
+ex_name = args.executor
+compute_type = args.compute_type
+check_acc = args.check_consistency
 """
 
     # split reason
@@ -791,6 +823,7 @@ def torch_inductor(fn, inputs):
         code_str += "from thunder.dev_utils.nvtx_profile_transform import NvtxProfileTransform\n"
     if use_pytest_benchmark:
         code_str += f"""import pytest
+from thunder.benchmarks.targets import parametrize_compute_type_only_training, benchmark_for_compute_type
 {EXECUTOR_DICT_CODE_STR}
 """
     if has_cuda_args:
@@ -803,9 +836,11 @@ def torch_inductor(fn, inputs):
     "executor,",
     executors,
     ids=executor_ids,
-)"""
-        func_str = f"def test_{graph_name}(benchmark, executor):\n{readable}\n"
+)
+@parametrize_compute_type_only_training"""
+        func_str = f"def test_{graph_name}(benchmark, executor, compute_type):\n{readable}\n"
     else:
+        code_str += f"\n{EXECUTOR_DICT_CODE_STR}{COMMAND_LINE_ARGS}"
         func_str = f"def test_{graph_name}():\n{readable}\n"
 
     if any(arg is None for arg in args):
@@ -820,24 +855,15 @@ def torch_inductor(fn, inputs):
     func_str += f"{_addindent(input_str, 4)}\n]\n"
 
     if not use_pytest_benchmark:
-        func_str += f"compiled = thunder.jit(DynamoModule(), {thunder_options_str})\n"
-        func_str += "thunder_result = compiled(*inputs)"
-        if check_consistency:
-            func_str += f"""
-{TORCH_INDUCTOR_FUNCTION_STR}
-eager_result = DynamoModule()(*inputs)\n
-inductor_result = torch_inductor(DynamoModule(), inputs)(*inputs)
-def check_assertion(expected, actual):
-    try:
-        torch.testing.assert_close(expected, actual)
-        return None # No exception, return None
-    except AssertionError as e:
-        return e # Return the caught exception
-result = {{}}
-result["thunder"] = check_assertion(eager_result, thunder_result)
-result["torch_inductor"] = check_assertion(eager_result, inductor_result)
-
-print(result)
+        func_str += f"""
+mod = DynamoModule()
+from thunder.dynamo.report import run_repro
+
+result = run_repro(bench_executors_dict, ex_name, mod, compute_type, *inputs)
+if check_acc:
+    eager_result = run_repro(bench_executors_dict, "eager", mod, compute_type, *inputs)
+    for (compute_t, eager_v), (_, cur_v) in zip(eager_result.items(), result.items()):
+        torch.testing.assert_close(eager_v, cur_v, msg=lambda e: f'{{compute_t}}: {{e}}')
 """
     else:
         func_str = f"""{func_str}
@@ -848,14 +874,8 @@ def check_assertion(expected, actual):
     compiled = executor(mod, inputs)
 else:
     compiled = executor(mod)
-"""
-
-        if not has_cuda_args:
-            func_str += f"""benchmark(compiled, *inputs)"""
-        else:
-            func_str += f"""from thunder.benchmarks import record_peak_allocated_memory
-with record_peak_allocated_memory(benchmark):
-    benchmark(compiled, *inputs)
+
+benchmark_for_compute_type(compute_type, benchmark, compiled, inputs, {{}})
 """
     print(comment_str, file=f)
     print(code_str, file=f)
@@ -863,3 +883,22 @@ def check_assertion(expected, actual):
 
     if not use_pytest_benchmark:
         print(f"\ntest_{graph_name}()", file=f)
+
+
+def run_backward(fn, *args, **kwargs):
+    result = fn(*args, **kwargs)
+    result = sequencify(result)
+
+    forward_inputs = tree_flatten((args, kwargs))[0]
+    forward_inputs = list(filter(lambda x: isinstance(x, torch.Tensor) and x.requires_grad, forward_inputs))
+    differentiable_tensor_result = list(filter(lambda x: isinstance(x, torch.Tensor) and x.requires_grad, result))
+
+    output_grads = []
+    for diff_result in differentiable_tensor_result:
+        output_grads.append(torch.ones_like(diff_result))
+
+    for i in forward_inputs:
+        i.grad = None
+
+    torch.autograd.backward(differentiable_tensor_result, output_grads, inputs=forward_inputs)
+    return result, [t.grad for t in forward_inputs]
diff --git a/thunder/tests/test_dynamo.py b/thunder/tests/test_dynamo.py
index 8a23b04368..9659934bf3 100644
--- a/thunder/tests/test_dynamo.py
+++ b/thunder/tests/test_dynamo.py
@@ -1022,7 +1022,7 @@ def foo(x):
     out = thfoo(t0)
     assert out.device.type == "meta"
 
-
+
 @requiresCUDA
 def test_report(tmp_path):
     def foo(x):
@@ -1031,6 +1031,10 @@ def foo(x):
         return y + x.cos()
 
     x = torch.randn(4, 4, device="cuda", requires_grad=True)
+    # cf = thunderfx(foo)
+    # cf(x)
+    # cf._backend.save_reproducer_to_folder(tmp_path)
+
     thunderfx_save_report(foo, x, folder_path=tmp_path)
 
     from unittest.mock import patch
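
Usage sketch (not part of the patch): the lines below show one way the report flow extended here might be driven end to end. `thunderfx_save_report` and the `--executor`, `--compute_type`, and `--check_consistency` options come from the diff above; the folder path, the generated script name `graph0_thunder_0.py`, and the availability of a CUDA device are illustrative assumptions.

    # Hypothetical driver mirroring thunder/tests/test_dynamo.py::test_report
    import torch
    from thunder.dynamo.report import thunderfx_save_report

    def foo(x):
        return x.sin() + x.cos()

    x = torch.randn(4, 4, device="cuda", requires_grad=True)
    thunderfx_save_report(foo, x, folder_path="/tmp/thunderfx_report")

A reproducer saved under the consistency folder can then be rerun standalone with the new command-line options, for example:

    python /tmp/thunderfx_report/consistency/graph0_thunder_0.py --executor=thunder --compute_type=forward+backward --check_consistency=True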