From 866ab425aaa2d431e3177674002384a974cf514f Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Tue, 3 Sep 2024 09:28:08 -0700 Subject: [PATCH 01/10] Add compiled autograd tutorial --- .../compiled_autograd_tutorial.py | 275 ++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 intermediate_source/compiled_autograd_tutorial.py diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py new file mode 100644 index 0000000000..3b8bdd68c6 --- /dev/null +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -0,0 +1,275 @@ +# -*- coding: utf-8 -*- + +""" +Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` +========================================================================== + +""" + +###################################################################### +# Compiled Autograd is a torch.compile extension introduced in PyTorch 2.4 +# that allows the capture of a larger backward graph. It is highly recommended +# to familiarize yourself with `torch.compile `_. +# + +###################################################################### +# Doesn't torch.compile already capture the backward graph? +# ------------ +# Partially. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: +# - Graph breaks in the forward lead to graph breaks in the backward +# - `Backward hooks `_ are not captured +# +# Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing +# it to capture the full backward graph at runtime. Models with these two characteristics should try +# Compiled Autograd, and potentially observe better performance. +# +# However, Compiled Autograd has its own limitations: +# - Dynamic autograd structure leads to recompiles +# + +###################################################################### +# Basic Usage +# ------------ +# + +# NOTE: Must be enabled before using the decorator +torch._dynamo.config.compiled_autograd = True + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + + def forward(self, x): + return self.linear(x) + +@torch.compile +def train(model, x): + loss = model(x).sum() + loss.backward() + +model = Model() +x = torch.randn(10) +train(model, x) + +###################################################################### +# Inspecting the compiled autograd logs +# ------------ +# Run the script with either TORCH_LOGS environment variables +# +""" +Prints graph: +TORCH_LOGS="compiled_autograd" python example.py + +Performance degrading, prints verbose graph and recompile reasons: +TORCH_LOGS="compiled_autograd_verbose" python example.py +""" + +###################################################################### +# Or with the set_logs private API: +# + +# flag must be enabled before wrapping using torch.compile +torch._logging._internal.set_logs(compiled_autograd=True) + +@torch.compile +def train(model, x): + loss = model(x).sum() + loss.backward() + +train(model, x) + +###################################################################### +# The compiled autograd graph should now be logged to stdout. Certain graph nodes will have names that are prefixed by "aot0_", +# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0. +# +# NOTE: This is the graph that we will call torch.compile on, NOT the optimized graph. 
Compiled Autograd basically +# generated some python code to represent the entire C++ autograd execution. +# +""" +INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH + ===== Compiled autograd graph ===== + .4 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + # No stacktrace found for following nodes + aot0_tangents_1: "f32[][]cpu" = inputs[0] + aot0_primals_3: "f32[10][1]cpu" = inputs[1] + getitem_2: "f32[10][1]cpu" = inputs[2] + getitem_3: "f32[10, 10][10, 1]cpu" = inputs[3]; inputs = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) + aot0_expand: "f32[10][0]cpu" = torch.ops.aten.expand.default(aot0_tangents_1, [10]); aot0_tangents_1 = None + aot0_view_2: "f32[1, 10][0, 0]cpu" = torch.ops.aten.view.default(aot0_expand, [1, 10]); aot0_expand = None + aot0_permute_2: "f32[10, 1][0, 0]cpu" = torch.ops.aten.permute.default(aot0_view_2, [1, 0]) + aot0_select: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 0) + aot0_view: "f32[1, 10][10, 1]cpu" = torch.ops.aten.view.default(aot0_primals_3, [1, 10]); aot0_primals_3 = None + aot0_mul_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select, aot0_view); aot0_select = None + aot0_select_1: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 1) + aot0_mul_4: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_1, aot0_view); aot0_select_1 = None + aot0_select_2: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 2) + aot0_mul_5: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_2, aot0_view); aot0_select_2 = None + aot0_select_3: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 3) + aot0_mul_6: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_3, aot0_view); aot0_select_3 = None + aot0_select_4: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 4) + aot0_mul_7: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_4, aot0_view); aot0_select_4 = None + aot0_select_5: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 5) + aot0_mul_8: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_5, aot0_view); aot0_select_5 = None + aot0_select_6: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 6) + aot0_mul_9: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_6, aot0_view); aot0_select_6 = None + aot0_select_7: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 7) + aot0_mul_10: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_7, aot0_view); aot0_select_7 = None + aot0_select_8: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 8) + aot0_mul_11: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_8, aot0_view); aot0_select_8 = None + aot0_select_9: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 9); aot0_permute_2 = None + aot0_mul_12: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_9, aot0_view); aot0_select_9 = aot0_view = None + aot0_cat: "f32[10, 10][10, 1]cpu" = torch.ops.aten.cat.default([aot0_mul_3, aot0_mul_4, aot0_mul_5, aot0_mul_6, aot0_mul_7, aot0_mul_8, aot0_mul_9, aot0_mul_10, aot0_mul_11, aot0_mul_12]); aot0_mul_3 = aot0_mul_4 = aot0_mul_5 = aot0_mul_6 = aot0_mul_7 = aot0_mul_8 = aot0_mul_9 = aot0_mul_10 = aot0_mul_11 = aot0_mul_12 = None + aot0_permute_3: "f32[10, 10][1, 10]cpu" = torch.ops.aten.permute.default(aot0_cat, [1, 0]); aot0_cat = None + 
aot0_sum_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.sum.dim_IntList(aot0_view_2, [0], True); aot0_view_2 = None + aot0_view_3: "f32[10][1]cpu" = torch.ops.aten.view.default(aot0_sum_3, [10]); aot0_sum_3 = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 2) + accumulate_grad_ = torch.ops.inductor.accumulate_grad_.default(getitem_2, aot0_view_3); getitem_2 = aot0_view_3 = accumulate_grad_ = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) + aot0_permute_4: "f32[10, 10][10, 1]cpu" = torch.ops.aten.permute.default(aot0_permute_3, [1, 0]); aot0_permute_3 = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 3) + accumulate_grad__1 = torch.ops.inductor.accumulate_grad_.default(getitem_3, aot0_permute_4); getitem_3 = aot0_permute_4 = accumulate_grad__1 = None + _exec_final_callbacks_stub = torch__dynamo_external_utils__exec_final_callbacks_stub(); _exec_final_callbacks_stub = None + return [] +""" + +###################################################################### +# Compiling the forward and backward pass using different flags +# ------------ +# + +def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + torch.compile(lambda: loss.backward(), fullgraph=True)() + +###################################################################### +# Or you can use the context manager, which will apply to all autograd calls within it +# + +def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + with torch._dynamo.compiled_autograd.enable(torch.compile(fullgraph=True)): + loss.backward() + + +###################################################################### +# Demonstrating the limitations of AOTAutograd addressed by Compiled Autograd +# ------------ +# 1. Graph breaks in the forward lead to graph breaks in the backward +# + +@torch.compile(backend="aot_eager") +def fn(x): + # 1st graph + temp = x + 10 + torch._dynamo.graph_break() + # 2nd graph + temp = temp + 10 + torch._dynamo.graph_break() + # 3rd graph + return temp.sum() + +x = torch.randn(10, 10, requires_grad=True) +loss = fn(x) + +# 1. base torch.compile +loss.backward(retain_graph=True) +assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 3) +torch._dynamo.utils.counters.clear() + +# 2. torch.compile with compiled autograd +with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + +# single graph for the backward +assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) + + +###################################################################### +# 2. 
`Backward hooks are not captured +# + +@torch.compile(backend="aot_eager") +def fn(x): + return x.sum() + +x = torch.randn(10, 10, requires_grad=True) +x.register_hook(lambda grad: grad+10) +loss = fn(x) + +torch._logging._internal.set_logs(compiled_autograd=True) +with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + +###################################################################### +# There is a `call_hook` node in the graph, which dynamo will inline +# + +""" +INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH + ===== Compiled autograd graph ===== + .2 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + ... + getitem_2 = hooks[0]; hooks = None + call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None + ... +""" + +###################################################################### +# Understanding recompilation reasons for Compiled Autograd +# ------------ +# 1. Due to change in autograd structure + +torch._logging._internal.set_logs(compiled_autograd_verbose=True) +torch._dynamo.config.compiled_autograd = True +x = torch.randn(10, requires_grad=True) +for op in [torch.add, torch.sub, torch.mul, torch.div]: + loss = op(x, x).sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +###################################################################### +# You should see some cache miss logs (recompiles): +# Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] +# ... +# Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] +# ... +# Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] +# ... +# Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] +# ... + +###################################################################### +# 2. Due to dynamic shapes +# + +torch._logging._internal.set_logs(compiled_autograd_verbose=True) +torch._dynamo.config.compiled_autograd = True +for i in [10, 100, 10]: + x = torch.randn(i, i, requires_grad=True) + loss = x.sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +###################################################################### +# You should see some cache miss logs (recompiles): +# ... +# Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic +# Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +# Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +# Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +# ... + +###################################################################### +# Compatibility and rough edges +# ------------ +# +# Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. +# For the latest status on a particular feature, refer to: https://docs.google.com/document/d/11VucFBEewzqgkABIjebZIzMvrXr3BtcY1aGKpX61pJY. 
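The recompilation behavior described above can also be confirmed programmatically. The following is a minimal sketch that reruns the dynamic-shape loop and reads the same ``torch._dynamo.utils.counters`` bookkeeping already used in the graph-break example; the exact counter keys (``"stats"`` / ``"unique_graphs"``) are assumed to match the ones shown there.

.. code:: python

    import torch

    torch._dynamo.config.compiled_autograd = True
    torch._dynamo.utils.counters.clear()

    for i in [10, 100, 10]:
        x = torch.randn(i, i, requires_grad=True)
        loss = x.sum()
        torch.compile(lambda: loss.backward(), backend="eager")()

    # Each new input shape may produce another backward graph until compiled
    # autograd marks the relevant sizes as dynamic, so this count mirrors the
    # cache misses reported in the verbose logs.
    print(torch._dynamo.utils.counters["stats"]["unique_graphs"])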
From 7483147343e327bc851db771822ed45c9acf4a79 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Tue, 3 Sep 2024 19:20:32 -0700 Subject: [PATCH 02/10] update --- .../compiled_autograd_tutorial.py | 101 +++++++++++------- 1 file changed, 61 insertions(+), 40 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index 3b8bdd68c6..4b5e2bbebf 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -16,22 +16,43 @@ # Doesn't torch.compile already capture the backward graph? # ------------ # Partially. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: -# - Graph breaks in the forward lead to graph breaks in the backward -# - `Backward hooks `_ are not captured +# - Graph breaks in the forward lead to graph breaks in the backward +# - `Backward hooks `_ are not captured # # Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing # it to capture the full backward graph at runtime. Models with these two characteristics should try # Compiled Autograd, and potentially observe better performance. # # However, Compiled Autograd has its own limitations: -# - Dynamic autograd structure leads to recompiles +# - Dynamic autograd structure leads to recompiles # +###################################################################### +# Tutorial output cells setup +# ------------ +# + +import os + +class ScopedLogging: + def __init__(self): + assert "TORCH_LOGS" not in os.environ + assert "TORCH_LOGS_FORMAT" not in os.environ + os.environ["TORCH_LOGS"] = "compiled_autograd_verbose" + os.environ["TORCH_LOGS_FORMAT"] = "short" + + def __del__(self): + del os.environ["TORCH_LOGS"] + del os.environ["TORCH_LOGS_FORMAT"] + + ###################################################################### # Basic Usage # ------------ # +import torch + # NOTE: Must be enabled before using the decorator torch._dynamo.config.compiled_autograd = True @@ -57,21 +78,12 @@ def train(model, x): # ------------ # Run the script with either TORCH_LOGS environment variables # -""" -Prints graph: -TORCH_LOGS="compiled_autograd" python example.py - -Performance degrading, prints verbose graph and recompile reasons: -TORCH_LOGS="compiled_autograd_verbose" python example.py -""" - -###################################################################### -# Or with the set_logs private API: +# - To only print the compiled autograd graph, use `TORCH_LOGS="compiled_autograd" python example.py` +# - To sacrifice some performance, in order to print the graph with more tensor medata and recompile reasons, use `TORCH_LOGS="compiled_autograd_verbose" python example.py` +# +# Logs can also be enabled through the private API torch._logging._internal.set_logs. # -# flag must be enabled before wrapping using torch.compile -torch._logging._internal.set_logs(compiled_autograd=True) - @torch.compile def train(model, x): loss = model(x).sum() @@ -80,14 +92,15 @@ def train(model, x): train(model, x) ###################################################################### -# The compiled autograd graph should now be logged to stdout. Certain graph nodes will have names that are prefixed by "aot0_", +# The compiled autograd graph should now be logged to stdout. Certain graph nodes will have names that are prefixed by aot0_, # these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0. 
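As mentioned a few lines above, the logs can also be enabled from inside the script through the private ``torch._logging._internal.set_logs`` API rather than the ``TORCH_LOGS`` environment variable. Below is a hedged, self-contained sketch; because the API is private its keyword names may change between releases, and the single linear layer here is just a stand-in for the tutorial's ``Model`` class.

.. code:: python

    import torch

    torch._dynamo.config.compiled_autograd = True

    # roughly equivalent to TORCH_LOGS="compiled_autograd"; pass
    # compiled_autograd_verbose=True instead for recompile reasons
    # and extra tensor metadata, at some performance cost
    torch._logging._internal.set_logs(compiled_autograd=True)

    model = torch.nn.Linear(10, 10)

    @torch.compile
    def train(x):
        loss = model(x).sum()
        loss.backward()

    train(torch.randn(10))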
# # NOTE: This is the graph that we will call torch.compile on, NOT the optimized graph. Compiled Autograd basically # generated some python code to represent the entire C++ autograd execution. # """ -INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH +DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] +DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH ===== Compiled autograd graph ===== .4 class CompiledAutograd(torch.nn.Module): def forward(self, inputs, sizes, scalars, hooks): @@ -178,6 +191,7 @@ def fn(x): return temp.sum() x = torch.randn(10, 10, requires_grad=True) +torch._dynamo.utils.counters.clear() loss = fn(x) # 1. base torch.compile @@ -205,7 +219,6 @@ def fn(x): x.register_hook(lambda grad: grad+10) loss = fn(x) -torch._logging._internal.set_logs(compiled_autograd=True) with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): loss.backward() @@ -214,22 +227,22 @@ def fn(x): # """ -INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH +DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] +DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH ===== Compiled autograd graph ===== .2 class CompiledAutograd(torch.nn.Module): def forward(self, inputs, sizes, scalars, hooks): - ... - getitem_2 = hooks[0]; hooks = None - call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None - ... + ... + getitem_2 = hooks[0]; hooks = None + call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None + ... """ ###################################################################### -# Understanding recompilation reasons for Compiled Autograd +# Common recompilation reasons for Compiled Autograd # ------------ # 1. Due to change in autograd structure -torch._logging._internal.set_logs(compiled_autograd_verbose=True) torch._dynamo.config.compiled_autograd = True x = torch.randn(10, requires_grad=True) for op in [torch.add, torch.sub, torch.mul, torch.div]: @@ -238,14 +251,18 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # You should see some cache miss logs (recompiles): -# Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] -# ... -# Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] -# ... -# Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] -# ... -# Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] -# ... +# + +""" +Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] +... +Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] +... +Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] +... 
+Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] +... +""" ###################################################################### # 2. Due to dynamic shapes @@ -260,12 +277,16 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # You should see some cache miss logs (recompiles): -# ... -# Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic -# Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -# Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -# Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -# ... +# + +""" +... +Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic +Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +... +""" ###################################################################### # Compatibility and rough edges From 6cccee05681bf34ed358cbdf83cfbfbe9066e94b Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Wed, 4 Sep 2024 11:46:22 -0700 Subject: [PATCH 03/10] update --- .../compiled_autograd_tutorial.py | 146 ++++++++++-------- 1 file changed, 79 insertions(+), 67 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index 4b5e2bbebf..4fd58e9743 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -4,58 +4,57 @@ Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` ========================================================================== +**Author:** `Simon Fan `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How compiled autograd interacts with torch.compile + * How to use the compiled autograd API + * How to inspect logs using TORCH_LOGS + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 + * `torch.compile `_ familiarity + """ ###################################################################### +# Overview +# ------------ # Compiled Autograd is a torch.compile extension introduced in PyTorch 2.4 -# that allows the capture of a larger backward graph. It is highly recommended -# to familiarize yourself with `torch.compile `_. +# that allows the capture of a larger backward graph. # - -###################################################################### # Doesn't torch.compile already capture the backward graph? # ------------ -# Partially. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: -# - Graph breaks in the forward lead to graph breaks in the backward -# - `Backward hooks `_ are not captured +# And it does, **partially**. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: +# 1. Graph breaks in the forward lead to graph breaks in the backward +# 2. 
`Backward hooks `_ are not captured # # Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing # it to capture the full backward graph at runtime. Models with these two characteristics should try # Compiled Autograd, and potentially observe better performance. # # However, Compiled Autograd has its own limitations: -# - Dynamic autograd structure leads to recompiles +# 1. Additional runtime overhead at the start of the backward +# 2. Dynamic autograd structure leads to recompiles +# +# .. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. # -###################################################################### -# Tutorial output cells setup -# ------------ -# - -import os - -class ScopedLogging: - def __init__(self): - assert "TORCH_LOGS" not in os.environ - assert "TORCH_LOGS_FORMAT" not in os.environ - os.environ["TORCH_LOGS"] = "compiled_autograd_verbose" - os.environ["TORCH_LOGS_FORMAT"] = "short" - - def __del__(self): - del os.environ["TORCH_LOGS"] - del os.environ["TORCH_LOGS_FORMAT"] - ###################################################################### -# Basic Usage +# Setup # ------------ -# +# In this tutorial, we'll base our examples on this toy model. +# import torch -# NOTE: Must be enabled before using the decorator -torch._dynamo.config.compiled_autograd = True - class Model(torch.nn.Module): def __init__(self): super().__init__() @@ -64,24 +63,30 @@ def __init__(self): def forward(self, x): return self.linear(x) + +###################################################################### +# Basic usage +# ------------ +# .. note:: The ``torch._dynamo.config.compiled_autograd = True`` config must be enabled before calling the torch.compile API. +# + +model = Model() +x = torch.randn(10) + +torch._dynamo.config.compiled_autograd = True @torch.compile def train(model, x): loss = model(x).sum() loss.backward() -model = Model() -x = torch.randn(10) train(model, x) ###################################################################### # Inspecting the compiled autograd logs # ------------ -# Run the script with either TORCH_LOGS environment variables -# -# - To only print the compiled autograd graph, use `TORCH_LOGS="compiled_autograd" python example.py` -# - To sacrifice some performance, in order to print the graph with more tensor medata and recompile reasons, use `TORCH_LOGS="compiled_autograd_verbose" python example.py` -# -# Logs can also be enabled through the private API torch._logging._internal.set_logs. +# Run the script with the TORCH_LOGS environment variables: +# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` +# - To print the graph with more tensor medata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` # @torch.compile @@ -92,13 +97,11 @@ def train(model, x): train(model, x) ###################################################################### -# The compiled autograd graph should now be logged to stdout. Certain graph nodes will have names that are prefixed by aot0_, -# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0. -# -# NOTE: This is the graph that we will call torch.compile on, NOT the optimized graph. 
Compiled Autograd basically -# generated some python code to represent the entire C++ autograd execution. +# The compiled autograd graph should now be logged to stderr. Certain graph nodes will have names that are prefixed by ``aot0_``, +# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0 e.g. ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. # -""" + +stderr_output = """ DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH ===== Compiled autograd graph ===== @@ -152,6 +155,10 @@ def forward(self, inputs, sizes, scalars, hooks): return [] """ +###################################################################### +# .. note:: This is the graph that we will call torch.compile on, NOT the optimized graph. Compiled Autograd generates some python code to represent the entire C++ autograd execution. +# + ###################################################################### # Compiling the forward and backward pass using different flags # ------------ @@ -163,7 +170,7 @@ def train(model, x): torch.compile(lambda: loss.backward(), fullgraph=True)() ###################################################################### -# Or you can use the context manager, which will apply to all autograd calls within it +# Or you can use the context manager, which will apply to all autograd calls within its scope. # def train(model, x): @@ -174,7 +181,7 @@ def train(model, x): ###################################################################### -# Demonstrating the limitations of AOTAutograd addressed by Compiled Autograd +# Compiled Autograd addresses certain limitations of AOTAutograd # ------------ # 1. Graph breaks in the forward lead to graph breaks in the backward # @@ -208,7 +215,12 @@ def fn(x): ###################################################################### -# 2. `Backward hooks are not captured +# In the ``1. base torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. +# Whereas in ``2. torch.compile with compiled autograd``, we see that a full backward graph was traced despite the graph breaks. +# + +###################################################################### +# 2. Backward hooks are not captured # @torch.compile(backend="aot_eager") @@ -223,19 +235,19 @@ def fn(x): loss.backward() ###################################################################### -# There is a `call_hook` node in the graph, which dynamo will inline +# There should be a ``call_hook`` node in the graph, which dynamo will later inline into # -""" +stderr_output = """ DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH - ===== Compiled autograd graph ===== - .2 class CompiledAutograd(torch.nn.Module): - def forward(self, inputs, sizes, scalars, hooks): - ... - getitem_2 = hooks[0]; hooks = None - call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None - ... 
+===== Compiled autograd graph ===== +.2 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + ... + getitem_2 = hooks[0]; hooks = None + call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None + ... """ ###################################################################### @@ -250,10 +262,10 @@ def forward(self, inputs, sizes, scalars, hooks): torch.compile(lambda: loss.backward(), backend="eager")() ###################################################################### -# You should see some cache miss logs (recompiles): +# You should see some recompile messages: **Cache miss due to new autograd node**. # -""" +stderr_output = """ Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] ... Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] @@ -268,7 +280,6 @@ def forward(self, inputs, sizes, scalars, hooks): # 2. Due to dynamic shapes # -torch._logging._internal.set_logs(compiled_autograd_verbose=True) torch._dynamo.config.compiled_autograd = True for i in [10, 100, 10]: x = torch.randn(i, i, requires_grad=True) @@ -276,10 +287,10 @@ def forward(self, inputs, sizes, scalars, hooks): torch.compile(lambda: loss.backward(), backend="eager")() ###################################################################### -# You should see some cache miss logs (recompiles): +# You should see some recompiles messages: **Cache miss due to changed shapes**. # -""" +stderr_output = """ ... Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic @@ -289,8 +300,9 @@ def forward(self, inputs, sizes, scalars, hooks): """ ###################################################################### -# Compatibility and rough edges -# ------------ -# -# Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. -# For the latest status on a particular feature, refer to: https://docs.google.com/document/d/11VucFBEewzqgkABIjebZIzMvrXr3BtcY1aGKpX61pJY. +# Conclusion +# ---------- +# In this tutorial, we went over the high-level ecosystem of torch.compile with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. +# +# For feedback on this tutorial, please file an issue on https://github.com/pytorch/tutorials. +# \ No newline at end of file From 50a69783a0283747af5154c4b65b4d636d72c891 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Wed, 4 Sep 2024 12:54:43 -0700 Subject: [PATCH 04/10] update --- intermediate_source/compiled_autograd_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index 4fd58e9743..932e930102 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -305,4 +305,4 @@ def forward(self, inputs, sizes, scalars, hooks): # In this tutorial, we went over the high-level ecosystem of torch.compile with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. # # For feedback on this tutorial, please file an issue on https://github.com/pytorch/tutorials. 
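To tie the pieces above together, here is a hedged end-to-end sketch of the basic usage inside a small training loop with an optimizer; the optimizer, learning rate, and number of steps are arbitrary choices for illustration.

.. code:: python

    import torch

    torch._dynamo.config.compiled_autograd = True

    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    @torch.compile
    def train_step(x):
        # forward is captured by Dynamo/AOTDispatcher, backward by Compiled Autograd
        loss = model(x).sum()
        loss.backward()

    for _ in range(3):
        optimizer.zero_grad()
        train_step(torch.randn(10))
        optimizer.step()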
-# \ No newline at end of file +# From 271b8f2cbd61b98d32e1ab5be6d7da06b8f86c36 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Fri, 6 Sep 2024 11:20:37 -0700 Subject: [PATCH 05/10] address comments --- .../compiled_autograd_tutorial.py | 80 ++++++++++--------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index 932e930102..ff66a0a0a0 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -11,37 +11,35 @@ .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn :class-card: card-prerequisites - * How compiled autograd interacts with torch.compile + * How compiled autograd interacts with ``torch.compile`` * How to use the compiled autograd API - * How to inspect logs using TORCH_LOGS + * How to inspect logs using ``TORCH_LOGS`` .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites :class-card: card-prerequisites * PyTorch 2.4 - * `torch.compile `_ familiarity + * Complete the `Introduction to torch.compile `_ """ ###################################################################### # Overview # ------------ -# Compiled Autograd is a torch.compile extension introduced in PyTorch 2.4 +# Compiled Autograd is a ``torch.compile`` extension introduced in PyTorch 2.4 # that allows the capture of a larger backward graph. # -# Doesn't torch.compile already capture the backward graph? -# ------------ -# And it does, **partially**. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: -# 1. Graph breaks in the forward lead to graph breaks in the backward -# 2. `Backward hooks `_ are not captured +# While ``torch.compile`` does capture the backward graph, it does so **partially**. The AOTAutograd component captures the backward graph ahead-of-time, with certain limitations: +# * Graph breaks in the forward lead to graph breaks in the backward +# * `Backward hooks `_ are not captured # # Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing # it to capture the full backward graph at runtime. Models with these two characteristics should try # Compiled Autograd, and potentially observe better performance. # -# However, Compiled Autograd has its own limitations: -# 1. Additional runtime overhead at the start of the backward -# 2. Dynamic autograd structure leads to recompiles +# However, Compiled Autograd introduces its own limitations: +# * Added runtime overhead at the start of the backward for cache lookup +# * More prone to recompiles and graph breaks in dynamo due to the larger capture # # .. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. # @@ -50,8 +48,9 @@ ###################################################################### # Setup # ------------ -# In this tutorial, we'll base our examples on this toy model. -# +# In this tutorial, we will base our examples on this simple neural network model. +# It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. +# import torch @@ -67,7 +66,7 @@ def forward(self, x): ###################################################################### # Basic usage # ------------ -# .. 
note:: The ``torch._dynamo.config.compiled_autograd = True`` config must be enabled before calling the torch.compile API. +# Before calling the torch.compile API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: # model = Model() @@ -82,23 +81,30 @@ def train(model, x): train(model, x) ###################################################################### -# Inspecting the compiled autograd logs -# ------------ -# Run the script with the TORCH_LOGS environment variables: -# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` -# - To print the graph with more tensor medata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` +# In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using torch.randn(10). +# We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. +# +# When ``train(model, x)`` is called: +# * Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile`` +# * Dynamo intercepts the python bytecode, simulates their execution and records the operations into a graph +# * AOTDispatcher disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. +# * Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward +# * Dynamo sets the optimized function to be evaluated next by Python Interpreter +# * Python Interpreter executes the optimized function, which basically executes ``loss = model(x).sum()`` +# * Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we enabled the config: ``torch._dynamo.config.compiled_autograd = True`` +# * Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode +# * The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher does not need to partition this graph into a forward and backward # - -@torch.compile -def train(model, x): - loss = model(x).sum() - loss.backward() - -train(model, x) ###################################################################### -# The compiled autograd graph should now be logged to stderr. Certain graph nodes will have names that are prefixed by ``aot0_``, -# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0 e.g. ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. 
+# Inspecting the compiled autograd logs +# ------------------------------------- +# Run the script with the ``TORCH_LOGS`` environment variables: +# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` +# - To print the graph with more tensor metadata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` +# +# Rerun the snippet above, the compiled autograd graph should now be logged to ``stderr``. Certain graph nodes will have names that are prefixed by ``aot0_``, +# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0, for example, ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. # stderr_output = """ @@ -156,17 +162,19 @@ def forward(self, inputs, sizes, scalars, hooks): """ ###################################################################### -# .. note:: This is the graph that we will call torch.compile on, NOT the optimized graph. Compiled Autograd generates some python code to represent the entire C++ autograd execution. +# .. note:: This is the graph on which we will call ``torch.compile``, **NOT** the optimized graph. Compiled Autograd essentially generates some unoptimized Python code to represent the entire C++ autograd execution. # ###################################################################### # Compiling the forward and backward pass using different flags -# ------------ -# +# ------------------------------------------------------------- +# You can use different compiler configs for the two compilations, for example, the backward may be a fullgraph even if there are graph breaks in the forward. +# def train(model, x): model = torch.compile(model) loss = model(x).sum() + torch._dynamo.config.compiled_autograd = True torch.compile(lambda: loss.backward(), fullgraph=True)() ###################################################################### @@ -182,7 +190,7 @@ def train(model, x): ###################################################################### # Compiled Autograd addresses certain limitations of AOTAutograd -# ------------ +# -------------------------------------------------------------- # 1. Graph breaks in the forward lead to graph breaks in the backward # @@ -252,7 +260,7 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # Common recompilation reasons for Compiled Autograd -# ------------ +# -------------------------------------------------- # 1. Due to change in autograd structure torch._dynamo.config.compiled_autograd = True @@ -302,7 +310,5 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # Conclusion # ---------- -# In this tutorial, we went over the high-level ecosystem of torch.compile with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. -# -# For feedback on this tutorial, please file an issue on https://github.com/pytorch/tutorials. +# In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. 
# From 64c923e63968eeb0a299b0aae79c41581213d37b Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Fri, 6 Sep 2024 16:15:43 -0700 Subject: [PATCH 06/10] try to fix build --- intermediate_source/compiled_autograd_tutorial.py | 1 - 1 file changed, 1 deletion(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index ff66a0a0a0..c1e4f2a538 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -50,7 +50,6 @@ # ------------ # In this tutorial, we will base our examples on this simple neural network model. # It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. -# import torch From a4326ebac8c8d1c1f232dad3bf272cd06587cb24 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Mon, 9 Sep 2024 09:55:59 -0700 Subject: [PATCH 07/10] convert to .rst --- .../compiled_autograd_tutorial.py | 313 ------------------ .../compiled_autograd_tutorial.rst | 301 +++++++++++++++++ 2 files changed, 301 insertions(+), 313 deletions(-) delete mode 100644 intermediate_source/compiled_autograd_tutorial.py create mode 100644 intermediate_source/compiled_autograd_tutorial.rst diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py deleted file mode 100644 index c1e4f2a538..0000000000 --- a/intermediate_source/compiled_autograd_tutorial.py +++ /dev/null @@ -1,313 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` -========================================================================== - -**Author:** `Simon Fan `_ - -.. grid:: 2 - - .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn - :class-card: card-prerequisites - - * How compiled autograd interacts with ``torch.compile`` - * How to use the compiled autograd API - * How to inspect logs using ``TORCH_LOGS`` - - .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites - :class-card: card-prerequisites - - * PyTorch 2.4 - * Complete the `Introduction to torch.compile `_ - -""" - -###################################################################### -# Overview -# ------------ -# Compiled Autograd is a ``torch.compile`` extension introduced in PyTorch 2.4 -# that allows the capture of a larger backward graph. -# -# While ``torch.compile`` does capture the backward graph, it does so **partially**. The AOTAutograd component captures the backward graph ahead-of-time, with certain limitations: -# * Graph breaks in the forward lead to graph breaks in the backward -# * `Backward hooks `_ are not captured -# -# Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing -# it to capture the full backward graph at runtime. Models with these two characteristics should try -# Compiled Autograd, and potentially observe better performance. -# -# However, Compiled Autograd introduces its own limitations: -# * Added runtime overhead at the start of the backward for cache lookup -# * More prone to recompiles and graph breaks in dynamo due to the larger capture -# -# .. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. 
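Since hook capture is one of the motivations listed in the overview above, here is a hedged sketch of a tensor hook registered on a parameter, using the same ``register_hook`` mechanism demonstrated elsewhere in this tutorial, which Compiled Autograd can record into its backward graph; the clamping range is an arbitrary illustration.

.. code:: python

    import torch

    torch._dynamo.config.compiled_autograd = True

    model = torch.nn.Linear(10, 10)
    # tensor hook on a parameter, e.g. clamping its gradient
    model.weight.register_hook(lambda grad: grad.clamp(-1.0, 1.0))

    @torch.compile
    def train(x):
        loss = model(x).sum()
        loss.backward()

    train(torch.randn(10))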
-# - - -###################################################################### -# Setup -# ------------ -# In this tutorial, we will base our examples on this simple neural network model. -# It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. - -import torch - -class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(10, 10) - - def forward(self, x): - return self.linear(x) - - -###################################################################### -# Basic usage -# ------------ -# Before calling the torch.compile API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: -# - -model = Model() -x = torch.randn(10) - -torch._dynamo.config.compiled_autograd = True -@torch.compile -def train(model, x): - loss = model(x).sum() - loss.backward() - -train(model, x) - -###################################################################### -# In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using torch.randn(10). -# We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. -# -# When ``train(model, x)`` is called: -# * Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile`` -# * Dynamo intercepts the python bytecode, simulates their execution and records the operations into a graph -# * AOTDispatcher disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. -# * Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward -# * Dynamo sets the optimized function to be evaluated next by Python Interpreter -# * Python Interpreter executes the optimized function, which basically executes ``loss = model(x).sum()`` -# * Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we enabled the config: ``torch._dynamo.config.compiled_autograd = True`` -# * Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode -# * The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher does not need to partition this graph into a forward and backward -# - -###################################################################### -# Inspecting the compiled autograd logs -# ------------------------------------- -# Run the script with the ``TORCH_LOGS`` environment variables: -# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` -# - To print the graph with more tensor metadata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` -# -# Rerun the snippet above, the compiled autograd graph should now be logged to ``stderr``. 
Certain graph nodes will have names that are prefixed by ``aot0_``, -# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0, for example, ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. -# - -stderr_output = """ -DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] -DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH - ===== Compiled autograd graph ===== - .4 class CompiledAutograd(torch.nn.Module): - def forward(self, inputs, sizes, scalars, hooks): - # No stacktrace found for following nodes - aot0_tangents_1: "f32[][]cpu" = inputs[0] - aot0_primals_3: "f32[10][1]cpu" = inputs[1] - getitem_2: "f32[10][1]cpu" = inputs[2] - getitem_3: "f32[10, 10][10, 1]cpu" = inputs[3]; inputs = None - - # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) - aot0_expand: "f32[10][0]cpu" = torch.ops.aten.expand.default(aot0_tangents_1, [10]); aot0_tangents_1 = None - aot0_view_2: "f32[1, 10][0, 0]cpu" = torch.ops.aten.view.default(aot0_expand, [1, 10]); aot0_expand = None - aot0_permute_2: "f32[10, 1][0, 0]cpu" = torch.ops.aten.permute.default(aot0_view_2, [1, 0]) - aot0_select: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 0) - aot0_view: "f32[1, 10][10, 1]cpu" = torch.ops.aten.view.default(aot0_primals_3, [1, 10]); aot0_primals_3 = None - aot0_mul_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select, aot0_view); aot0_select = None - aot0_select_1: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 1) - aot0_mul_4: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_1, aot0_view); aot0_select_1 = None - aot0_select_2: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 2) - aot0_mul_5: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_2, aot0_view); aot0_select_2 = None - aot0_select_3: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 3) - aot0_mul_6: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_3, aot0_view); aot0_select_3 = None - aot0_select_4: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 4) - aot0_mul_7: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_4, aot0_view); aot0_select_4 = None - aot0_select_5: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 5) - aot0_mul_8: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_5, aot0_view); aot0_select_5 = None - aot0_select_6: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 6) - aot0_mul_9: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_6, aot0_view); aot0_select_6 = None - aot0_select_7: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 7) - aot0_mul_10: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_7, aot0_view); aot0_select_7 = None - aot0_select_8: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 8) - aot0_mul_11: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_8, aot0_view); aot0_select_8 = None - aot0_select_9: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 9); aot0_permute_2 = None - aot0_mul_12: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_9, aot0_view); aot0_select_9 = aot0_view = None - aot0_cat: "f32[10, 10][10, 1]cpu" = 
torch.ops.aten.cat.default([aot0_mul_3, aot0_mul_4, aot0_mul_5, aot0_mul_6, aot0_mul_7, aot0_mul_8, aot0_mul_9, aot0_mul_10, aot0_mul_11, aot0_mul_12]); aot0_mul_3 = aot0_mul_4 = aot0_mul_5 = aot0_mul_6 = aot0_mul_7 = aot0_mul_8 = aot0_mul_9 = aot0_mul_10 = aot0_mul_11 = aot0_mul_12 = None - aot0_permute_3: "f32[10, 10][1, 10]cpu" = torch.ops.aten.permute.default(aot0_cat, [1, 0]); aot0_cat = None - aot0_sum_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.sum.dim_IntList(aot0_view_2, [0], True); aot0_view_2 = None - aot0_view_3: "f32[10][1]cpu" = torch.ops.aten.view.default(aot0_sum_3, [10]); aot0_sum_3 = None - - # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 2) - accumulate_grad_ = torch.ops.inductor.accumulate_grad_.default(getitem_2, aot0_view_3); getitem_2 = aot0_view_3 = accumulate_grad_ = None - - # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) - aot0_permute_4: "f32[10, 10][10, 1]cpu" = torch.ops.aten.permute.default(aot0_permute_3, [1, 0]); aot0_permute_3 = None - - # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 3) - accumulate_grad__1 = torch.ops.inductor.accumulate_grad_.default(getitem_3, aot0_permute_4); getitem_3 = aot0_permute_4 = accumulate_grad__1 = None - _exec_final_callbacks_stub = torch__dynamo_external_utils__exec_final_callbacks_stub(); _exec_final_callbacks_stub = None - return [] -""" - -###################################################################### -# .. note:: This is the graph on which we will call ``torch.compile``, **NOT** the optimized graph. Compiled Autograd essentially generates some unoptimized Python code to represent the entire C++ autograd execution. -# - -###################################################################### -# Compiling the forward and backward pass using different flags -# ------------------------------------------------------------- -# You can use different compiler configs for the two compilations, for example, the backward may be a fullgraph even if there are graph breaks in the forward. -# - -def train(model, x): - model = torch.compile(model) - loss = model(x).sum() - torch._dynamo.config.compiled_autograd = True - torch.compile(lambda: loss.backward(), fullgraph=True)() - -###################################################################### -# Or you can use the context manager, which will apply to all autograd calls within its scope. -# - -def train(model, x): - model = torch.compile(model) - loss = model(x).sum() - with torch._dynamo.compiled_autograd.enable(torch.compile(fullgraph=True)): - loss.backward() - - -###################################################################### -# Compiled Autograd addresses certain limitations of AOTAutograd -# -------------------------------------------------------------- -# 1. Graph breaks in the forward lead to graph breaks in the backward -# - -@torch.compile(backend="aot_eager") -def fn(x): - # 1st graph - temp = x + 10 - torch._dynamo.graph_break() - # 2nd graph - temp = temp + 10 - torch._dynamo.graph_break() - # 3rd graph - return temp.sum() - -x = torch.randn(10, 10, requires_grad=True) -torch._dynamo.utils.counters.clear() -loss = fn(x) - -# 1. 
base torch.compile -loss.backward(retain_graph=True) -assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 3) -torch._dynamo.utils.counters.clear() - -# 2. torch.compile with compiled autograd -with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): - loss.backward() - -# single graph for the backward -assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) - - -###################################################################### -# In the ``1. base torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. -# Whereas in ``2. torch.compile with compiled autograd``, we see that a full backward graph was traced despite the graph breaks. -# - -###################################################################### -# 2. Backward hooks are not captured -# - -@torch.compile(backend="aot_eager") -def fn(x): - return x.sum() - -x = torch.randn(10, 10, requires_grad=True) -x.register_hook(lambda grad: grad+10) -loss = fn(x) - -with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): - loss.backward() - -###################################################################### -# There should be a ``call_hook`` node in the graph, which dynamo will later inline into -# - -stderr_output = """ -DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] -DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH -===== Compiled autograd graph ===== -.2 class CompiledAutograd(torch.nn.Module): - def forward(self, inputs, sizes, scalars, hooks): - ... - getitem_2 = hooks[0]; hooks = None - call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None - ... -""" - -###################################################################### -# Common recompilation reasons for Compiled Autograd -# -------------------------------------------------- -# 1. Due to change in autograd structure - -torch._dynamo.config.compiled_autograd = True -x = torch.randn(10, requires_grad=True) -for op in [torch.add, torch.sub, torch.mul, torch.div]: - loss = op(x, x).sum() - torch.compile(lambda: loss.backward(), backend="eager")() - -###################################################################### -# You should see some recompile messages: **Cache miss due to new autograd node**. -# - -stderr_output = """ -Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] -... -Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] -... -Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] -... -Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] -... -""" - -###################################################################### -# 2. Due to dynamic shapes -# - -torch._dynamo.config.compiled_autograd = True -for i in [10, 100, 10]: - x = torch.randn(i, i, requires_grad=True) - loss = x.sum() - torch.compile(lambda: loss.backward(), backend="eager")() - -###################################################################### -# You should see some recompiles messages: **Cache miss due to changed shapes**. -# - -stderr_output = """ -... 
-Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic -Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -... -""" - -###################################################################### -# Conclusion -# ---------- -# In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. -# diff --git a/intermediate_source/compiled_autograd_tutorial.rst b/intermediate_source/compiled_autograd_tutorial.rst new file mode 100644 index 0000000000..22bc9904cb --- /dev/null +++ b/intermediate_source/compiled_autograd_tutorial.rst @@ -0,0 +1,301 @@ +Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` +========================================================================== +**Author:** `Simon Fan `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How compiled autograd interacts with ``torch.compile`` + * How to use the compiled autograd API + * How to inspect logs using ``TORCH_LOGS`` + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 + * Complete the `Introduction to torch.compile `_ + +Overview +-------- +Compiled Autograd is a ``torch.compile`` extension introduced in PyTorch 2.4 +that allows the capture of a larger backward graph. + +While ``torch.compile`` does capture the backward graph, it does so **partially**. The AOTAutograd component captures the backward graph ahead-of-time, with certain limitations: + +* Graph breaks in the forward lead to graph breaks in the backward +* `Backward hooks `_ are not captured + +Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing +it to capture the full backward graph at runtime. Models with these two characteristics should try +Compiled Autograd, and potentially observe better performance. + +However, Compiled Autograd introduces its own limitations: + +* Added runtime overhead at the start of the backward for cache lookup +* More prone to recompiles and graph breaks in dynamo due to the larger capture + +.. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. + +Setup +----- +In this tutorial, we will base our examples on this simple neural network model. +It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. + +.. code:: python + + import torch + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + + def forward(self, x): + return self.linear(x) + +Basic usage +------------ +Before calling the torch.compile API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: + +.. 
code:: python + + model = Model() + x = torch.randn(10) + + torch._dynamo.config.compiled_autograd = True + @torch.compile + def train(model, x): + loss = model(x).sum() + loss.backward() + + train(model, x) + +In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using torch.randn(10). +We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. +When ``train(model, x)`` is called: + +* Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile`` +* Dynamo intercepts the python bytecode, simulates their execution and records the operations into a graph +* AOTDispatcher disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. +* Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward +* Dynamo sets the optimized function to be evaluated next by Python Interpreter +* Python Interpreter executes the optimized function, which basically executes ``loss = model(x).sum()`` +* Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we enabled the config: ``torch._dynamo.config.compiled_autograd = True`` +* Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode +* The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher does not need to partition this graph into a forward and backward + +Inspecting the compiled autograd logs +------------------------------------- +Run the script with the ``TORCH_LOGS`` environment variables: + +* To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` +* To print the graph with more tensor metadata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` + +Rerun the snippet above, the compiled autograd graph should now be logged to ``stderr``. Certain graph nodes will have names that are prefixed by ``aot0_``, +these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0, for example, ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. + + +.. 
code:: python + + stderr_output = """ + DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] + DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH + ===== Compiled autograd graph ===== + .4 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + # No stacktrace found for following nodes + aot0_tangents_1: "f32[][]cpu" = inputs[0] + aot0_primals_3: "f32[10][1]cpu" = inputs[1] + getitem_2: "f32[10][1]cpu" = inputs[2] + getitem_3: "f32[10, 10][10, 1]cpu" = inputs[3]; inputs = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) + aot0_expand: "f32[10][0]cpu" = torch.ops.aten.expand.default(aot0_tangents_1, [10]); aot0_tangents_1 = None + aot0_view_2: "f32[1, 10][0, 0]cpu" = torch.ops.aten.view.default(aot0_expand, [1, 10]); aot0_expand = None + aot0_permute_2: "f32[10, 1][0, 0]cpu" = torch.ops.aten.permute.default(aot0_view_2, [1, 0]) + aot0_select: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 0) + aot0_view: "f32[1, 10][10, 1]cpu" = torch.ops.aten.view.default(aot0_primals_3, [1, 10]); aot0_primals_3 = None + aot0_mul_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select, aot0_view); aot0_select = None + aot0_select_1: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 1) + aot0_mul_4: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_1, aot0_view); aot0_select_1 = None + aot0_select_2: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 2) + aot0_mul_5: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_2, aot0_view); aot0_select_2 = None + aot0_select_3: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 3) + aot0_mul_6: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_3, aot0_view); aot0_select_3 = None + aot0_select_4: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 4) + aot0_mul_7: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_4, aot0_view); aot0_select_4 = None + aot0_select_5: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 5) + aot0_mul_8: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_5, aot0_view); aot0_select_5 = None + aot0_select_6: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 6) + aot0_mul_9: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_6, aot0_view); aot0_select_6 = None + aot0_select_7: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 7) + aot0_mul_10: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_7, aot0_view); aot0_select_7 = None + aot0_select_8: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 8) + aot0_mul_11: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_8, aot0_view); aot0_select_8 = None + aot0_select_9: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 9); aot0_permute_2 = None + aot0_mul_12: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_9, aot0_view); aot0_select_9 = aot0_view = None + aot0_cat: "f32[10, 10][10, 1]cpu" = torch.ops.aten.cat.default([aot0_mul_3, aot0_mul_4, aot0_mul_5, aot0_mul_6, aot0_mul_7, aot0_mul_8, aot0_mul_9, aot0_mul_10, aot0_mul_11, aot0_mul_12]); aot0_mul_3 = aot0_mul_4 = aot0_mul_5 = aot0_mul_6 = aot0_mul_7 = aot0_mul_8 = aot0_mul_9 = aot0_mul_10 = aot0_mul_11 = aot0_mul_12 
= None + aot0_permute_3: "f32[10, 10][1, 10]cpu" = torch.ops.aten.permute.default(aot0_cat, [1, 0]); aot0_cat = None + aot0_sum_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.sum.dim_IntList(aot0_view_2, [0], True); aot0_view_2 = None + aot0_view_3: "f32[10][1]cpu" = torch.ops.aten.view.default(aot0_sum_3, [10]); aot0_sum_3 = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 2) + accumulate_grad_ = torch.ops.inductor.accumulate_grad_.default(getitem_2, aot0_view_3); getitem_2 = aot0_view_3 = accumulate_grad_ = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) + aot0_permute_4: "f32[10, 10][10, 1]cpu" = torch.ops.aten.permute.default(aot0_permute_3, [1, 0]); aot0_permute_3 = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 3) + accumulate_grad__1 = torch.ops.inductor.accumulate_grad_.default(getitem_3, aot0_permute_4); getitem_3 = aot0_permute_4 = accumulate_grad__1 = None + _exec_final_callbacks_stub = torch__dynamo_external_utils__exec_final_callbacks_stub(); _exec_final_callbacks_stub = None + return [] + """ + +.. note:: This is the graph on which we will call ``torch.compile``, **NOT** the optimized graph. Compiled Autograd essentially generates some unoptimized Python code to represent the entire C++ autograd execution. + +Compiling the forward and backward pass using different flags +------------------------------------------------------------- +You can use different compiler configs for the two compilations, for example, the backward may be a fullgraph even if there are graph breaks in the forward. + +.. code:: python + +def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + torch._dynamo.config.compiled_autograd = True + torch.compile(lambda: loss.backward(), fullgraph=True)() + +Or you can use the context manager, which will apply to all autograd calls within its scope. + +.. code:: python + + def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + with torch._dynamo.compiled_autograd.enable(torch.compile(fullgraph=True)): + loss.backward() + + +Compiled Autograd addresses certain limitations of AOTAutograd +-------------------------------------------------------------- +1. Graph breaks in the forward lead to graph breaks in the backward + +.. code:: python + + @torch.compile(backend="aot_eager") + def fn(x): + # 1st graph + temp = x + 10 + torch._dynamo.graph_break() + # 2nd graph + temp = temp + 10 + torch._dynamo.graph_break() + # 3rd graph + return temp.sum() + + x = torch.randn(10, 10, requires_grad=True) + torch._dynamo.utils.counters.clear() + loss = fn(x) + + # 1. base torch.compile + loss.backward(retain_graph=True) + assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 3) + torch._dynamo.utils.counters.clear() + + # 2. torch.compile with compiled autograd + with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + + # single graph for the backward + assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) + + +In the ``1. base torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. +Whereas in ``2. 
torch.compile with compiled autograd``, we see that a full backward graph was traced despite the graph breaks. + +2. Backward hooks are not captured + +.. code:: python + + @torch.compile(backend="aot_eager") + def fn(x): + return x.sum() + + x = torch.randn(10, 10, requires_grad=True) + x.register_hook(lambda grad: grad+10) + loss = fn(x) + + with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + +There should be a ``call_hook`` node in the graph, which dynamo will later inline into + +.. code:: python + + stderr_output = """ + DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] + DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH + ===== Compiled autograd graph ===== + .2 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + ... + getitem_2 = hooks[0]; hooks = None + call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None + ... + """ + +Common recompilation reasons for Compiled Autograd +-------------------------------------------------- +1. Due to change in autograd structure + +.. code:: python + + torch._dynamo.config.compiled_autograd = True + x = torch.randn(10, requires_grad=True) + for op in [torch.add, torch.sub, torch.mul, torch.div]: + loss = op(x, x).sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +You should see some recompile messages: **Cache miss due to new autograd node**. + +.. code:: python + + stderr_output = """ + Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] + ... + Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] + ... + Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] + ... + Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] + ... + """ + +2. Due to dynamic shapes + +.. code:: python + + torch._dynamo.config.compiled_autograd = True + for i in [10, 100, 10]: + x = torch.randn(i, i, requires_grad=True) + loss = x.sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +You should see some recompiles messages: **Cache miss due to changed shapes**. + +.. code:: python + + stderr_output = """ + ... + Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic + Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic + Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic + Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic + ... + """ + +Conclusion +---------- +In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. 
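For readers who want to see the pieces of the tutorial above in one place, below is a minimal end-to-end training-loop sketch. It reuses the toy ``Model`` from the Setup section; the optimizer choice, learning rate, and number of iterations are illustrative assumptions rather than requirements, and the backward is captured through the same ``torch._dynamo.compiled_autograd.enable`` context manager shown earlier.

.. code:: python

   import torch

   class Model(torch.nn.Module):
       def __init__(self):
           super().__init__()
           self.linear = torch.nn.Linear(10, 10)

       def forward(self, x):
           return self.linear(x)

   model = Model()
   # Illustrative optimizer and learning rate; any optimizer fits the same pattern.
   optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

   # Compile the forward pass; the backward pass is captured separately below.
   compiled_model = torch.compile(model)

   # Compiler used for the backward graph captured by Compiled Autograd.
   backward_compiler = torch.compile(backend="aot_eager")

   for _ in range(3):  # illustrative number of steps
       x = torch.randn(10)
       loss = compiled_model(x).sum()
       # Capture and compile the full backward graph at runtime.
       with torch._dynamo.compiled_autograd.enable(backward_compiler):
           loss.backward()
       optimizer.step()
       optimizer.zero_grad()

Keeping the context manager scoped to ``loss.backward()`` leaves the forward compilation flags independent of the backward ones, mirroring the earlier section on compiling the two passes with different flags.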
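As a complement to the backward-hook example above, the following sketch numerically checks that a tensor hook still takes effect when the backward is captured by Compiled Autograd. The expected gradient values are worked out in the comments and are specific to this toy function, not a general property of the API.

.. code:: python

   import torch

   @torch.compile(backend="aot_eager")
   def fn(x):
       return x.sum()

   x = torch.randn(10, 10, requires_grad=True)
   # For loss = x.sum(), d(loss)/dx is a tensor of ones; the hook shifts it by 10.
   x.register_hook(lambda grad: grad + 10)
   loss = fn(x)

   with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")):
       loss.backward()

   # The hook ran inside the captured backward, so every entry should be 11.
   assert torch.allclose(x.grad, torch.full((10, 10), 11.0))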
From 94d061203e50bf447a6c1564528436fb061840da Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Fri, 13 Sep 2024 14:48:58 -0700 Subject: [PATCH 08/10] address comments --- .../compiled_autograd_tutorial.rst | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.rst b/intermediate_source/compiled_autograd_tutorial.rst index 22bc9904cb..bf2de77f1b 100644 --- a/intermediate_source/compiled_autograd_tutorial.rst +++ b/intermediate_source/compiled_autograd_tutorial.rst @@ -16,6 +16,7 @@ Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` * PyTorch 2.4 * Complete the `Introduction to torch.compile `_ + * Read through the TorchDynamo and AOTAutograd sections of `Get Started with PyTorch 2.x `_ Overview -------- @@ -41,7 +42,7 @@ However, Compiled Autograd introduces its own limitations: Setup ----- In this tutorial, we will base our examples on this simple neural network model. -It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. +It takes a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. .. code:: python @@ -57,7 +58,7 @@ It takes a a 10-dimensional input vector, processes it through a single linear l Basic usage ------------ -Before calling the torch.compile API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: +Before calling the ``torch.compile`` API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: .. code:: python @@ -72,19 +73,19 @@ Before calling the torch.compile API, make sure to set ``torch._dynamo.config.co train(model, x) -In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using torch.randn(10). +In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using ``torch.randn(10)``. We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. When ``train(model, x)`` is called: -* Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile`` -* Dynamo intercepts the python bytecode, simulates their execution and records the operations into a graph -* AOTDispatcher disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. -* Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward -* Dynamo sets the optimized function to be evaluated next by Python Interpreter -* Python Interpreter executes the optimized function, which basically executes ``loss = model(x).sum()`` -* Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we enabled the config: ``torch._dynamo.config.compiled_autograd = True`` -* Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this, it will record the backward previously rewritten by AOTDispatcher. 
Compiled Autograd then generates a new function which corresponds to a fully traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode -* The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher does not need to partition this graph into a forward and backward +* Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile``. +* Dynamo intercepts the Python bytecode, simulates their execution and records the operations into a graph. +* ``AOTDispatcher`` disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. +* Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward. +* Dynamo sets the optimized function to be evaluated next by Python Interpreter. +* Python Interpreter executes the optimized function, which executes ``loss = model(x).sum()``. +* Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we set ``torch._dynamo.config.compiled_autograd = True``. +* Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this process, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully-traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode. +* The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher will not need to partition the graph. Inspecting the compiled autograd logs ------------------------------------- @@ -180,7 +181,7 @@ Or you can use the context manager, which will apply to all autograd calls withi Compiled Autograd addresses certain limitations of AOTAutograd -------------------------------------------------------------- -1. Graph breaks in the forward lead to graph breaks in the backward +1. Graph breaks in the forward pass lead to graph breaks in the backward pass: .. code:: python @@ -248,7 +249,7 @@ There should be a ``call_hook`` node in the graph, which dynamo will later inlin Common recompilation reasons for Compiled Autograd -------------------------------------------------- -1. Due to change in autograd structure +1. Due to changes in the autograd structure of the loss value .. code:: python @@ -258,7 +259,7 @@ Common recompilation reasons for Compiled Autograd loss = op(x, x).sum() torch.compile(lambda: loss.backward(), backend="eager")() -You should see some recompile messages: **Cache miss due to new autograd node**. +In the example above, we call a different operator on each iteration, leading to ``loss`` tracking a different autograd history each time. You should see some recompile messages: **Cache miss due to new autograd node**. .. code:: python @@ -273,7 +274,7 @@ You should see some recompile messages: **Cache miss due to new autograd node**. ... """ -2. Due to dynamic shapes +2. Due to tensors changing shapes .. code:: python @@ -283,7 +284,7 @@ You should see some recompile messages: **Cache miss due to new autograd node**. 
loss = x.sum() torch.compile(lambda: loss.backward(), backend="eager")() -You should see some recompiles messages: **Cache miss due to changed shapes**. +In the example above, ``x`` changes shapes, and compiled autograd will mark ``x`` as a dynamic shape tensor after the first change. You should see recompiles messages: **Cache miss due to changed shapes**. .. code:: python @@ -298,4 +299,4 @@ You should see some recompiles messages: **Cache miss due to changed shapes**. Conclusion ---------- -In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. +In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. Stay tuned for deep dives on `dev-discuss `_. From 70c7434066d49c7a07004794588e8a96367b964e Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 9 Oct 2024 10:36:54 -0700 Subject: [PATCH 09/10] Add a card and toctree --- index.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/index.rst b/index.rst index 95c4a8f3ef..76a4d2e09d 100644 --- a/index.rst +++ b/index.rst @@ -439,6 +439,13 @@ Welcome to PyTorch Tutorials :link: advanced/python_custom_ops.html :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA +.. customcarditem:: + :header: Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` + :card_description: Learn how to use compiled autograd to capture a larger backward graph. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/compiled_autograd_tutorial + :tags: Model-Optimization,CUDA + .. customcarditem:: :header: Custom C++ and CUDA Operators :card_description: How to extend PyTorch with custom C++ and CUDA operators. @@ -1132,6 +1139,7 @@ Additional Resources intermediate/nvfuser_intro_tutorial intermediate/ax_multiobjective_nas_tutorial intermediate/torch_compile_tutorial + intermediate/compiled_autograd_tutorial intermediate/inductor_debug_cpu intermediate/scaled_dot_product_attention_tutorial beginner/knowledge_distillation_tutorial From 83d4665d6731e77c4ad0eed8e3b6468ce1155ef6 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 9 Oct 2024 15:01:11 -0700 Subject: [PATCH 10/10] Minor editorial and formatting fixes --- .../compiled_autograd_tutorial.rst | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.rst b/intermediate_source/compiled_autograd_tutorial.rst index bf2de77f1b..bcae7e63da 100644 --- a/intermediate_source/compiled_autograd_tutorial.rst +++ b/intermediate_source/compiled_autograd_tutorial.rst @@ -162,11 +162,11 @@ You can use different compiler configs for the two compilations, for example, th .. code:: python -def train(model, x): - model = torch.compile(model) - loss = model(x).sum() - torch._dynamo.config.compiled_autograd = True - torch.compile(lambda: loss.backward(), fullgraph=True)() + def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + torch._dynamo.config.compiled_autograd = True + torch.compile(lambda: loss.backward(), fullgraph=True)() Or you can use the context manager, which will apply to all autograd calls within its scope. @@ -213,8 +213,8 @@ Compiled Autograd addresses certain limitations of AOTAutograd assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) -In the ``1. 
base torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. -Whereas in ``2. torch.compile with compiled autograd``, we see that a full backward graph was traced despite the graph breaks. +In the first ``torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. +Whereas in the second ``torch.compile`` with compiled autograd case, we see that a full backward graph was traced despite the graph breaks. 2. Backward hooks are not captured @@ -231,7 +231,7 @@ Whereas in ``2. torch.compile with compiled autograd``, we see that a full backw with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): loss.backward() -There should be a ``call_hook`` node in the graph, which dynamo will later inline into +There should be a ``call_hook`` node in the graph, which dynamo will later inline into the following: .. code:: python @@ -249,7 +249,7 @@ There should be a ``call_hook`` node in the graph, which dynamo will later inlin Common recompilation reasons for Compiled Autograd -------------------------------------------------- -1. Due to changes in the autograd structure of the loss value +1. Due to changes in the autograd structure of the loss value: .. code:: python @@ -274,7 +274,7 @@ In the example above, we call a different operator on each iteration, leading to ... """ -2. Due to tensors changing shapes +2. Due to tensors changing shapes: .. code:: python