From 866ab425aaa2d431e3177674002384a974cf514f Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Tue, 3 Sep 2024 09:28:08 -0700 Subject: [PATCH 01/10] Add compiled autograd tutorial --- .../compiled_autograd_tutorial.py | 275 ++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 intermediate_source/compiled_autograd_tutorial.py diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py new file mode 100644 index 0000000000..3b8bdd68c6 --- /dev/null +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -0,0 +1,275 @@ +# -*- coding: utf-8 -*- + +""" +Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` +========================================================================== + +""" + +###################################################################### +# Compiled Autograd is a torch.compile extension introduced in PyTorch 2.4 +# that allows the capture of a larger backward graph. It is highly recommended +# to familiarize yourself with `torch.compile `_. +# + +###################################################################### +# Doesn't torch.compile already capture the backward graph? +# ------------ +# Partially. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: +# - Graph breaks in the forward lead to graph breaks in the backward +# - `Backward hooks `_ are not captured +# +# Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing +# it to capture the full backward graph at runtime. Models with these two characteristics should try +# Compiled Autograd, and potentially observe better performance. +# +# However, Compiled Autograd has its own limitations: +# - Dynamic autograd structure leads to recompiles +# + +###################################################################### +# Basic Usage +# ------------ +# + +# NOTE: Must be enabled before using the decorator +torch._dynamo.config.compiled_autograd = True + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + + def forward(self, x): + return self.linear(x) + +@torch.compile +def train(model, x): + loss = model(x).sum() + loss.backward() + +model = Model() +x = torch.randn(10) +train(model, x) + +###################################################################### +# Inspecting the compiled autograd logs +# ------------ +# Run the script with either TORCH_LOGS environment variables +# +""" +Prints graph: +TORCH_LOGS="compiled_autograd" python example.py + +Performance degrading, prints verbose graph and recompile reasons: +TORCH_LOGS="compiled_autograd_verbose" python example.py +""" + +###################################################################### +# Or with the set_logs private API: +# + +# flag must be enabled before wrapping using torch.compile +torch._logging._internal.set_logs(compiled_autograd=True) + +@torch.compile +def train(model, x): + loss = model(x).sum() + loss.backward() + +train(model, x) + +###################################################################### +# The compiled autograd graph should now be logged to stdout. Certain graph nodes will have names that are prefixed by "aot0_", +# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0. +# +# NOTE: This is the graph that we will call torch.compile on, NOT the optimized graph. 
Compiled Autograd basically +# generated some python code to represent the entire C++ autograd execution. +# +""" +INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH + ===== Compiled autograd graph ===== + .4 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + # No stacktrace found for following nodes + aot0_tangents_1: "f32[][]cpu" = inputs[0] + aot0_primals_3: "f32[10][1]cpu" = inputs[1] + getitem_2: "f32[10][1]cpu" = inputs[2] + getitem_3: "f32[10, 10][10, 1]cpu" = inputs[3]; inputs = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) + aot0_expand: "f32[10][0]cpu" = torch.ops.aten.expand.default(aot0_tangents_1, [10]); aot0_tangents_1 = None + aot0_view_2: "f32[1, 10][0, 0]cpu" = torch.ops.aten.view.default(aot0_expand, [1, 10]); aot0_expand = None + aot0_permute_2: "f32[10, 1][0, 0]cpu" = torch.ops.aten.permute.default(aot0_view_2, [1, 0]) + aot0_select: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 0) + aot0_view: "f32[1, 10][10, 1]cpu" = torch.ops.aten.view.default(aot0_primals_3, [1, 10]); aot0_primals_3 = None + aot0_mul_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select, aot0_view); aot0_select = None + aot0_select_1: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 1) + aot0_mul_4: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_1, aot0_view); aot0_select_1 = None + aot0_select_2: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 2) + aot0_mul_5: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_2, aot0_view); aot0_select_2 = None + aot0_select_3: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 3) + aot0_mul_6: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_3, aot0_view); aot0_select_3 = None + aot0_select_4: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 4) + aot0_mul_7: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_4, aot0_view); aot0_select_4 = None + aot0_select_5: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 5) + aot0_mul_8: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_5, aot0_view); aot0_select_5 = None + aot0_select_6: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 6) + aot0_mul_9: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_6, aot0_view); aot0_select_6 = None + aot0_select_7: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 7) + aot0_mul_10: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_7, aot0_view); aot0_select_7 = None + aot0_select_8: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 8) + aot0_mul_11: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_8, aot0_view); aot0_select_8 = None + aot0_select_9: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 9); aot0_permute_2 = None + aot0_mul_12: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_9, aot0_view); aot0_select_9 = aot0_view = None + aot0_cat: "f32[10, 10][10, 1]cpu" = torch.ops.aten.cat.default([aot0_mul_3, aot0_mul_4, aot0_mul_5, aot0_mul_6, aot0_mul_7, aot0_mul_8, aot0_mul_9, aot0_mul_10, aot0_mul_11, aot0_mul_12]); aot0_mul_3 = aot0_mul_4 = aot0_mul_5 = aot0_mul_6 = aot0_mul_7 = aot0_mul_8 = aot0_mul_9 = aot0_mul_10 = aot0_mul_11 = aot0_mul_12 = None + aot0_permute_3: "f32[10, 10][1, 10]cpu" = torch.ops.aten.permute.default(aot0_cat, [1, 0]); aot0_cat = None + 
aot0_sum_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.sum.dim_IntList(aot0_view_2, [0], True); aot0_view_2 = None + aot0_view_3: "f32[10][1]cpu" = torch.ops.aten.view.default(aot0_sum_3, [10]); aot0_sum_3 = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 2) + accumulate_grad_ = torch.ops.inductor.accumulate_grad_.default(getitem_2, aot0_view_3); getitem_2 = aot0_view_3 = accumulate_grad_ = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) + aot0_permute_4: "f32[10, 10][10, 1]cpu" = torch.ops.aten.permute.default(aot0_permute_3, [1, 0]); aot0_permute_3 = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 3) + accumulate_grad__1 = torch.ops.inductor.accumulate_grad_.default(getitem_3, aot0_permute_4); getitem_3 = aot0_permute_4 = accumulate_grad__1 = None + _exec_final_callbacks_stub = torch__dynamo_external_utils__exec_final_callbacks_stub(); _exec_final_callbacks_stub = None + return [] +""" + +###################################################################### +# Compiling the forward and backward pass using different flags +# ------------ +# + +def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + torch.compile(lambda: loss.backward(), fullgraph=True)() + +###################################################################### +# Or you can use the context manager, which will apply to all autograd calls within it +# + +def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + with torch._dynamo.compiled_autograd.enable(torch.compile(fullgraph=True)): + loss.backward() + + +###################################################################### +# Demonstrating the limitations of AOTAutograd addressed by Compiled Autograd +# ------------ +# 1. Graph breaks in the forward lead to graph breaks in the backward +# + +@torch.compile(backend="aot_eager") +def fn(x): + # 1st graph + temp = x + 10 + torch._dynamo.graph_break() + # 2nd graph + temp = temp + 10 + torch._dynamo.graph_break() + # 3rd graph + return temp.sum() + +x = torch.randn(10, 10, requires_grad=True) +loss = fn(x) + +# 1. base torch.compile +loss.backward(retain_graph=True) +assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 3) +torch._dynamo.utils.counters.clear() + +# 2. torch.compile with compiled autograd +with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + +# single graph for the backward +assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) + + +###################################################################### +# 2. 
`Backward hooks are not captured +# + +@torch.compile(backend="aot_eager") +def fn(x): + return x.sum() + +x = torch.randn(10, 10, requires_grad=True) +x.register_hook(lambda grad: grad+10) +loss = fn(x) + +torch._logging._internal.set_logs(compiled_autograd=True) +with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + +###################################################################### +# There is a `call_hook` node in the graph, which dynamo will inline +# + +""" +INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH + ===== Compiled autograd graph ===== + .2 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + ... + getitem_2 = hooks[0]; hooks = None + call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None + ... +""" + +###################################################################### +# Understanding recompilation reasons for Compiled Autograd +# ------------ +# 1. Due to change in autograd structure + +torch._logging._internal.set_logs(compiled_autograd_verbose=True) +torch._dynamo.config.compiled_autograd = True +x = torch.randn(10, requires_grad=True) +for op in [torch.add, torch.sub, torch.mul, torch.div]: + loss = op(x, x).sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +###################################################################### +# You should see some cache miss logs (recompiles): +# Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] +# ... +# Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] +# ... +# Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] +# ... +# Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] +# ... + +###################################################################### +# 2. Due to dynamic shapes +# + +torch._logging._internal.set_logs(compiled_autograd_verbose=True) +torch._dynamo.config.compiled_autograd = True +for i in [10, 100, 10]: + x = torch.randn(i, i, requires_grad=True) + loss = x.sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +###################################################################### +# You should see some cache miss logs (recompiles): +# ... +# Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic +# Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +# Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +# Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +# ... + +###################################################################### +# Compatibility and rough edges +# ------------ +# +# Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. +# For the latest status on a particular feature, refer to: https://docs.google.com/document/d/11VucFBEewzqgkABIjebZIzMvrXr3BtcY1aGKpX61pJY. 
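The recompilation behavior described above can also be confirmed programmatically. The following is a minimal sketch that reruns the dynamic-shape loop and reads the same ``torch._dynamo.utils.counters`` bookkeeping already used in the graph-break example; the exact counter keys (``"stats"`` / ``"unique_graphs"``) are assumed to match the ones shown there.

.. code:: python

    import torch

    torch._dynamo.config.compiled_autograd = True
    torch._dynamo.utils.counters.clear()

    for i in [10, 100, 10]:
        x = torch.randn(i, i, requires_grad=True)
        loss = x.sum()
        torch.compile(lambda: loss.backward(), backend="eager")()

    # Each new input shape may produce another backward graph until compiled
    # autograd marks the relevant sizes as dynamic, so this count mirrors the
    # cache misses reported in the verbose logs.
    print(torch._dynamo.utils.counters["stats"]["unique_graphs"])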
From 7483147343e327bc851db771822ed45c9acf4a79 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Tue, 3 Sep 2024 19:20:32 -0700 Subject: [PATCH 02/10] update --- .../compiled_autograd_tutorial.py | 101 +++++++++++------- 1 file changed, 61 insertions(+), 40 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index 3b8bdd68c6..4b5e2bbebf 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -16,22 +16,43 @@ # Doesn't torch.compile already capture the backward graph? # ------------ # Partially. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: -# - Graph breaks in the forward lead to graph breaks in the backward -# - `Backward hooks `_ are not captured +# - Graph breaks in the forward lead to graph breaks in the backward +# - `Backward hooks `_ are not captured # # Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing # it to capture the full backward graph at runtime. Models with these two characteristics should try # Compiled Autograd, and potentially observe better performance. # # However, Compiled Autograd has its own limitations: -# - Dynamic autograd structure leads to recompiles +# - Dynamic autograd structure leads to recompiles # +###################################################################### +# Tutorial output cells setup +# ------------ +# + +import os + +class ScopedLogging: + def __init__(self): + assert "TORCH_LOGS" not in os.environ + assert "TORCH_LOGS_FORMAT" not in os.environ + os.environ["TORCH_LOGS"] = "compiled_autograd_verbose" + os.environ["TORCH_LOGS_FORMAT"] = "short" + + def __del__(self): + del os.environ["TORCH_LOGS"] + del os.environ["TORCH_LOGS_FORMAT"] + + ###################################################################### # Basic Usage # ------------ # +import torch + # NOTE: Must be enabled before using the decorator torch._dynamo.config.compiled_autograd = True @@ -57,21 +78,12 @@ def train(model, x): # ------------ # Run the script with either TORCH_LOGS environment variables # -""" -Prints graph: -TORCH_LOGS="compiled_autograd" python example.py - -Performance degrading, prints verbose graph and recompile reasons: -TORCH_LOGS="compiled_autograd_verbose" python example.py -""" - -###################################################################### -# Or with the set_logs private API: +# - To only print the compiled autograd graph, use `TORCH_LOGS="compiled_autograd" python example.py` +# - To sacrifice some performance, in order to print the graph with more tensor medata and recompile reasons, use `TORCH_LOGS="compiled_autograd_verbose" python example.py` +# +# Logs can also be enabled through the private API torch._logging._internal.set_logs. # -# flag must be enabled before wrapping using torch.compile -torch._logging._internal.set_logs(compiled_autograd=True) - @torch.compile def train(model, x): loss = model(x).sum() @@ -80,14 +92,15 @@ def train(model, x): train(model, x) ###################################################################### -# The compiled autograd graph should now be logged to stdout. Certain graph nodes will have names that are prefixed by "aot0_", +# The compiled autograd graph should now be logged to stdout. Certain graph nodes will have names that are prefixed by aot0_, # these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0. 
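As mentioned a few lines above, the logs can also be enabled from inside the script through the private ``torch._logging._internal.set_logs`` API rather than the ``TORCH_LOGS`` environment variable. Below is a hedged, self-contained sketch; because the API is private its keyword names may change between releases, and the single linear layer here is just a stand-in for the tutorial's ``Model`` class.

.. code:: python

    import torch

    torch._dynamo.config.compiled_autograd = True

    # roughly equivalent to TORCH_LOGS="compiled_autograd"; pass
    # compiled_autograd_verbose=True instead for recompile reasons
    # and extra tensor metadata, at some performance cost
    torch._logging._internal.set_logs(compiled_autograd=True)

    model = torch.nn.Linear(10, 10)

    @torch.compile
    def train(x):
        loss = model(x).sum()
        loss.backward()

    train(torch.randn(10))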
# # NOTE: This is the graph that we will call torch.compile on, NOT the optimized graph. Compiled Autograd basically # generated some python code to represent the entire C++ autograd execution. # """ -INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH +DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] +DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH ===== Compiled autograd graph ===== .4 class CompiledAutograd(torch.nn.Module): def forward(self, inputs, sizes, scalars, hooks): @@ -178,6 +191,7 @@ def fn(x): return temp.sum() x = torch.randn(10, 10, requires_grad=True) +torch._dynamo.utils.counters.clear() loss = fn(x) # 1. base torch.compile @@ -205,7 +219,6 @@ def fn(x): x.register_hook(lambda grad: grad+10) loss = fn(x) -torch._logging._internal.set_logs(compiled_autograd=True) with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): loss.backward() @@ -214,22 +227,22 @@ def fn(x): # """ -INFO:torch._dynamo.compiled_autograd.__compiled_autograd:TRACED GRAPH +DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] +DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH ===== Compiled autograd graph ===== .2 class CompiledAutograd(torch.nn.Module): def forward(self, inputs, sizes, scalars, hooks): - ... - getitem_2 = hooks[0]; hooks = None - call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None - ... + ... + getitem_2 = hooks[0]; hooks = None + call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None + ... """ ###################################################################### -# Understanding recompilation reasons for Compiled Autograd +# Common recompilation reasons for Compiled Autograd # ------------ # 1. Due to change in autograd structure -torch._logging._internal.set_logs(compiled_autograd_verbose=True) torch._dynamo.config.compiled_autograd = True x = torch.randn(10, requires_grad=True) for op in [torch.add, torch.sub, torch.mul, torch.div]: @@ -238,14 +251,18 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # You should see some cache miss logs (recompiles): -# Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] -# ... -# Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] -# ... -# Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] -# ... -# Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] -# ... +# + +""" +Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] +... +Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] +... +Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] +... 
+Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] +... +""" ###################################################################### # 2. Due to dynamic shapes @@ -260,12 +277,16 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # You should see some cache miss logs (recompiles): -# ... -# Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic -# Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -# Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -# Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -# ... +# + +""" +... +Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic +Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic +... +""" ###################################################################### # Compatibility and rough edges From 6cccee05681bf34ed358cbdf83cfbfbe9066e94b Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Wed, 4 Sep 2024 11:46:22 -0700 Subject: [PATCH 03/10] update --- .../compiled_autograd_tutorial.py | 146 ++++++++++-------- 1 file changed, 79 insertions(+), 67 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index 4b5e2bbebf..4fd58e9743 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -4,58 +4,57 @@ Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` ========================================================================== +**Author:** `Simon Fan `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How compiled autograd interacts with torch.compile + * How to use the compiled autograd API + * How to inspect logs using TORCH_LOGS + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 + * `torch.compile `_ familiarity + """ ###################################################################### +# Overview +# ------------ # Compiled Autograd is a torch.compile extension introduced in PyTorch 2.4 -# that allows the capture of a larger backward graph. It is highly recommended -# to familiarize yourself with `torch.compile `_. +# that allows the capture of a larger backward graph. # - -###################################################################### # Doesn't torch.compile already capture the backward graph? # ------------ -# Partially. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: -# - Graph breaks in the forward lead to graph breaks in the backward -# - `Backward hooks `_ are not captured +# And it does, **partially**. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: +# 1. Graph breaks in the forward lead to graph breaks in the backward +# 2. 
`Backward hooks `_ are not captured # # Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing # it to capture the full backward graph at runtime. Models with these two characteristics should try # Compiled Autograd, and potentially observe better performance. # # However, Compiled Autograd has its own limitations: -# - Dynamic autograd structure leads to recompiles +# 1. Additional runtime overhead at the start of the backward +# 2. Dynamic autograd structure leads to recompiles +# +# .. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. # -###################################################################### -# Tutorial output cells setup -# ------------ -# - -import os - -class ScopedLogging: - def __init__(self): - assert "TORCH_LOGS" not in os.environ - assert "TORCH_LOGS_FORMAT" not in os.environ - os.environ["TORCH_LOGS"] = "compiled_autograd_verbose" - os.environ["TORCH_LOGS_FORMAT"] = "short" - - def __del__(self): - del os.environ["TORCH_LOGS"] - del os.environ["TORCH_LOGS_FORMAT"] - ###################################################################### -# Basic Usage +# Setup # ------------ -# +# In this tutorial, we'll base our examples on this toy model. +# import torch -# NOTE: Must be enabled before using the decorator -torch._dynamo.config.compiled_autograd = True - class Model(torch.nn.Module): def __init__(self): super().__init__() @@ -64,24 +63,30 @@ def __init__(self): def forward(self, x): return self.linear(x) + +###################################################################### +# Basic usage +# ------------ +# .. note:: The ``torch._dynamo.config.compiled_autograd = True`` config must be enabled before calling the torch.compile API. +# + +model = Model() +x = torch.randn(10) + +torch._dynamo.config.compiled_autograd = True @torch.compile def train(model, x): loss = model(x).sum() loss.backward() -model = Model() -x = torch.randn(10) train(model, x) ###################################################################### # Inspecting the compiled autograd logs # ------------ -# Run the script with either TORCH_LOGS environment variables -# -# - To only print the compiled autograd graph, use `TORCH_LOGS="compiled_autograd" python example.py` -# - To sacrifice some performance, in order to print the graph with more tensor medata and recompile reasons, use `TORCH_LOGS="compiled_autograd_verbose" python example.py` -# -# Logs can also be enabled through the private API torch._logging._internal.set_logs. +# Run the script with the TORCH_LOGS environment variables: +# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` +# - To print the graph with more tensor medata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` # @torch.compile @@ -92,13 +97,11 @@ def train(model, x): train(model, x) ###################################################################### -# The compiled autograd graph should now be logged to stdout. Certain graph nodes will have names that are prefixed by aot0_, -# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0. -# -# NOTE: This is the graph that we will call torch.compile on, NOT the optimized graph. 
Compiled Autograd basically -# generated some python code to represent the entire C++ autograd execution. +# The compiled autograd graph should now be logged to stderr. Certain graph nodes will have names that are prefixed by ``aot0_``, +# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0 e.g. ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. # -""" + +stderr_output = """ DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH ===== Compiled autograd graph ===== @@ -152,6 +155,10 @@ def forward(self, inputs, sizes, scalars, hooks): return [] """ +###################################################################### +# .. note:: This is the graph that we will call torch.compile on, NOT the optimized graph. Compiled Autograd generates some python code to represent the entire C++ autograd execution. +# + ###################################################################### # Compiling the forward and backward pass using different flags # ------------ @@ -163,7 +170,7 @@ def train(model, x): torch.compile(lambda: loss.backward(), fullgraph=True)() ###################################################################### -# Or you can use the context manager, which will apply to all autograd calls within it +# Or you can use the context manager, which will apply to all autograd calls within its scope. # def train(model, x): @@ -174,7 +181,7 @@ def train(model, x): ###################################################################### -# Demonstrating the limitations of AOTAutograd addressed by Compiled Autograd +# Compiled Autograd addresses certain limitations of AOTAutograd # ------------ # 1. Graph breaks in the forward lead to graph breaks in the backward # @@ -208,7 +215,12 @@ def fn(x): ###################################################################### -# 2. `Backward hooks are not captured +# In the ``1. base torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. +# Whereas in ``2. torch.compile with compiled autograd``, we see that a full backward graph was traced despite the graph breaks. +# + +###################################################################### +# 2. Backward hooks are not captured # @torch.compile(backend="aot_eager") @@ -223,19 +235,19 @@ def fn(x): loss.backward() ###################################################################### -# There is a `call_hook` node in the graph, which dynamo will inline +# There should be a ``call_hook`` node in the graph, which dynamo will later inline into # -""" +stderr_output = """ DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH - ===== Compiled autograd graph ===== - .2 class CompiledAutograd(torch.nn.Module): - def forward(self, inputs, sizes, scalars, hooks): - ... - getitem_2 = hooks[0]; hooks = None - call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None - ... 
+===== Compiled autograd graph ===== +.2 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + ... + getitem_2 = hooks[0]; hooks = None + call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None + ... """ ###################################################################### @@ -250,10 +262,10 @@ def forward(self, inputs, sizes, scalars, hooks): torch.compile(lambda: loss.backward(), backend="eager")() ###################################################################### -# You should see some cache miss logs (recompiles): +# You should see some recompile messages: **Cache miss due to new autograd node**. # -""" +stderr_output = """ Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] ... Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] @@ -268,7 +280,6 @@ def forward(self, inputs, sizes, scalars, hooks): # 2. Due to dynamic shapes # -torch._logging._internal.set_logs(compiled_autograd_verbose=True) torch._dynamo.config.compiled_autograd = True for i in [10, 100, 10]: x = torch.randn(i, i, requires_grad=True) @@ -276,10 +287,10 @@ def forward(self, inputs, sizes, scalars, hooks): torch.compile(lambda: loss.backward(), backend="eager")() ###################################################################### -# You should see some cache miss logs (recompiles): +# You should see some recompiles messages: **Cache miss due to changed shapes**. # -""" +stderr_output = """ ... Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic @@ -289,8 +300,9 @@ def forward(self, inputs, sizes, scalars, hooks): """ ###################################################################### -# Compatibility and rough edges -# ------------ -# -# Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. -# For the latest status on a particular feature, refer to: https://docs.google.com/document/d/11VucFBEewzqgkABIjebZIzMvrXr3BtcY1aGKpX61pJY. +# Conclusion +# ---------- +# In this tutorial, we went over the high-level ecosystem of torch.compile with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. +# +# For feedback on this tutorial, please file an issue on https://github.com/pytorch/tutorials. +# \ No newline at end of file From 50a69783a0283747af5154c4b65b4d636d72c891 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Wed, 4 Sep 2024 12:54:43 -0700 Subject: [PATCH 04/10] update --- intermediate_source/compiled_autograd_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index 4fd58e9743..932e930102 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -305,4 +305,4 @@ def forward(self, inputs, sizes, scalars, hooks): # In this tutorial, we went over the high-level ecosystem of torch.compile with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. # # For feedback on this tutorial, please file an issue on https://github.com/pytorch/tutorials. 
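To tie the pieces above together, here is a hedged end-to-end sketch of the basic usage inside a small training loop with an optimizer; the optimizer, learning rate, and number of steps are arbitrary choices for illustration.

.. code:: python

    import torch

    torch._dynamo.config.compiled_autograd = True

    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    @torch.compile
    def train_step(x):
        # forward is captured by Dynamo/AOTDispatcher, backward by Compiled Autograd
        loss = model(x).sum()
        loss.backward()

    for _ in range(3):
        optimizer.zero_grad()
        train_step(torch.randn(10))
        optimizer.step()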
-# \ No newline at end of file +# From 271b8f2cbd61b98d32e1ab5be6d7da06b8f86c36 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Fri, 6 Sep 2024 11:20:37 -0700 Subject: [PATCH 05/10] address comments --- .../compiled_autograd_tutorial.py | 80 ++++++++++--------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index 932e930102..ff66a0a0a0 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -11,37 +11,35 @@ .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn :class-card: card-prerequisites - * How compiled autograd interacts with torch.compile + * How compiled autograd interacts with ``torch.compile`` * How to use the compiled autograd API - * How to inspect logs using TORCH_LOGS + * How to inspect logs using ``TORCH_LOGS`` .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites :class-card: card-prerequisites * PyTorch 2.4 - * `torch.compile `_ familiarity + * Complete the `Introduction to torch.compile `_ """ ###################################################################### # Overview # ------------ -# Compiled Autograd is a torch.compile extension introduced in PyTorch 2.4 +# Compiled Autograd is a ``torch.compile`` extension introduced in PyTorch 2.4 # that allows the capture of a larger backward graph. # -# Doesn't torch.compile already capture the backward graph? -# ------------ -# And it does, **partially**. AOTAutograd captures the backward graph ahead-of-time, but with certain limitations: -# 1. Graph breaks in the forward lead to graph breaks in the backward -# 2. `Backward hooks `_ are not captured +# While ``torch.compile`` does capture the backward graph, it does so **partially**. The AOTAutograd component captures the backward graph ahead-of-time, with certain limitations: +# * Graph breaks in the forward lead to graph breaks in the backward +# * `Backward hooks `_ are not captured # # Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing # it to capture the full backward graph at runtime. Models with these two characteristics should try # Compiled Autograd, and potentially observe better performance. # -# However, Compiled Autograd has its own limitations: -# 1. Additional runtime overhead at the start of the backward -# 2. Dynamic autograd structure leads to recompiles +# However, Compiled Autograd introduces its own limitations: +# * Added runtime overhead at the start of the backward for cache lookup +# * More prone to recompiles and graph breaks in dynamo due to the larger capture # # .. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. # @@ -50,8 +48,9 @@ ###################################################################### # Setup # ------------ -# In this tutorial, we'll base our examples on this toy model. -# +# In this tutorial, we will base our examples on this simple neural network model. +# It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. +# import torch @@ -67,7 +66,7 @@ def forward(self, x): ###################################################################### # Basic usage # ------------ -# .. 
note:: The ``torch._dynamo.config.compiled_autograd = True`` config must be enabled before calling the torch.compile API. +# Before calling the torch.compile API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: # model = Model() @@ -82,23 +81,30 @@ def train(model, x): train(model, x) ###################################################################### -# Inspecting the compiled autograd logs -# ------------ -# Run the script with the TORCH_LOGS environment variables: -# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` -# - To print the graph with more tensor medata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` +# In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using torch.randn(10). +# We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. +# +# When ``train(model, x)`` is called: +# * Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile`` +# * Dynamo intercepts the python bytecode, simulates their execution and records the operations into a graph +# * AOTDispatcher disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. +# * Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward +# * Dynamo sets the optimized function to be evaluated next by Python Interpreter +# * Python Interpreter executes the optimized function, which basically executes ``loss = model(x).sum()`` +# * Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we enabled the config: ``torch._dynamo.config.compiled_autograd = True`` +# * Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode +# * The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher does not need to partition this graph into a forward and backward # - -@torch.compile -def train(model, x): - loss = model(x).sum() - loss.backward() - -train(model, x) ###################################################################### -# The compiled autograd graph should now be logged to stderr. Certain graph nodes will have names that are prefixed by ``aot0_``, -# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0 e.g. ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. 
+# Inspecting the compiled autograd logs +# ------------------------------------- +# Run the script with the ``TORCH_LOGS`` environment variables: +# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` +# - To print the graph with more tensor metadata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` +# +# Rerun the snippet above, the compiled autograd graph should now be logged to ``stderr``. Certain graph nodes will have names that are prefixed by ``aot0_``, +# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0, for example, ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. # stderr_output = """ @@ -156,17 +162,19 @@ def forward(self, inputs, sizes, scalars, hooks): """ ###################################################################### -# .. note:: This is the graph that we will call torch.compile on, NOT the optimized graph. Compiled Autograd generates some python code to represent the entire C++ autograd execution. +# .. note:: This is the graph on which we will call ``torch.compile``, **NOT** the optimized graph. Compiled Autograd essentially generates some unoptimized Python code to represent the entire C++ autograd execution. # ###################################################################### # Compiling the forward and backward pass using different flags -# ------------ -# +# ------------------------------------------------------------- +# You can use different compiler configs for the two compilations, for example, the backward may be a fullgraph even if there are graph breaks in the forward. +# def train(model, x): model = torch.compile(model) loss = model(x).sum() + torch._dynamo.config.compiled_autograd = True torch.compile(lambda: loss.backward(), fullgraph=True)() ###################################################################### @@ -182,7 +190,7 @@ def train(model, x): ###################################################################### # Compiled Autograd addresses certain limitations of AOTAutograd -# ------------ +# -------------------------------------------------------------- # 1. Graph breaks in the forward lead to graph breaks in the backward # @@ -252,7 +260,7 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # Common recompilation reasons for Compiled Autograd -# ------------ +# -------------------------------------------------- # 1. Due to change in autograd structure torch._dynamo.config.compiled_autograd = True @@ -302,7 +310,5 @@ def forward(self, inputs, sizes, scalars, hooks): ###################################################################### # Conclusion # ---------- -# In this tutorial, we went over the high-level ecosystem of torch.compile with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. -# -# For feedback on this tutorial, please file an issue on https://github.com/pytorch/tutorials. +# In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. 
# From 64c923e63968eeb0a299b0aae79c41581213d37b Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Fri, 6 Sep 2024 16:15:43 -0700 Subject: [PATCH 06/10] try to fix build --- intermediate_source/compiled_autograd_tutorial.py | 1 - 1 file changed, 1 deletion(-) diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py index ff66a0a0a0..c1e4f2a538 100644 --- a/intermediate_source/compiled_autograd_tutorial.py +++ b/intermediate_source/compiled_autograd_tutorial.py @@ -50,7 +50,6 @@ # ------------ # In this tutorial, we will base our examples on this simple neural network model. # It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. -# import torch From a4326ebac8c8d1c1f232dad3bf272cd06587cb24 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Mon, 9 Sep 2024 09:55:59 -0700 Subject: [PATCH 07/10] convert to .rst --- .../compiled_autograd_tutorial.py | 313 ------------------ .../compiled_autograd_tutorial.rst | 301 +++++++++++++++++ 2 files changed, 301 insertions(+), 313 deletions(-) delete mode 100644 intermediate_source/compiled_autograd_tutorial.py create mode 100644 intermediate_source/compiled_autograd_tutorial.rst diff --git a/intermediate_source/compiled_autograd_tutorial.py b/intermediate_source/compiled_autograd_tutorial.py deleted file mode 100644 index c1e4f2a538..0000000000 --- a/intermediate_source/compiled_autograd_tutorial.py +++ /dev/null @@ -1,313 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` -========================================================================== - -**Author:** `Simon Fan `_ - -.. grid:: 2 - - .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn - :class-card: card-prerequisites - - * How compiled autograd interacts with ``torch.compile`` - * How to use the compiled autograd API - * How to inspect logs using ``TORCH_LOGS`` - - .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites - :class-card: card-prerequisites - - * PyTorch 2.4 - * Complete the `Introduction to torch.compile `_ - -""" - -###################################################################### -# Overview -# ------------ -# Compiled Autograd is a ``torch.compile`` extension introduced in PyTorch 2.4 -# that allows the capture of a larger backward graph. -# -# While ``torch.compile`` does capture the backward graph, it does so **partially**. The AOTAutograd component captures the backward graph ahead-of-time, with certain limitations: -# * Graph breaks in the forward lead to graph breaks in the backward -# * `Backward hooks `_ are not captured -# -# Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing -# it to capture the full backward graph at runtime. Models with these two characteristics should try -# Compiled Autograd, and potentially observe better performance. -# -# However, Compiled Autograd introduces its own limitations: -# * Added runtime overhead at the start of the backward for cache lookup -# * More prone to recompiles and graph breaks in dynamo due to the larger capture -# -# .. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. 
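Since hook capture is one of the motivations listed in the overview above, here is a hedged sketch of a tensor hook registered on a parameter, using the same ``register_hook`` mechanism demonstrated elsewhere in this tutorial, which Compiled Autograd can record into its backward graph; the clamping range is an arbitrary illustration.

.. code:: python

    import torch

    torch._dynamo.config.compiled_autograd = True

    model = torch.nn.Linear(10, 10)
    # tensor hook on a parameter, e.g. clamping its gradient
    model.weight.register_hook(lambda grad: grad.clamp(-1.0, 1.0))

    @torch.compile
    def train(x):
        loss = model(x).sum()
        loss.backward()

    train(torch.randn(10))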
-# - - -###################################################################### -# Setup -# ------------ -# In this tutorial, we will base our examples on this simple neural network model. -# It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. - -import torch - -class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(10, 10) - - def forward(self, x): - return self.linear(x) - - -###################################################################### -# Basic usage -# ------------ -# Before calling the torch.compile API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: -# - -model = Model() -x = torch.randn(10) - -torch._dynamo.config.compiled_autograd = True -@torch.compile -def train(model, x): - loss = model(x).sum() - loss.backward() - -train(model, x) - -###################################################################### -# In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using torch.randn(10). -# We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. -# -# When ``train(model, x)`` is called: -# * Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile`` -# * Dynamo intercepts the python bytecode, simulates their execution and records the operations into a graph -# * AOTDispatcher disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. -# * Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward -# * Dynamo sets the optimized function to be evaluated next by Python Interpreter -# * Python Interpreter executes the optimized function, which basically executes ``loss = model(x).sum()`` -# * Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we enabled the config: ``torch._dynamo.config.compiled_autograd = True`` -# * Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode -# * The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher does not need to partition this graph into a forward and backward -# - -###################################################################### -# Inspecting the compiled autograd logs -# ------------------------------------- -# Run the script with the ``TORCH_LOGS`` environment variables: -# - To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` -# - To print the graph with more tensor metadata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` -# -# Rerun the snippet above, the compiled autograd graph should now be logged to ``stderr``. 
Certain graph nodes will have names that are prefixed by ``aot0_``, -# these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0, for example, ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. -# - -stderr_output = """ -DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] -DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH - ===== Compiled autograd graph ===== - .4 class CompiledAutograd(torch.nn.Module): - def forward(self, inputs, sizes, scalars, hooks): - # No stacktrace found for following nodes - aot0_tangents_1: "f32[][]cpu" = inputs[0] - aot0_primals_3: "f32[10][1]cpu" = inputs[1] - getitem_2: "f32[10][1]cpu" = inputs[2] - getitem_3: "f32[10, 10][10, 1]cpu" = inputs[3]; inputs = None - - # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) - aot0_expand: "f32[10][0]cpu" = torch.ops.aten.expand.default(aot0_tangents_1, [10]); aot0_tangents_1 = None - aot0_view_2: "f32[1, 10][0, 0]cpu" = torch.ops.aten.view.default(aot0_expand, [1, 10]); aot0_expand = None - aot0_permute_2: "f32[10, 1][0, 0]cpu" = torch.ops.aten.permute.default(aot0_view_2, [1, 0]) - aot0_select: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 0) - aot0_view: "f32[1, 10][10, 1]cpu" = torch.ops.aten.view.default(aot0_primals_3, [1, 10]); aot0_primals_3 = None - aot0_mul_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select, aot0_view); aot0_select = None - aot0_select_1: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 1) - aot0_mul_4: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_1, aot0_view); aot0_select_1 = None - aot0_select_2: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 2) - aot0_mul_5: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_2, aot0_view); aot0_select_2 = None - aot0_select_3: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 3) - aot0_mul_6: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_3, aot0_view); aot0_select_3 = None - aot0_select_4: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 4) - aot0_mul_7: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_4, aot0_view); aot0_select_4 = None - aot0_select_5: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 5) - aot0_mul_8: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_5, aot0_view); aot0_select_5 = None - aot0_select_6: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 6) - aot0_mul_9: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_6, aot0_view); aot0_select_6 = None - aot0_select_7: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 7) - aot0_mul_10: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_7, aot0_view); aot0_select_7 = None - aot0_select_8: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 8) - aot0_mul_11: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_8, aot0_view); aot0_select_8 = None - aot0_select_9: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 9); aot0_permute_2 = None - aot0_mul_12: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_9, aot0_view); aot0_select_9 = aot0_view = None - aot0_cat: "f32[10, 10][10, 1]cpu" = 
torch.ops.aten.cat.default([aot0_mul_3, aot0_mul_4, aot0_mul_5, aot0_mul_6, aot0_mul_7, aot0_mul_8, aot0_mul_9, aot0_mul_10, aot0_mul_11, aot0_mul_12]); aot0_mul_3 = aot0_mul_4 = aot0_mul_5 = aot0_mul_6 = aot0_mul_7 = aot0_mul_8 = aot0_mul_9 = aot0_mul_10 = aot0_mul_11 = aot0_mul_12 = None - aot0_permute_3: "f32[10, 10][1, 10]cpu" = torch.ops.aten.permute.default(aot0_cat, [1, 0]); aot0_cat = None - aot0_sum_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.sum.dim_IntList(aot0_view_2, [0], True); aot0_view_2 = None - aot0_view_3: "f32[10][1]cpu" = torch.ops.aten.view.default(aot0_sum_3, [10]); aot0_sum_3 = None - - # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 2) - accumulate_grad_ = torch.ops.inductor.accumulate_grad_.default(getitem_2, aot0_view_3); getitem_2 = aot0_view_3 = accumulate_grad_ = None - - # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) - aot0_permute_4: "f32[10, 10][10, 1]cpu" = torch.ops.aten.permute.default(aot0_permute_3, [1, 0]); aot0_permute_3 = None - - # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 3) - accumulate_grad__1 = torch.ops.inductor.accumulate_grad_.default(getitem_3, aot0_permute_4); getitem_3 = aot0_permute_4 = accumulate_grad__1 = None - _exec_final_callbacks_stub = torch__dynamo_external_utils__exec_final_callbacks_stub(); _exec_final_callbacks_stub = None - return [] -""" - -###################################################################### -# .. note:: This is the graph on which we will call ``torch.compile``, **NOT** the optimized graph. Compiled Autograd essentially generates some unoptimized Python code to represent the entire C++ autograd execution. -# - -###################################################################### -# Compiling the forward and backward pass using different flags -# ------------------------------------------------------------- -# You can use different compiler configs for the two compilations, for example, the backward may be a fullgraph even if there are graph breaks in the forward. -# - -def train(model, x): - model = torch.compile(model) - loss = model(x).sum() - torch._dynamo.config.compiled_autograd = True - torch.compile(lambda: loss.backward(), fullgraph=True)() - -###################################################################### -# Or you can use the context manager, which will apply to all autograd calls within its scope. -# - -def train(model, x): - model = torch.compile(model) - loss = model(x).sum() - with torch._dynamo.compiled_autograd.enable(torch.compile(fullgraph=True)): - loss.backward() - - -###################################################################### -# Compiled Autograd addresses certain limitations of AOTAutograd -# -------------------------------------------------------------- -# 1. Graph breaks in the forward lead to graph breaks in the backward -# - -@torch.compile(backend="aot_eager") -def fn(x): - # 1st graph - temp = x + 10 - torch._dynamo.graph_break() - # 2nd graph - temp = temp + 10 - torch._dynamo.graph_break() - # 3rd graph - return temp.sum() - -x = torch.randn(10, 10, requires_grad=True) -torch._dynamo.utils.counters.clear() -loss = fn(x) - -# 1. 
base torch.compile -loss.backward(retain_graph=True) -assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 3) -torch._dynamo.utils.counters.clear() - -# 2. torch.compile with compiled autograd -with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): - loss.backward() - -# single graph for the backward -assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) - - -###################################################################### -# In the ``1. base torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. -# Whereas in ``2. torch.compile with compiled autograd``, we see that a full backward graph was traced despite the graph breaks. -# - -###################################################################### -# 2. Backward hooks are not captured -# - -@torch.compile(backend="aot_eager") -def fn(x): - return x.sum() - -x = torch.randn(10, 10, requires_grad=True) -x.register_hook(lambda grad: grad+10) -loss = fn(x) - -with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): - loss.backward() - -###################################################################### -# There should be a ``call_hook`` node in the graph, which dynamo will later inline into -# - -stderr_output = """ -DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] -DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH -===== Compiled autograd graph ===== -.2 class CompiledAutograd(torch.nn.Module): - def forward(self, inputs, sizes, scalars, hooks): - ... - getitem_2 = hooks[0]; hooks = None - call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None - ... -""" - -###################################################################### -# Common recompilation reasons for Compiled Autograd -# -------------------------------------------------- -# 1. Due to change in autograd structure - -torch._dynamo.config.compiled_autograd = True -x = torch.randn(10, requires_grad=True) -for op in [torch.add, torch.sub, torch.mul, torch.div]: - loss = op(x, x).sum() - torch.compile(lambda: loss.backward(), backend="eager")() - -###################################################################### -# You should see some recompile messages: **Cache miss due to new autograd node**. -# - -stderr_output = """ -Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] -... -Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] -... -Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] -... -Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] -... -""" - -###################################################################### -# 2. Due to dynamic shapes -# - -torch._dynamo.config.compiled_autograd = True -for i in [10, 100, 10]: - x = torch.randn(i, i, requires_grad=True) - loss = x.sum() - torch.compile(lambda: loss.backward(), backend="eager")() - -###################################################################### -# You should see some recompiles messages: **Cache miss due to changed shapes**. -# - -stderr_output = """ -... 
-Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic -Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic -... -""" - -###################################################################### -# Conclusion -# ---------- -# In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. -# diff --git a/intermediate_source/compiled_autograd_tutorial.rst b/intermediate_source/compiled_autograd_tutorial.rst new file mode 100644 index 0000000000..22bc9904cb --- /dev/null +++ b/intermediate_source/compiled_autograd_tutorial.rst @@ -0,0 +1,301 @@ +Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` +========================================================================== +**Author:** `Simon Fan `_ + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * How compiled autograd interacts with ``torch.compile`` + * How to use the compiled autograd API + * How to inspect logs using ``TORCH_LOGS`` + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * PyTorch 2.4 + * Complete the `Introduction to torch.compile `_ + +Overview +-------- +Compiled Autograd is a ``torch.compile`` extension introduced in PyTorch 2.4 +that allows the capture of a larger backward graph. + +While ``torch.compile`` does capture the backward graph, it does so **partially**. The AOTAutograd component captures the backward graph ahead-of-time, with certain limitations: + +* Graph breaks in the forward lead to graph breaks in the backward +* `Backward hooks `_ are not captured + +Compiled Autograd addresses these limitations by directly integrating with the autograd engine, allowing +it to capture the full backward graph at runtime. Models with these two characteristics should try +Compiled Autograd, and potentially observe better performance. + +However, Compiled Autograd introduces its own limitations: + +* Added runtime overhead at the start of the backward for cache lookup +* More prone to recompiles and graph breaks in dynamo due to the larger capture + +.. note:: Compiled Autograd is under active development and is not yet compatible with all existing PyTorch features. For the latest status on a particular feature, refer to `Compiled Autograd Landing Page `_. + +Setup +----- +In this tutorial, we will base our examples on this simple neural network model. +It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. + +.. code:: python + + import torch + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(10, 10) + + def forward(self, x): + return self.linear(x) + +Basic usage +------------ +Before calling the torch.compile API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: + +.. 
code:: python + + model = Model() + x = torch.randn(10) + + torch._dynamo.config.compiled_autograd = True + @torch.compile + def train(model, x): + loss = model(x).sum() + loss.backward() + + train(model, x) + +In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using torch.randn(10). +We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. +When ``train(model, x)`` is called: + +* Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile`` +* Dynamo intercepts the python bytecode, simulates their execution and records the operations into a graph +* AOTDispatcher disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. +* Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward +* Dynamo sets the optimized function to be evaluated next by Python Interpreter +* Python Interpreter executes the optimized function, which basically executes ``loss = model(x).sum()`` +* Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we enabled the config: ``torch._dynamo.config.compiled_autograd = True`` +* Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode +* The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher does not need to partition this graph into a forward and backward + +Inspecting the compiled autograd logs +------------------------------------- +Run the script with the ``TORCH_LOGS`` environment variables: + +* To only print the compiled autograd graph, use ``TORCH_LOGS="compiled_autograd" python example.py`` +* To print the graph with more tensor metadata and recompile reasons, at the cost of performance, use ``TORCH_LOGS="compiled_autograd_verbose" python example.py`` + +Rerun the snippet above, the compiled autograd graph should now be logged to ``stderr``. Certain graph nodes will have names that are prefixed by ``aot0_``, +these correspond to the nodes previously compiled ahead of time in AOTAutograd backward graph 0, for example, ``aot0_view_2`` corresponds to ``view_2`` of the AOT backward graph with id=0. + + +.. 
code:: python + + stderr_output = """ + DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] + DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH + ===== Compiled autograd graph ===== + .4 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + # No stacktrace found for following nodes + aot0_tangents_1: "f32[][]cpu" = inputs[0] + aot0_primals_3: "f32[10][1]cpu" = inputs[1] + getitem_2: "f32[10][1]cpu" = inputs[2] + getitem_3: "f32[10, 10][10, 1]cpu" = inputs[3]; inputs = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) + aot0_expand: "f32[10][0]cpu" = torch.ops.aten.expand.default(aot0_tangents_1, [10]); aot0_tangents_1 = None + aot0_view_2: "f32[1, 10][0, 0]cpu" = torch.ops.aten.view.default(aot0_expand, [1, 10]); aot0_expand = None + aot0_permute_2: "f32[10, 1][0, 0]cpu" = torch.ops.aten.permute.default(aot0_view_2, [1, 0]) + aot0_select: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 0) + aot0_view: "f32[1, 10][10, 1]cpu" = torch.ops.aten.view.default(aot0_primals_3, [1, 10]); aot0_primals_3 = None + aot0_mul_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select, aot0_view); aot0_select = None + aot0_select_1: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 1) + aot0_mul_4: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_1, aot0_view); aot0_select_1 = None + aot0_select_2: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 2) + aot0_mul_5: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_2, aot0_view); aot0_select_2 = None + aot0_select_3: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 3) + aot0_mul_6: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_3, aot0_view); aot0_select_3 = None + aot0_select_4: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 4) + aot0_mul_7: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_4, aot0_view); aot0_select_4 = None + aot0_select_5: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 5) + aot0_mul_8: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_5, aot0_view); aot0_select_5 = None + aot0_select_6: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 6) + aot0_mul_9: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_6, aot0_view); aot0_select_6 = None + aot0_select_7: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 7) + aot0_mul_10: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_7, aot0_view); aot0_select_7 = None + aot0_select_8: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 8) + aot0_mul_11: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_8, aot0_view); aot0_select_8 = None + aot0_select_9: "f32[1][0]cpu" = torch.ops.aten.select.int(aot0_permute_2, 0, 9); aot0_permute_2 = None + aot0_mul_12: "f32[1, 10][10, 1]cpu" = torch.ops.aten.mul.Tensor(aot0_select_9, aot0_view); aot0_select_9 = aot0_view = None + aot0_cat: "f32[10, 10][10, 1]cpu" = torch.ops.aten.cat.default([aot0_mul_3, aot0_mul_4, aot0_mul_5, aot0_mul_6, aot0_mul_7, aot0_mul_8, aot0_mul_9, aot0_mul_10, aot0_mul_11, aot0_mul_12]); aot0_mul_3 = aot0_mul_4 = aot0_mul_5 = aot0_mul_6 = aot0_mul_7 = aot0_mul_8 = aot0_mul_9 = aot0_mul_10 = aot0_mul_11 = aot0_mul_12 
= None + aot0_permute_3: "f32[10, 10][1, 10]cpu" = torch.ops.aten.permute.default(aot0_cat, [1, 0]); aot0_cat = None + aot0_sum_3: "f32[1, 10][10, 1]cpu" = torch.ops.aten.sum.dim_IntList(aot0_view_2, [0], True); aot0_view_2 = None + aot0_view_3: "f32[10][1]cpu" = torch.ops.aten.view.default(aot0_sum_3, [10]); aot0_sum_3 = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 2) + accumulate_grad_ = torch.ops.inductor.accumulate_grad_.default(getitem_2, aot0_view_3); getitem_2 = aot0_view_3 = accumulate_grad_ = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: CompiledFunctionBackward0 (NodeCall 1) + aot0_permute_4: "f32[10, 10][10, 1]cpu" = torch.ops.aten.permute.default(aot0_permute_3, [1, 0]); aot0_permute_3 = None + + # File: /data/users/xmfan/a/pytorch/torch/_dynamo/compiled_autograd.py:483 in set_node_origin, code: torch::autograd::AccumulateGrad (NodeCall 3) + accumulate_grad__1 = torch.ops.inductor.accumulate_grad_.default(getitem_3, aot0_permute_4); getitem_3 = aot0_permute_4 = accumulate_grad__1 = None + _exec_final_callbacks_stub = torch__dynamo_external_utils__exec_final_callbacks_stub(); _exec_final_callbacks_stub = None + return [] + """ + +.. note:: This is the graph on which we will call ``torch.compile``, **NOT** the optimized graph. Compiled Autograd essentially generates some unoptimized Python code to represent the entire C++ autograd execution. + +Compiling the forward and backward pass using different flags +------------------------------------------------------------- +You can use different compiler configs for the two compilations, for example, the backward may be a fullgraph even if there are graph breaks in the forward. + +.. code:: python + +def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + torch._dynamo.config.compiled_autograd = True + torch.compile(lambda: loss.backward(), fullgraph=True)() + +Or you can use the context manager, which will apply to all autograd calls within its scope. + +.. code:: python + + def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + with torch._dynamo.compiled_autograd.enable(torch.compile(fullgraph=True)): + loss.backward() + + +Compiled Autograd addresses certain limitations of AOTAutograd +-------------------------------------------------------------- +1. Graph breaks in the forward lead to graph breaks in the backward + +.. code:: python + + @torch.compile(backend="aot_eager") + def fn(x): + # 1st graph + temp = x + 10 + torch._dynamo.graph_break() + # 2nd graph + temp = temp + 10 + torch._dynamo.graph_break() + # 3rd graph + return temp.sum() + + x = torch.randn(10, 10, requires_grad=True) + torch._dynamo.utils.counters.clear() + loss = fn(x) + + # 1. base torch.compile + loss.backward(retain_graph=True) + assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 3) + torch._dynamo.utils.counters.clear() + + # 2. torch.compile with compiled autograd + with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + + # single graph for the backward + assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) + + +In the ``1. base torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. +Whereas in ``2. 
torch.compile with compiled autograd``, we see that a full backward graph was traced despite the graph breaks. + +2. Backward hooks are not captured + +.. code:: python + + @torch.compile(backend="aot_eager") + def fn(x): + return x.sum() + + x = torch.randn(10, 10, requires_grad=True) + x.register_hook(lambda grad: grad+10) + loss = fn(x) + + with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): + loss.backward() + +There should be a ``call_hook`` node in the graph, which dynamo will later inline into + +.. code:: python + + stderr_output = """ + DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] + DEBUG:torch._dynamo.compiled_autograd.__compiled_autograd_verbose:TRACED GRAPH + ===== Compiled autograd graph ===== + .2 class CompiledAutograd(torch.nn.Module): + def forward(self, inputs, sizes, scalars, hooks): + ... + getitem_2 = hooks[0]; hooks = None + call_hook: "f32[10, 10][0, 0]cpu" = torch__dynamo_external_utils_call_hook(getitem_2, aot0_expand, hook_type = 'tensor_pre_hook'); getitem_2 = aot0_expand = None + ... + """ + +Common recompilation reasons for Compiled Autograd +-------------------------------------------------- +1. Due to change in autograd structure + +.. code:: python + + torch._dynamo.config.compiled_autograd = True + x = torch.randn(10, requires_grad=True) + for op in [torch.add, torch.sub, torch.mul, torch.div]: + loss = op(x, x).sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +You should see some recompile messages: **Cache miss due to new autograd node**. + +.. code:: python + + stderr_output = """ + Cache miss due to new autograd node: torch::autograd::GraphRoot (NodeCall 0) with key size 39, previous key sizes=[] + ... + Cache miss due to new autograd node: SubBackward0 (NodeCall 2) with key size 56, previous key sizes=[] + ... + Cache miss due to new autograd node: MulBackward0 (NodeCall 2) with key size 71, previous key sizes=[] + ... + Cache miss due to new autograd node: DivBackward0 (NodeCall 2) with key size 70, previous key sizes=[] + ... + """ + +2. Due to dynamic shapes + +.. code:: python + + torch._dynamo.config.compiled_autograd = True + for i in [10, 100, 10]: + x = torch.randn(i, i, requires_grad=True) + loss = x.sum() + torch.compile(lambda: loss.backward(), backend="eager")() + +You should see some recompiles messages: **Cache miss due to changed shapes**. + +.. code:: python + + stderr_output = """ + ... + Cache miss due to changed shapes: marking size idx 0 of torch::autograd::GraphRoot (NodeCall 0) as dynamic + Cache miss due to changed shapes: marking size idx 1 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic + Cache miss due to changed shapes: marking size idx 2 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic + Cache miss due to changed shapes: marking size idx 3 of torch::autograd::AccumulateGrad (NodeCall 2) as dynamic + ... + """ + +Conclusion +---------- +In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. 
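For readers who want to see the pieces of the tutorial above in one place, below is a minimal end-to-end training-loop sketch. It reuses the toy ``Model`` from the Setup section; the optimizer choice, learning rate, and number of iterations are illustrative assumptions rather than requirements, and the backward is captured through the same ``torch._dynamo.compiled_autograd.enable`` context manager shown earlier.

.. code:: python

   import torch

   class Model(torch.nn.Module):
       def __init__(self):
           super().__init__()
           self.linear = torch.nn.Linear(10, 10)

       def forward(self, x):
           return self.linear(x)

   model = Model()
   # Illustrative optimizer and learning rate; any optimizer fits the same pattern.
   optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

   # Compile the forward pass; the backward pass is captured separately below.
   compiled_model = torch.compile(model)

   # Compiler used for the backward graph captured by Compiled Autograd.
   backward_compiler = torch.compile(backend="aot_eager")

   for _ in range(3):  # illustrative number of steps
       x = torch.randn(10)
       loss = compiled_model(x).sum()
       # Capture and compile the full backward graph at runtime.
       with torch._dynamo.compiled_autograd.enable(backward_compiler):
           loss.backward()
       optimizer.step()
       optimizer.zero_grad()

Keeping the context manager scoped to ``loss.backward()`` leaves the forward compilation flags independent of the backward ones, mirroring the earlier section on compiling the two passes with different flags.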
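As a complement to the backward-hook example above, the following sketch numerically checks that a tensor hook still takes effect when the backward is captured by Compiled Autograd. The expected gradient values are worked out in the comments and are specific to this toy function, not a general property of the API.

.. code:: python

   import torch

   @torch.compile(backend="aot_eager")
   def fn(x):
       return x.sum()

   x = torch.randn(10, 10, requires_grad=True)
   # For loss = x.sum(), d(loss)/dx is a tensor of ones; the hook shifts it by 10.
   x.register_hook(lambda grad: grad + 10)
   loss = fn(x)

   with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")):
       loss.backward()

   # The hook ran inside the captured backward, so every entry should be 11.
   assert torch.allclose(x.grad, torch.full((10, 10), 11.0))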
From 94d061203e50bf447a6c1564528436fb061840da Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Fri, 13 Sep 2024 14:48:58 -0700 Subject: [PATCH 08/10] address comments --- .../compiled_autograd_tutorial.rst | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.rst b/intermediate_source/compiled_autograd_tutorial.rst index 22bc9904cb..bf2de77f1b 100644 --- a/intermediate_source/compiled_autograd_tutorial.rst +++ b/intermediate_source/compiled_autograd_tutorial.rst @@ -16,6 +16,7 @@ Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` * PyTorch 2.4 * Complete the `Introduction to torch.compile `_ + * Read through the TorchDynamo and AOTAutograd sections of `Get Started with PyTorch 2.x `_ Overview -------- @@ -41,7 +42,7 @@ However, Compiled Autograd introduces its own limitations: Setup ----- In this tutorial, we will base our examples on this simple neural network model. -It takes a a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. +It takes a 10-dimensional input vector, processes it through a single linear layer, and outputs another 10-dimensional vector. .. code:: python @@ -57,7 +58,7 @@ It takes a a 10-dimensional input vector, processes it through a single linear l Basic usage ------------ -Before calling the torch.compile API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: +Before calling the ``torch.compile`` API, make sure to set ``torch._dynamo.config.compiled_autograd`` to ``True``: .. code:: python @@ -72,19 +73,19 @@ Before calling the torch.compile API, make sure to set ``torch._dynamo.config.co train(model, x) -In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using torch.randn(10). +In the code above, we create an instance of the ``Model`` class and generate a random 10-dimensional tensor ``x`` by using ``torch.randn(10)``. We define the training loop function ``train`` and decorate it with @torch.compile to optimize its execution. When ``train(model, x)`` is called: -* Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile`` -* Dynamo intercepts the python bytecode, simulates their execution and records the operations into a graph -* AOTDispatcher disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. -* Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward -* Dynamo sets the optimized function to be evaluated next by Python Interpreter -* Python Interpreter executes the optimized function, which basically executes ``loss = model(x).sum()`` -* Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we enabled the config: ``torch._dynamo.config.compiled_autograd = True`` -* Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this, it will record the backward previously rewritten by AOTDispatcher. 
Compiled Autograd then generates a new function which corresponds to a fully traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode -* The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher does not need to partition this graph into a forward and backward +* Python Interpreter calls Dynamo, since this call was decorated with ``@torch.compile``. +* Dynamo intercepts the Python bytecode, simulates their execution and records the operations into a graph. +* ``AOTDispatcher`` disables hooks and calls the autograd engine to compute gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph. Using ``torch.autograd.Function``, AOTDispatcher rewrites the forward and backward implementation of ``train``. +* Inductor generates a function corresponding to an optimized implementation of the AOTDispatcher forward and backward. +* Dynamo sets the optimized function to be evaluated next by Python Interpreter. +* Python Interpreter executes the optimized function, which executes ``loss = model(x).sum()``. +* Python Interpreter executes ``loss.backward()``, calling into the autograd engine, which routes to the Compiled Autograd engine since we set ``torch._dynamo.config.compiled_autograd = True``. +* Compiled Autograd computes the gradients for ``model.linear.weight`` and ``model.linear.bias``, and records the operations into a graph, including any hooks it encounters. During this process, it will record the backward previously rewritten by AOTDispatcher. Compiled Autograd then generates a new function which corresponds to a fully-traced implementation of ``loss.backward()``, and executes it with ``torch.compile`` in inference mode. +* The same steps recursively apply to the Compiled Autograd graph, but this time AOTDispatcher will not need to partition the graph. Inspecting the compiled autograd logs ------------------------------------- @@ -180,7 +181,7 @@ Or you can use the context manager, which will apply to all autograd calls withi Compiled Autograd addresses certain limitations of AOTAutograd -------------------------------------------------------------- -1. Graph breaks in the forward lead to graph breaks in the backward +1. Graph breaks in the forward pass lead to graph breaks in the backward pass: .. code:: python @@ -248,7 +249,7 @@ There should be a ``call_hook`` node in the graph, which dynamo will later inlin Common recompilation reasons for Compiled Autograd -------------------------------------------------- -1. Due to change in autograd structure +1. Due to changes in the autograd structure of the loss value .. code:: python @@ -258,7 +259,7 @@ Common recompilation reasons for Compiled Autograd loss = op(x, x).sum() torch.compile(lambda: loss.backward(), backend="eager")() -You should see some recompile messages: **Cache miss due to new autograd node**. +In the example above, we call a different operator on each iteration, leading to ``loss`` tracking a different autograd history each time. You should see some recompile messages: **Cache miss due to new autograd node**. .. code:: python @@ -273,7 +274,7 @@ You should see some recompile messages: **Cache miss due to new autograd node**. ... """ -2. Due to dynamic shapes +2. Due to tensors changing shapes .. code:: python @@ -283,7 +284,7 @@ You should see some recompile messages: **Cache miss due to new autograd node**. 
loss = x.sum() torch.compile(lambda: loss.backward(), backend="eager")() -You should see some recompiles messages: **Cache miss due to changed shapes**. +In the example above, ``x`` changes shapes, and compiled autograd will mark ``x`` as a dynamic shape tensor after the first change. You should see recompiles messages: **Cache miss due to changed shapes**. .. code:: python @@ -298,4 +299,4 @@ You should see some recompiles messages: **Cache miss due to changed shapes**. Conclusion ---------- -In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. +In this tutorial, we went over the high-level ecosystem of ``torch.compile`` with compiled autograd, the basics of compiled autograd and a few common recompilation reasons. Stay tuned for deep dives on `dev-discuss `_. From 70c7434066d49c7a07004794588e8a96367b964e Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 9 Oct 2024 10:36:54 -0700 Subject: [PATCH 09/10] Add a card and toctree --- index.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/index.rst b/index.rst index 95c4a8f3ef..76a4d2e09d 100644 --- a/index.rst +++ b/index.rst @@ -439,6 +439,13 @@ Welcome to PyTorch Tutorials :link: advanced/python_custom_ops.html :tags: Extending-PyTorch,Frontend-APIs,C++,CUDA +.. customcarditem:: + :header: Compiled Autograd: Capturing a larger backward graph for ``torch.compile`` + :card_description: Learn how to use compiled autograd to capture a larger backward graph. + :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: intermediate/compiled_autograd_tutorial + :tags: Model-Optimization,CUDA + .. customcarditem:: :header: Custom C++ and CUDA Operators :card_description: How to extend PyTorch with custom C++ and CUDA operators. @@ -1132,6 +1139,7 @@ Additional Resources intermediate/nvfuser_intro_tutorial intermediate/ax_multiobjective_nas_tutorial intermediate/torch_compile_tutorial + intermediate/compiled_autograd_tutorial intermediate/inductor_debug_cpu intermediate/scaled_dot_product_attention_tutorial beginner/knowledge_distillation_tutorial From 83d4665d6731e77c4ad0eed8e3b6468ce1155ef6 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Wed, 9 Oct 2024 15:01:11 -0700 Subject: [PATCH 10/10] Minor editorial and formatting fixes --- .../compiled_autograd_tutorial.rst | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/intermediate_source/compiled_autograd_tutorial.rst b/intermediate_source/compiled_autograd_tutorial.rst index bf2de77f1b..bcae7e63da 100644 --- a/intermediate_source/compiled_autograd_tutorial.rst +++ b/intermediate_source/compiled_autograd_tutorial.rst @@ -162,11 +162,11 @@ You can use different compiler configs for the two compilations, for example, th .. code:: python -def train(model, x): - model = torch.compile(model) - loss = model(x).sum() - torch._dynamo.config.compiled_autograd = True - torch.compile(lambda: loss.backward(), fullgraph=True)() + def train(model, x): + model = torch.compile(model) + loss = model(x).sum() + torch._dynamo.config.compiled_autograd = True + torch.compile(lambda: loss.backward(), fullgraph=True)() Or you can use the context manager, which will apply to all autograd calls within its scope. @@ -213,8 +213,8 @@ Compiled Autograd addresses certain limitations of AOTAutograd assert(torch._dynamo.utils.counters["stats"]["unique_graphs"] == 1) -In the ``1. 
base torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. -Whereas in ``2. torch.compile with compiled autograd``, we see that a full backward graph was traced despite the graph breaks. +In the first ``torch.compile`` case, we see that 3 backward graphs were produced due to the 2 graph breaks in the compiled function ``fn``. +Whereas in the second ``torch.compile`` with compiled autograd case, we see that a full backward graph was traced despite the graph breaks. 2. Backward hooks are not captured @@ -231,7 +231,7 @@ Whereas in ``2. torch.compile with compiled autograd``, we see that a full backw with torch._dynamo.compiled_autograd.enable(torch.compile(backend="aot_eager")): loss.backward() -There should be a ``call_hook`` node in the graph, which dynamo will later inline into +There should be a ``call_hook`` node in the graph, which dynamo will later inline into the following: .. code:: python @@ -249,7 +249,7 @@ There should be a ``call_hook`` node in the graph, which dynamo will later inlin Common recompilation reasons for Compiled Autograd -------------------------------------------------- -1. Due to changes in the autograd structure of the loss value +1. Due to changes in the autograd structure of the loss value: .. code:: python @@ -274,7 +274,7 @@ In the example above, we call a different operator on each iteration, leading to ... """ -2. Due to tensors changing shapes +2. Due to tensors changing shapes: .. code:: python