Reland: [aotinductor] Replicate split_cat from torch IR to predispatch IR (pytorch#118590)

frank-wei · pytorchmergebot · commit 6fa162e68148 · 2024-01-31T00:09:46.000Z
Summary: This is part of the pass migration effort. The final target is removing the acc tracer in AOTI. In this diff, I did a few things:

1. Copy the `fx_passes/split_cat.py` passes and modify them to work on predispatch IR.
2. Verify correctness by copying `test_split_cat_fx_passes.py` into a new file, `test_split_cat_fx_passes_aten_fb.py`, which is executed in AOTI and checks the counters.
3. Create a util function that executes a pass and compares the before/after graphs to give the user more information, such as the pass's effect and the time spent. It produces logs like:

```
[2024-01-25 20:26:48,997] torch._inductor.utils: [INFO] [Pre grad(predispatch IR)]Apply split_cat, index: 0, save before/after graph to /tmp/tmpvlpwrklp, graph before/after are the same = False, time elapsed = 0:00:00.001585
[2024-01-25 20:26:49,000] torch._inductor.utils: [INFO] [Pre grad(predispatch IR)]Apply split_cat, index: 1, save before/after graph to /tmp/tmpz_onjfeu, graph before/after are the same = False, time elapsed = 0:00:00.001873
[2024-01-25 20:26:49,002] torch._inductor.utils: [INFO] [Pre grad(predispatch IR)]Apply split_cat, index: 2, save before/after graph to /tmp/tmpgkck8yko, graph before/after are the same = True, time elapsed = 0:00:00.000269
[2024-01-25 20:26:49,007] torch._inductor.utils: [INFO] [Pre grad(predispatch IR)]Apply split_cat, index: 3, save before/after graph to /tmp/tmpquenq06y, graph before/after are the same = False, time elapsed = 0:00:00.003621
[2024-01-25 20:26:49,009] torch._inductor.utils: [INFO] [Pre grad(predispatch IR)]Apply split_cat, index: 4, save before/after graph to /tmp/tmpi8fia0dv, graph before/after are the same = True, time elapsed = 0:00:00.000190
```

Differential Revision: D53171027
Pull Request resolved: pytorch#118590
Approved by: https://github.com/kflu, https://github.com/khabinov, https://github.com/chenyang78
diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py
@@ -3,6 +3,7 @@
 import os
 import sys
 import tempfile
+import types
 import unittest
 from typing import Dict, Tuple
 
@@ -78,7 +79,8 @@ def check_model(
         }
     ):
         torch.manual_seed(0)
-        model = model.to(self.device)
+        if not isinstance(model, types.FunctionType):
+            model = model.to(self.device)
         ref_model = copy.deepcopy(model)
         ref_inputs = copy.deepcopy(example_inputs)
         expected = ref_model(*ref_inputs)
diff --git a/torch/_inductor/fx_passes/pre_grad.py b/torch/_inductor/fx_passes/pre_grad.py
@@ -13,7 +13,6 @@
 from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.nn.utils.fusion import fuse_conv_bn_eval, fuse_conv_bn_weights
-
 from .. import config
 
 from ..fx_utils import matches_module_function_pattern
@@ -22,7 +21,7 @@
     PatternMatcherPass,
     stable_topological_sort,
 )
-from ..utils import is_cpu_device
+from ..utils import is_cpu_device, pass_execution_and_save
 from .group_batch_fusion import group_batch_fusion_passes
 from .misc_patterns import numpy_compat_normalization
 
@@ -35,6 +34,12 @@
 efficient_conv_bn_eval_pass = PatternMatcherPass(prevent_match_across_mutations=True)
 merge_getitem_cat_pass = PatternMatcherPass(prevent_match_across_mutations=True)
 predispatch_pass = PatternMatcherPass(prevent_match_across_mutations=True)
+# based on predispatch aten IR
+normalization_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
+merge_splits_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
+split_cat_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
+unbind_stack_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
+merge_getitem_cat_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True)
 
 pattern_matcher_passes: List[PatternMatcherPass] = [
     normalization_pass,
@@ -44,6 +49,13 @@
     unbind_stack_pass,
     efficient_conv_bn_eval_pass,
 ]
+pattern_matcher_passes_aten: List[PatternMatcherPass] = [
+    normalization_pass_aten,
+    merge_getitem_cat_pass_aten,
+    merge_splits_pass_aten,
+    split_cat_pass_aten,
+    unbind_stack_pass_aten,
+]
 
 
 @init_once_fakemode
@@ -66,7 +78,6 @@ def pre_grad_passes(gm: torch.fx.GraphModule, example_inputs):
     Consider adding a new pass to post_grad.py or joint_graph.py which
     are after functionalization and normalization.
     """
-
     if config.pattern_matcher:
         lazy_init()
         if hasattr(
@@ -75,8 +86,28 @@ def pre_grad_passes(gm: torch.fx.GraphModule, example_inputs):
             gm_before_fx_passes = gm.__copy__()
         # explicitly run with predispatch atenIR based passes
         if config.is_predispatch:
-            group_batch_fusion_passes(gm.graph, pre_grad=True)
-            predispatch_pass.apply(gm.graph)  # type: ignore[arg-type]
+            pass_execution_and_save(
+                group_batch_fusion_passes,
+                gm,
+                "[Pre grad(predispatch IR)] Apply group_batch_fusion",
+            )
+            pass_execution_and_save(
+                predispatch_pass.apply,
+                gm,
+                "[Pre grad(predispatch IR)] Apply predispatch_pass",
+            )
+            log.debug(
+                "[Pre grad(predispatch IR)]Before split cat in pre grad pass. graph: %s",
+                gm.graph,
+            )
+            for ind, pattern_matcher_pass_aten in enumerate(
+                pattern_matcher_passes_aten
+            ):
+                pass_execution_and_save(
+                    pattern_matcher_pass_aten.apply,
+                    gm,
+                    f"[Pre grad(predispatch IR)]Apply split_cat, index: {ind}",
+                )
         else:
             gm = fuse_fx(gm, example_inputs)
             numpy_compat_normalization(gm.graph)
diff --git a/torch/_inductor/fx_passes/split_cat.py b/torch/_inductor/fx_passes/split_cat.py
@@ -299,11 +299,9 @@ class TorchSplit(CallFunction):
     splits are unique getitems.
     """
 
-    def __init__(self, arg, sizes):
+    def __init__(self, arg, sizes, func=torch.split):
         # using KeywordArg("dim") for `dim` checks they all match
-        super().__init__(
-            torch.split, arg, sizes, _users=MULTIPLE, dim=KeywordArg("dim")
-        )
+        super().__init__(func, arg, sizes, _users=MULTIPLE, dim=KeywordArg("dim"))
 
     def _match(self, node: torch.fx.Node, ctx: MatchContext):
         m = super()._match(node, ctx)
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
@@ -6,6 +6,7 @@
 import functools
 import getpass
 import inspect
+import io
 import itertools
 import logging
 import math
@@ -19,6 +20,7 @@
 import textwrap
 import time
 import unittest
+from datetime import datetime
 from io import StringIO
 from typing import (
     Any,
@@ -45,7 +47,6 @@
 from torch.autograd import DeviceType
 from torch.autograd.profiler_util import EventList
 from torch.utils._sympy.functions import CeilDiv, CleanDiv, FloorDiv, ModularIndexing
-
 from . import config
 
 log = logging.getLogger(__name__)
@@ -1225,3 +1226,35 @@ class Placeholder(enum.Enum):
     # The descriptive name of the triton kernel; when unique_kernel_names = False, this
     # placeholder will be replaced with a string with more information.
     DESCRIPTIVE_NAME = "DESCRIPTIVE_NAME"
+
+
+def pass_execution_and_save(func, gm, msg):
+    from .pattern_matcher import stable_topological_sort
+
+    with tempfile.NamedTemporaryFile(
+        mode="w",
+        encoding="utf-8",
+        delete=False,
+    ) as f:
+        before_io = io.StringIO()
+        after_io = io.StringIO()
+        print(f"Before:\n{gm.graph}", file=f)
+        print(gm.graph, file=before_io)
+        start_time = datetime.now()
+        func(gm.graph)
+        time_elapsed = datetime.now() - start_time
+        # recompile graph
+        stable_topological_sort(gm.graph)
+        gm.graph.lint()
+        gm.recompile()
+
+        print(f"After:\n{gm.graph}", file=f)
+        print(gm.graph, file=after_io)
+        t = before_io.getvalue() == after_io.getvalue()
+        log.info(
+            "%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s",
+            msg,
+            f.name,
+            t,
+            time_elapsed,
+        )