pyg-team · akihironitta · Jul 11, 2024 · Jul 20, 2024 · Jul 20, 2024 · Jul 21, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [0.5.0] - 2023-MM-DD
 ### Added
+- Added `torch.compile` support for `segment_matmul` ([#333](https://github.com/pyg-team/pyg-lib/pull/333))
 - Added PyTorch 2.4 support ([#338](https://github.com/pyg-team/pyg-lib/pull/338))
 - Added PyTorch 2.3 support ([#322](https://github.com/pyg-team/pyg-lib/pull/322))
 - Added Windows support ([#315](https://github.com/pyg-team/pyg-lib/pull/315))

diff --git a/pyg_lib/_compile.py b/pyg_lib/_compile.py
@@ -0,0 +1,12 @@
+from collections.abc import Callable
+
+import torch
+
+_WITH_PT24 = tuple(map(int, torch.__version__.split('.')[:2])) >= (2, 4)
+# On PyTorch >= 2.4 use `torch.library.register_fake`; else a no-op stand-in.
+if _WITH_PT24:
+    register_fake = torch.library.register_fake
+else:
+    # Decorator factory that ignores its arguments and returns `x` unchanged.
+    def register_fake(*args, **kwargs) -> Callable:
+        return lambda x: x
diff --git a/pyg_lib/csrc/ops/matmul.cpp b/pyg_lib/csrc/ops/matmul.cpp
@@ -51,8 +51,7 @@ at::Tensor segment_matmul(const at::Tensor& input,
   at::checkDim(c, input_arg, 2);
   at::checkDim(c, ptr_arg, 1);
   at::checkDim(c, other_arg, 3);
-  at::checkSize(c, other_arg, 1, input_arg->size(-1));
-  at::checkNumel(c, ptr_arg, other_arg->size(0) + 1);
+  at::checkSize_symint(c, other_arg, 1, input_arg->sym_size(-1));
 
   static auto op = c10::Dispatcher::singleton()
                        .findSchemaOrThrow("pyg::segment_matmul", "")
@@ -63,8 +62,9 @@ at::Tensor segment_matmul(const at::Tensor& input,
 TORCH_LIBRARY_FRAGMENT(pyg, m) {
   m.def(TORCH_SELECTIVE_SCHEMA(
       "pyg::grouped_matmul(Tensor[] input, Tensor[] other) -> Tensor[]"));
-  m.def(TORCH_SELECTIVE_SCHEMA(
-      "pyg::segment_matmul(Tensor input, Tensor ptr, Tensor other) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("pyg::segment_matmul(Tensor input, Tensor ptr, "
+                               "Tensor other) -> Tensor"),
+        {at::Tag::pt2_compliant_tag});  // tag the op as PT2-compliant
 }
 
 }  // namespace ops

diff --git a/pyg_lib/ops/__init__.py b/pyg_lib/ops/__init__.py
@@ -4,6 +4,8 @@
 import torch.utils._pytree as pytree
 from torch import Tensor
 
+from pyg_lib._compile import register_fake
+
 
 def _pytreeify(cls):
     r"""A pytree is Python nested data structure. It is a tree in the sense
@@ -152,8 +154,8 @@ def segment_matmul(
 
         out = pyg_lib.ops.segment_matmul(inputs, ptr, other)
         assert out.size() == (8, 32)
-        assert out[0:5] == inputs[0:5] @ other[0]
-        assert out[5:8] == inputs[5:8] @ other[1]
+        assert torch.allclose(out[0:5], inputs[0:5] @ other[0])
+        assert torch.allclose(out[5:8], inputs[5:8] @ other[1])
 
     Args:
         inputs: The left operand 2D matrix of shape :obj:`[N, K]`.
@@ -172,6 +174,21 @@ def segment_matmul(
     return out
 
 
+@register_fake("pyg::segment_matmul")
+def _(inputs, ptr, other):
+    # Fake (metadata-only) kernel for `pyg::segment_matmul`: validates the
+    # arguments and returns an uninitialized `[N, M]` tensor, no real matmul.
+    assert inputs.dtype == other.dtype
+    assert inputs.dim() == 2
+    assert ptr.dim() == 1
+    assert other.dim() == 3
+    assert ptr.size() == (other.size(0) + 1, )
+    # Inner dimensions must agree (`[N, K] @ [B, K, M]`); this mirrors the
+    # `checkSize_symint` check performed by the C++ entry point.
+    assert other.size(1) == inputs.size(1)
+    return inputs.new_empty((inputs.size(0), other.size(2)))
+
+
 def sampled_add(
     left: Tensor,
     right: Tensor,

diff --git a/test/ops/test_matmul.py b/test/ops/test_matmul.py
@@ -4,6 +4,7 @@
 import torch
 
 import pyg_lib
+from pyg_lib._compile import _WITH_PT24
 from pyg_lib.testing import withCUDA
 
 os.environ['NVIDIA_TF32_OVERRIDE'] = '0'
@@ -37,6 +38,50 @@ def test_segment_matmul_autograd(dtype, device):
     assert inputs.grad.size() == inputs.size()
 
 
+@withCUDA
+@pytest.mark.parametrize('dtype', [
+    pytest.param(torch.float32, id='float32'),
+    pytest.param(torch.bfloat16, id='bfloat16'),
+])
+@pytest.mark.parametrize('requires_grad', [
+    pytest.param(False, id='requires_grad_False'),
+    pytest.param(True, id='requires_grad_True'),
+])
+@pytest.mark.skipif(not _WITH_PT24, reason='PyTorch 2.4.0 is required')
+def test_segment_matmul_opcheck(device, dtype, requires_grad):
+    if requires_grad:
+        pytest.skip('TODO: Support requires_grad=True')
+    if device.type == 'cuda' and dtype == torch.bfloat16:
+        pytest.skip('CUDA does not support bfloat16')
+    # Check the op registration with `torch.library.opcheck` (local import).
+    from torch.library import opcheck
+
+    inputs = torch.randn(
+        (8, 16),
+        requires_grad=requires_grad,
+        device=device,
+        dtype=dtype,
+    )
+    ptr = torch.tensor([0, 5, 8], device=device)  # len == other.size(0) + 1
+    other = torch.randn(
+        (2, 16, 32),
+        requires_grad=requires_grad,
+        device=device,
+        dtype=dtype,
+    )
+    opcheck(
+        torch.ops.pyg.segment_matmul,
+        (inputs, ptr, other),
+        test_utils=[
+            "test_schema",
+            "test_autograd_registration",
+            "test_faketensor",
+            "test_aot_dispatch_static",
+            "test_aot_dispatch_dynamic",
+        ],
+    )
+
+
 @withCUDA
 @pytest.mark.parametrize('dtype', [torch.float, torch.bfloat16])
 @pytest.mark.parametrize('transposed', [True, False])