
Commit 03c4453

[release/2.6][SWDEV-523736] Fix some unittests for Navi4x
* Most testcases work properly on Navi48 (gfx1201) with TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1, so this commit enables that variable for this arch (a simplified sketch of the gating follows below). There is currently no AOTriton support for Navi44 (gfx1200), so those testcases are skipped.
* test_qconv2d_int8_mixed_bf16 is skipped because it was originally skipped in pytorch#112550 but the skip was later lost.
* test_sac_ilp_case1 is skipped as per SWDEV-509011.
* test_distributed_checkpoint_state_dict_type[0-1]_cuda: fixed a bug with arguments.
1 parent eb37e58 commit 03c4453
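
For context, here is a minimal, self-contained sketch of the per-arch gating this commit relies on. It is not the actual PyTorch helper; names and control flow are simplified from the common_cuda.py diff further down.

# Sketch only: shows how ROCm arch detection drives the AOTriton experimental
# flag for Navi48 versus the skip on Navi44. The real logic lives in
# torch/testing/_internal/common_cuda.py (see the diff below).
import os
import torch

def rocm_gcn_arch() -> str:
    # Returns e.g. "gfx1201:sramecc+:xnack-" on ROCm builds, "" elsewhere.
    if torch.version.hip is None or not torch.cuda.is_available():
        return ""
    return torch.cuda.get_device_properties(0).gcnArchName

arch = rocm_gcn_arch()
if arch.startswith("gfx1201"):
    # Navi48: AOTriton-backed SDPA works when the experimental flag is set.
    os.environ["TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"] = "1"
elif arch.startswith("gfx1200"):
    # Navi44: no AOTriton support yet, so SDPA-dependent tests are skipped
    # (e.g. via skipIfRocmArch(NAVI44_ARCH)).
    pass

The actual change sets the variable inside evaluate_gfx_arch_within() and extends the flash-attention arch list, as shown in the common_cuda.py diff.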

File tree

11 files changed: +51, -15 lines


test/distributed/_tools/test_sac_ilp.py

Lines changed: 9 additions & 1 deletion

@@ -19,7 +19,14 @@
     sac_milp,
 )
 from torch.testing._internal.common_cuda import TEST_CUDA
-from torch.testing._internal.common_utils import run_tests, skipIfTorchDynamo, TestCase
+from torch.testing._internal.common_utils import (
+    run_tests,
+    skipIfTorchDynamo,
+    TestCase,
+    skipIfRocmArch,
+    NAVI4_ARCH,
+)
+
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     ModelArgs,
     Transformer,
@@ -131,6 +138,7 @@ def _collect_module_info_with_fake_tensor_mode(self) -> ModuleInfo:
 
     @skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
     @unittest.skipIf(not TEST_CUDA, "CUDA not available")
+    @skipIfRocmArch(NAVI4_ARCH)
     def test_sac_ilp_case1(self):
         """
         This is a case where the memory budget is either binding or too tight,

test/distributed/fsdp/test_distributed_checkpoint.py

Lines changed: 9 additions & 9 deletions

@@ -30,22 +30,22 @@
     )
     sys.exit(0)
 
-
-_DISTRIBUTED_STATE_DICT_IMPLS = {
-    StateDictType.LOCAL_STATE_DICT,
-    StateDictType.SHARDED_STATE_DICT,
-}
-
-
 class TestDistributedCheckpoint(FSDPTest):
     @property
     def world_size(self):
         return 2
 
     @skip_if_lt_x_gpu(2)
     @with_temp_dir
-    @parametrize("state_dict_type", _DISTRIBUTED_STATE_DICT_IMPLS)
-    def test_distributed_checkpoint(self, state_dict_type) -> None:
+    def test_distributed_checkpoint_state_dict_type0(self) -> None:
+        self._test_distributed_checkpoint(StateDictType.LOCAL_STATE_DICT)
+
+    @skip_if_lt_x_gpu(2)
+    @with_temp_dir
+    def test_distributed_checkpoint_state_dict_type1(self) -> None:
+        self._test_distributed_checkpoint(StateDictType.SHARDED_STATE_DICT)
+
+    def _test_distributed_checkpoint(self, state_dict_type) -> None:
         with enable_wrap(wrapper_cls=FSDP):
             torch.manual_seed(100)
             model = wrap(SkipModel(double_nest=True))

test/dynamo/test_activation_checkpointing.py

Lines changed: 2 additions & 0 deletions

@@ -20,6 +20,7 @@
 from torch._higher_order_ops.wrap import tag_activation_checkpoint
 from torch.testing._internal.common_cuda import (
     PLATFORM_SUPPORTS_CUDNN_ATTENTION,
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
     SM90OrLater,
 )
 from torch.testing._internal.common_utils import IS_WINDOWS, skipIfRocm
@@ -1279,6 +1280,7 @@ def fn(x, ys):
         self.assertEqual(ref, res)
 
     @requires_cuda
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_pattern_matcher(self):
         # Check that the sdpa op is recomputed in the backward graph
         # tests percolate_tags

test/dynamo/test_repros.py

Lines changed: 3 additions & 0 deletions

@@ -47,6 +47,8 @@
     parametrize,
     skipIfWindows,
     TEST_WITH_ROCM,
+    skipIfRocmArch,
+    NAVI44_ARCH,
 )
 from torch.testing._internal.two_tensor import TwoTensor
 
@@ -6408,6 +6410,7 @@ def fn(x):
         self.assertEqual(fn(inp), opt_fn(inp))
 
     @requires_cuda
+    @skipIfRocmArch(NAVI44_ARCH)
     def test_sdpa_dynamic_shapes(self):
         def f(x, s0, s1, s2):
             q = x.view(2, s0, s2, s0)

test/higher_order_ops/test_invoke_subgraph.py

Lines changed: 2 additions & 0 deletions

@@ -17,6 +17,7 @@
     TEST_WITH_CROSSREF,
     TestCase,
 )
+from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
 from torch.testing._internal.inductor_utils import HAS_CUDA
 
 
@@ -167,6 +168,7 @@ def fn(x):
         self.assertEqual(x.grad, x_clone.grad)
 
     @requires_cuda
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_sdpa(self):
         @mark_compile_region
         def gn(q, k, v):

test/inductor/test_flex_attention.py

Lines changed: 6 additions & 1 deletion

@@ -34,7 +34,11 @@
 )
 from torch.testing import FileCheck
 from torch.testing._internal import common_utils
-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16, TEST_MULTIGPU
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_BF16,
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
+    TEST_MULTIGPU,
+)
 from torch.testing._internal.common_device_type import (
     flex_attention_supported_platform as supported_platform,
 )
@@ -2610,6 +2614,7 @@ def test_kernel_options_argument_is_respected(self):
         FileCheck().check("BLOCK_M : tl.constexpr = 16").run(code[0])
 
     @supported_platform
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_comparison_vs_sdpa(self):
         def causal(score, b, h, q_idx, kv_idx):
             return torch.where(q_idx >= kv_idx, score, -float("inf"))

test/inductor/test_flex_decoding.py

Lines changed: 10 additions & 2 deletions

@@ -4,7 +4,7 @@
 import functools
 from collections import namedtuple
 from typing import Callable, Optional, Tuple, Union
-from unittest import expectedFailure, skipUnless
+from unittest import expectedFailure, skipUnless, skipIf
 from unittest.mock import patch
 
 import torch
@@ -21,7 +21,10 @@
 )
 from torch.testing import FileCheck
 from torch.testing._internal import common_utils
-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_BF16,
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
+)
 from torch.testing._internal.common_utils import skipIfRocm
 from torch.utils._triton import has_triton
 
@@ -1342,6 +1345,7 @@ def test_windowed_no_mask_vs_sdpa(self):
         self.run_test_with_call(attention, sdpa_attention, Q_H=16, KV_H=16, Q_S=8)
 
     @supported_platform
+    @skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_full_mask_vs_sdpa(self):
         def mask_mod(b, h, q, kv):
             return q + 1000 >= kv
@@ -1361,6 +1365,7 @@ def mask_mod(b, h, q, kv):
         self.run_test_with_call(attention, sdpa_attention, Q_H=16, KV_H=16, Q_S=8)
 
     @supported_platform
+    @skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_partial_block_vs_sdpa(self):
         def mask_mod(b, h, q, kv):
             return q + 1000 >= kv
@@ -1376,6 +1381,7 @@ def mask_mod(b, h, q, kv):
         self.run_test_with_call(attention, sdpa_attention, Q_H=16, KV_H=16, Q_S=8)
 
     @supported_platform
+    @skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_no_mask_vs_sdpa_paged_attention(self):
         score_mod = _generate_windowed(1000)
 
@@ -1386,6 +1392,7 @@ def test_windowed_no_mask_vs_sdpa_paged_attention(self):
         )
 
     @supported_platform
+    @skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_full_mask_vs_sdpa_paged_attention(self):
         def mask_mod(b, h, q, kv):
             return q + 1000 >= kv
@@ -1397,6 +1404,7 @@ def mask_mod(b, h, q, kv):
         )
 
     @supported_platform
+    @skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_partial_block_vs_sdpa_paged_attention(self):
         def mask_mod(b, h, q, kv):
             return q + 1000 >= kv

test/inductor/test_mkldnn_pattern_matcher.py

Lines changed: 1 addition & 0 deletions

@@ -1035,6 +1035,7 @@ def test_qconv2d_xpu(self):
     @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
+    @skipIfRocm
     def test_qconv2d_int8_mixed_bf16(self):
         r"""
         This testcase will quantize a single Conv2d module with int8_mixed_bf16 quantization.

test/inductor/test_torchinductor.py

Lines changed: 1 addition & 0 deletions

@@ -10611,6 +10611,7 @@ def fn(q, k, v):
         )
 
     @expectedFailureXPU
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_scaled_dot_product_efficient_attention(self):
         if self.device == "cpu":
             raise unittest.SkipTest(f"requires {GPU_TYPE}")

torch/testing/_internal/common_cuda.py

Lines changed: 7 additions & 2 deletions

@@ -43,14 +43,19 @@ def evaluate_gfx_arch_within(arch_list):
     effective_arch = os.environ.get('PYTORCH_DEBUG_FLASH_ATTENTION_GCN_ARCH_OVERRIDE', gcn_arch_name)
     # gcnArchName can be complicated strings like gfx90a:sramecc+:xnack-
     # Hence the matching should be done reversely
-    return any(arch in effective_arch for arch in arch_list)
+    result = any(arch in effective_arch for arch in arch_list)
+
+    if result and gcn_arch_name == "gfx1201":
+        os.environ['TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL'] = '1'
+
+    return result
 
 def CDNA2OrLater():
     return evaluate_gfx_arch_within(["gfx90a", "gfx942"])
 
 def evaluate_platform_supports_flash_attention():
     if TEST_WITH_ROCM:
-        arch_list = ["gfx90a", "gfx942", "gfx1100"]
+        arch_list = ["gfx90a", "gfx942", "gfx1100", "gfx1201"]
         return evaluate_gfx_arch_within(arch_list)
     if TEST_CUDA:
         return not IS_WINDOWS and SM80OrLater

torch/testing/_internal/common_utils.py

Lines changed: 1 addition & 0 deletions

@@ -111,6 +111,7 @@
 NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201")
 NAVI3_ARCH = ("gfx1100", "gfx1101")
 NAVI4_ARCH = ("gfx1200", "gfx1201")
+NAVI44_ARCH = "gfx1200"
 
 def is_navi3_arch():
     if torch.cuda.is_available():
