
Commit 6900575

Committed Feb 6, 2025
Thread-safe parallel stencil tests
To avoid repeating boilerplate code in testing, `StencilTestSuite` provides a convenient interface for testing gtscript stencils. Within that `StencilTestSuite` base class, generating the stencil is separated from running and validating the stencil code. Each deriving test class ends up with two tests: one for stencil generation, and a second one that tests the implementation by running the generated code with defined inputs and expected outputs. The base class was written such that the implementation test re-uses the generated stencil code from the first test. This introduces an implicit test order dependency. To save time and avoid unnecessary test failure output, failing to generate the stencil code automatically skips the implementation/validation test.

Running tests in parallel (with `xdist`) breaks the expected test execution order (in the default configuration). This leads to automatically skipped validation tests whenever the stencil code hasn't been generated yet. On the CI, we only run with 2 threads, so usually only a couple of tests were skipped. Locally, I was running with 16 threads and got ~30 skipped validation tests.

This PR proposes to address the issue by setting an `xdist_group` mark on the generation/implementation tests that belong together. In combination with `--dist loadgroup`, this keeps the expected order where necessary. Only tests with `xdist_group` markers are affected by `--dist loadgroup`; tests without that marker are distributed normally, as in `--dist load` mode (the default so far). By grouping with `cls_name` and backend, we keep maximal parallelization, grouping only the two tests that depend on each other.
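For illustration, here is a minimal, self-contained sketch of the mechanism (not code from this PR; the test names and the cached value are made up). Both tests carry the same `xdist_group` name, so running `pytest -n <N> --dist loadgroup` schedules them on the same worker, in file order:

import pytest

_cache = {}  # stands in for the shared test context holding the compiled stencil

@pytest.mark.xdist_group(name="MySuite_mybackend")
def test_generation():
    # Hypothetical stand-in for compiling the stencil and caching the result.
    _cache["implementation"] = lambda: 42

@pytest.mark.xdist_group(name="MySuite_mybackend")
def test_implementation():
    # Runs on the same xdist worker as test_generation, after it.
    implementation = _cache.get("implementation")
    assert implementation is not None
    assert implementation() == 42

Without `--dist loadgroup` (or without the shared group name), `xdist` may send the two tests to different workers, and the cache lookup fails.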
Parent: 14d18b3

2 files changed: +44 -30 lines
noxfile.py (+9 -9)
@@ -91,14 +91,14 @@ def test_cartesian(
     markers = " and ".join(codegen_settings["markers"] + device_settings["markers"])

     session.run(
-        *f"pytest --cache-clear -sv -n {num_processes}".split(),
+        *f"pytest --cache-clear -sv -n {num_processes} --dist loadgroup".split(),
         *("-m", f"{markers}"),
-        str(pathlib.Path("tests") / "cartesian_tests"),
+        pathlib.Path("tests") / "cartesian_tests",
         *session.posargs,
     )
     session.run(
         *"pytest --doctest-modules --doctest-ignore-import-errors -sv".split(),
-        str(pathlib.Path("src") / "gt4py" / "cartesian"),
+        pathlib.Path("src") / "gt4py" / "cartesian",
     )

@@ -135,12 +135,12 @@ def test_eve(session: nox.Session) -> None:

     session.run(
         *f"pytest --cache-clear -sv -n {num_processes}".split(),
-        str(pathlib.Path("tests") / "eve_tests"),
+        pathlib.Path("tests") / "eve_tests",
         *session.posargs,
     )
     session.run(
         *"pytest --doctest-modules -sv".split(),
-        str(pathlib.Path("src") / "gt4py" / "eve"),
+        pathlib.Path("src") / "gt4py" / "eve",
     )

@@ -186,13 +186,13 @@ def test_next(
     session.run(
         *f"pytest --cache-clear -sv -n {num_processes}".split(),
         *("-m", f"{markers}"),
-        str(pathlib.Path("tests") / "next_tests"),
+        pathlib.Path("tests") / "next_tests",
         *session.posargs,
         success_codes=[0, NO_TESTS_COLLECTED_EXIT_CODE],
     )
     session.run(
         *"pytest --doctest-modules --doctest-ignore-import-errors -sv".split(),
-        str(pathlib.Path("src") / "gt4py" / "next"),
+        pathlib.Path("src") / "gt4py" / "next",
         success_codes=[0, NO_TESTS_COLLECTED_EXIT_CODE],
     )

@@ -217,12 +217,12 @@ def test_storage(
     session.run(
         *f"pytest --cache-clear -sv -n {num_processes}".split(),
         *("-m", f"{markers}"),
-        str(pathlib.Path("tests") / "storage_tests"),
+        pathlib.Path("tests") / "storage_tests",
         *session.posargs,
     )
     session.run(
         *"pytest --doctest-modules -sv".split(),
-        str(pathlib.Path("src") / "gt4py" / "storage"),
+        pathlib.Path("src") / "gt4py" / "storage",
         success_codes=[0, NO_TESTS_COLLECTED_EXIT_CODE],
     )

src/gt4py/cartesian/testing/suites.py (+35 -21)
@@ -167,7 +167,7 @@ def get_globals_combinations(dtypes):
                 generation_strategy=composite_strategy_factory(
                     d, generation_strategy_factories
                 ),
-                implementations=[],
+                implementation=None,
                 test_id=len(cls_dict["tests"]),
                 definition=annotate_function(
                     function=cls_dict["definition"],
@@ -199,14 +199,19 @@ def hyp_wrapper(test_hyp, hypothesis_data):

         for test in cls_dict["tests"]:
             if test["suite"] == cls_name:
-                marks = test["marks"]
-                if gt4pyc.backend.from_name(test["backend"]).storage_info["device"] == "gpu":
-                    marks.append(pytest.mark.requires_gpu)
                 name = test["backend"]
                 name += "".join(f"_{key}_{value}" for key, value in test["constants"].items())
                 name += "".join(
                     "_{}_{}".format(key, value.name) for key, value in test["dtypes"].items()
                 )
+
+                marks = test["marks"].copy()
+                if gt4pyc.backend.from_name(test["backend"]).storage_info["device"] == "gpu":
+                    marks.append(pytest.mark.requires_gpu)
+                # Run generation and implementation tests in the same group to ensure
+                # (thread-) safe parallelization of stencil tests.
+                marks.append(pytest.mark.xdist_group(name=f"{cls_name}_{name}"))
+
                 param = pytest.param(test, marks=marks, id=name)
                 pytest_params.append(param)

@@ -228,14 +233,19 @@ def hyp_wrapper(test_hyp, hypothesis_data):
         runtime_pytest_params = []
         for test in cls_dict["tests"]:
             if test["suite"] == cls_name:
-                marks = test["marks"]
-                if gt4pyc.backend.from_name(test["backend"]).storage_info["device"] == "gpu":
-                    marks.append(pytest.mark.requires_gpu)
                 name = test["backend"]
                 name += "".join(f"_{key}_{value}" for key, value in test["constants"].items())
                 name += "".join(
                     "_{}_{}".format(key, value.name) for key, value in test["dtypes"].items()
                 )
+
+                marks = test["marks"].copy()
+                if gt4pyc.backend.from_name(test["backend"]).storage_info["device"] == "gpu":
+                    marks.append(pytest.mark.requires_gpu)
+                # Run generation and implementation tests in the same group to ensure
+                # (thread-) safe parallelization of stencil tests.
+                marks.append(pytest.mark.xdist_group(name=f"{cls_name}_{name}"))
+
                 runtime_pytest_params.append(
                     pytest.param(
                         test,
@@ -434,8 +444,11 @@ class StencilTestSuite(metaclass=SuiteMeta):
     def _test_generation(cls, test, externals_dict):
         """Test source code generation for all *backends* and *stencil suites*.

-        The generated implementations are cached in a :class:`utils.ImplementationsDB`
-        instance, to avoid duplication of (potentially expensive) compilations.
+        The generated implementation is cached in the test context, to avoid duplication
+        of (potentially expensive) compilation.
+        Note: This caching introduces a dependency between tests, which is captured by an
+        `xdist_group` marker in combination with `--dist loadgroup` to ensure safe parallel
+        test execution.
         """
         backend_slug = gt_utils.slugify(test["backend"], valid_symbols="")
         implementation = gtscript.stencil(
@@ -461,7 +474,8 @@ def _test_generation(cls, test, externals_dict):
                 or ax == "K"
                 or field_info.boundary[i] >= cls.global_boundaries[name][i]
             )
-        test["implementations"].append(implementation)
+        assert test["implementation"] is None
+        test["implementation"] = implementation

     @classmethod
     def _run_test_implementation(cls, parameters_dict, implementation):  # too complex
@@ -585,16 +599,16 @@ def _run_test_implementation(cls, parameters_dict, implementation):  # too complex
     def _test_implementation(cls, test, parameters_dict):
         """Test computed values for implementations generated for all *backends* and *stencil suites*.

-        The generated implementations are reused from previous tests by means of a
-        :class:`utils.ImplementationsDB` instance shared at module scope.
+        The generated implementation was cached in the test context, to avoid duplication
+        of (potentially expensive) compilation.
+        Note: This caching introduces a dependency between tests, which is captured by an
+        `xdist_group` marker in combination with `--dist loadgroup` to ensure safe parallel
+        test execution.
         """
-        implementation_list = test["implementations"]
-        if not implementation_list:
-            pytest.skip(
-                "Cannot perform validation tests, since there are no valid implementations."
-            )
-        for implementation in implementation_list:
-            if not isinstance(implementation, StencilObject):
-                raise RuntimeError("Wrong function got from implementations_db cache!")
+        implementation = test["implementation"]
+        assert (
+            implementation is not None
+        ), "Stencil not yet generated. Did you attempt to run stencil tests in parallel?"
+        assert isinstance(implementation, StencilObject)

-            cls._run_test_implementation(parameters_dict, implementation)
+        cls._run_test_implementation(parameters_dict, implementation)
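To make the grouping concrete, here is a small illustration of the group name built in the diff above (the suite name, backend, constant, and dtype values are hypothetical):

# Hypothetical values, only to show the shape of the generated group name.
cls_name = "TestHorizontalDiffusion"
name = "gt:cpu_ifirst"      # backend
name += "_WEIGHT_4"         # one entry per constant: _{key}_{value}
name += "_a_float64"        # one entry per dtype: _{key}_{value.name}

# Both the generation test and the validation test get this marker,
# so `--dist loadgroup` keeps the pair on one worker while all other
# (cls_name, backend) combinations still run in parallel.
group_name = f"{cls_name}_{name}"
print(group_name)  # TestHorizontalDiffusion_gt:cpu_ifirst_WEIGHT_4_a_float64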
