
Commit 6900575

Committed Feb 6, 2025
Thread-safe parallel stencil tests
To avoid repeating boilerplate code in testing, `StencilTestSuite` provides a convenient interface for testing gtscript stencils. Within that `StencilTestSuite` base class, generating the stencil is separated from running and validating the stencil code. Each deriving test class ends up with two tests: one for stencil generation, and a second one that tests the implementation by running the generated code with defined inputs and expected outputs. The base class was written such that the implementation test re-uses the generated stencil code from the first test. This introduces an implicit test order dependency. To save time and avoid unnecessary test failure output, failing to generate the stencil code automatically skips the implementation/validation test.

Running tests in parallel (with `xdist`) breaks the expected test execution order (in the default configuration). This leads to automatically skipped validation tests whenever the stencil code hasn't been generated yet. On the CI, we only run with 2 threads, so usually only a couple of tests were skipped. Locally, I was running with 16 threads and got ~30 skipped validation tests.

This PR proposes to address the issue by setting an `xdist_group` mark on the generation/implementation tests that belong together. In combination with `--dist loadgroup`, this keeps the expected order where necessary. Only tests with `xdist_group` markers are affected by `--dist loadgroup`; tests without that marker are distributed normally, as in `--dist load` mode (the default so far). By grouping with `cls_name` and backend, we keep maximal parallelization, grouping only the two tests that depend on each other.
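For illustration, here is a minimal, self-contained sketch of the mechanism (not code from this PR; the test names and the cached value are made up). Both tests carry the same `xdist_group` name, so running `pytest -n <N> --dist loadgroup` schedules them on the same worker, in file order:

import pytest

_cache = {}  # stands in for the shared test context holding the compiled stencil

@pytest.mark.xdist_group(name="MySuite_mybackend")
def test_generation():
    # Hypothetical stand-in for compiling the stencil and caching the result.
    _cache["implementation"] = lambda: 42

@pytest.mark.xdist_group(name="MySuite_mybackend")
def test_implementation():
    # Runs on the same xdist worker as test_generation, after it.
    implementation = _cache.get("implementation")
    assert implementation is not None
    assert implementation() == 42

Without `--dist loadgroup` (or without the shared group name), `xdist` may send the two tests to different workers, and the cache lookup fails.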
Parent: 14d18b3

2 files changed: +44 -30 lines
noxfile.py (+9 -9)
@@ -91,14 +91,14 @@ def test_cartesian(
     markers = " and ".join(codegen_settings["markers"] + device_settings["markers"])

     session.run(
-        *f"pytest --cache-clear -sv -n {num_processes}".split(),
+        *f"pytest --cache-clear -sv -n {num_processes} --dist loadgroup".split(),
         *("-m", f"{markers}"),
-        str(pathlib.Path("tests") / "cartesian_tests"),
+        pathlib.Path("tests") / "cartesian_tests",
         *session.posargs,
     )
     session.run(
         *"pytest --doctest-modules --doctest-ignore-import-errors -sv".split(),
-        str(pathlib.Path("src") / "gt4py" / "cartesian"),
+        pathlib.Path("src") / "gt4py" / "cartesian",
     )

@@ -135,12 +135,12 @@ def test_eve(session: nox.Session) -> None:

     session.run(
         *f"pytest --cache-clear -sv -n {num_processes}".split(),
-        str(pathlib.Path("tests") / "eve_tests"),
+        pathlib.Path("tests") / "eve_tests",
         *session.posargs,
     )
     session.run(
         *"pytest --doctest-modules -sv".split(),
-        str(pathlib.Path("src") / "gt4py" / "eve"),
+        pathlib.Path("src") / "gt4py" / "eve",
     )

@@ -186,13 +186,13 @@ def test_next(
     session.run(
         *f"pytest --cache-clear -sv -n {num_processes}".split(),
         *("-m", f"{markers}"),
-        str(pathlib.Path("tests") / "next_tests"),
+        pathlib.Path("tests") / "next_tests",
         *session.posargs,
         success_codes=[0, NO_TESTS_COLLECTED_EXIT_CODE],
     )
     session.run(
         *"pytest --doctest-modules --doctest-ignore-import-errors -sv".split(),
-        str(pathlib.Path("src") / "gt4py" / "next"),
+        pathlib.Path("src") / "gt4py" / "next",
         success_codes=[0, NO_TESTS_COLLECTED_EXIT_CODE],
     )

@@ -217,12 +217,12 @@ def test_storage(
     session.run(
         *f"pytest --cache-clear -sv -n {num_processes}".split(),
         *("-m", f"{markers}"),
-        str(pathlib.Path("tests") / "storage_tests"),
+        pathlib.Path("tests") / "storage_tests",
         *session.posargs,
     )
     session.run(
         *"pytest --doctest-modules -sv".split(),
-        str(pathlib.Path("src") / "gt4py" / "storage"),
+        pathlib.Path("src") / "gt4py" / "storage",
         success_codes=[0, NO_TESTS_COLLECTED_EXIT_CODE],
     )

src/gt4py/cartesian/testing/suites.py (+35 -21)
@@ -167,7 +167,7 @@ def get_globals_combinations(dtypes):
                 generation_strategy=composite_strategy_factory(
                     d, generation_strategy_factories
                 ),
-                implementations=[],
+                implementation=None,
                 test_id=len(cls_dict["tests"]),
                 definition=annotate_function(
                     function=cls_dict["definition"],
@@ -199,14 +199,19 @@ def hyp_wrapper(test_hyp, hypothesis_data):

         for test in cls_dict["tests"]:
             if test["suite"] == cls_name:
-                marks = test["marks"]
-                if gt4pyc.backend.from_name(test["backend"]).storage_info["device"] == "gpu":
-                    marks.append(pytest.mark.requires_gpu)
                 name = test["backend"]
                 name += "".join(f"_{key}_{value}" for key, value in test["constants"].items())
                 name += "".join(
                     "_{}_{}".format(key, value.name) for key, value in test["dtypes"].items()
                 )
+
+                marks = test["marks"].copy()
+                if gt4pyc.backend.from_name(test["backend"]).storage_info["device"] == "gpu":
+                    marks.append(pytest.mark.requires_gpu)
+                # Run generation and implementation tests in the same group to ensure
+                # (thread-) safe parallelization of stencil tests.
+                marks.append(pytest.mark.xdist_group(name=f"{cls_name}_{name}"))
+
                 param = pytest.param(test, marks=marks, id=name)
                 pytest_params.append(param)

@@ -228,14 +233,19 @@ def hyp_wrapper(test_hyp, hypothesis_data):
         runtime_pytest_params = []
         for test in cls_dict["tests"]:
             if test["suite"] == cls_name:
-                marks = test["marks"]
-                if gt4pyc.backend.from_name(test["backend"]).storage_info["device"] == "gpu":
-                    marks.append(pytest.mark.requires_gpu)
                 name = test["backend"]
                 name += "".join(f"_{key}_{value}" for key, value in test["constants"].items())
                 name += "".join(
                     "_{}_{}".format(key, value.name) for key, value in test["dtypes"].items()
                 )
+
+                marks = test["marks"].copy()
+                if gt4pyc.backend.from_name(test["backend"]).storage_info["device"] == "gpu":
+                    marks.append(pytest.mark.requires_gpu)
+                # Run generation and implementation tests in the same group to ensure
+                # (thread-) safe parallelization of stencil tests.
+                marks.append(pytest.mark.xdist_group(name=f"{cls_name}_{name}"))
+
                 runtime_pytest_params.append(
                     pytest.param(
                         test,
@@ -434,8 +444,11 @@ class StencilTestSuite(metaclass=SuiteMeta):
     def _test_generation(cls, test, externals_dict):
         """Test source code generation for all *backends* and *stencil suites*.

-        The generated implementations are cached in a :class:`utils.ImplementationsDB`
-        instance, to avoid duplication of (potentially expensive) compilations.
+        The generated implementation is cached in the test context, to avoid duplication
+        of (potentially expensive) compilation.
+        Note: This caching introduces a dependency between tests, which is captured by an
+        `xdist_group` marker in combination with `--dist loadgroup` to ensure safe parallel
+        test execution.
         """
         backend_slug = gt_utils.slugify(test["backend"], valid_symbols="")
         implementation = gtscript.stencil(
@@ -461,7 +474,8 @@ def _test_generation(cls, test, externals_dict):
                 or ax == "K"
                 or field_info.boundary[i] >= cls.global_boundaries[name][i]
             )
-        test["implementations"].append(implementation)
+        assert test["implementation"] is None
+        test["implementation"] = implementation

     @classmethod
     def _run_test_implementation(cls, parameters_dict, implementation):  # too complex
@@ -585,16 +599,16 @@ def _run_test_implementation(cls, parameters_dict, implementation):  # too complex
     def _test_implementation(cls, test, parameters_dict):
         """Test computed values for implementations generated for all *backends* and *stencil suites*.

-        The generated implementations are reused from previous tests by means of a
-        :class:`utils.ImplementationsDB` instance shared at module scope.
+        The generated implementation was cached in the test context, to avoid duplication
+        of (potentially expensive) compilation.
+        Note: This caching introduces a dependency between tests, which is captured by an
+        `xdist_group` marker in combination with `--dist loadgroup` to ensure safe parallel
+        test execution.
         """
-        implementation_list = test["implementations"]
-        if not implementation_list:
-            pytest.skip(
-                "Cannot perform validation tests, since there are no valid implementations."
-            )
-        for implementation in implementation_list:
-            if not isinstance(implementation, StencilObject):
-                raise RuntimeError("Wrong function got from implementations_db cache!")
+        implementation = test["implementation"]
+        assert (
+            implementation is not None
+        ), "Stencil not yet generated. Did you attempt to run stencil tests in parallel?"
+        assert isinstance(implementation, StencilObject)

-            cls._run_test_implementation(parameters_dict, implementation)
+        cls._run_test_implementation(parameters_dict, implementation)
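To make the grouping concrete, here is a small illustration of the group name built in the diff above (the suite name, backend, constant, and dtype values are hypothetical):

# Hypothetical values, only to show the shape of the generated group name.
cls_name = "TestHorizontalDiffusion"
name = "gt:cpu_ifirst"      # backend
name += "_WEIGHT_4"         # one entry per constant: _{key}_{value}
name += "_a_float64"        # one entry per dtype: _{key}_{value.name}

# Both the generation test and the validation test get this marker,
# so `--dist loadgroup` keeps the pair on one worker while all other
# (cls_name, backend) combinations still run in parallel.
group_name = f"{cls_name}_{name}"
print(group_name)  # TestHorizontalDiffusion_gt:cpu_ifirst_WEIGHT_4_a_float64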
