From 40dae8786a0366ea4736ba83dd434d345cd43c4a Mon Sep 17 00:00:00 2001
From: Samuel Moors <samuel.moors@vub.be>
Date: Sat, 28 Dec 2024 19:17:13 +0100
Subject: [PATCH 1/7] use mixin class for osu

---
 eessi/testsuite/eessi_mixin.py    |   6 +-
 eessi/testsuite/tests/apps/osu.py | 235 +++++++++---------------------
 2 files changed, 69 insertions(+), 172 deletions(-)

diff --git a/eessi/testsuite/eessi_mixin.py b/eessi/testsuite/eessi_mixin.py
index d03d65e2..8753c8e1 100644
--- a/eessi/testsuite/eessi_mixin.py
+++ b/eessi/testsuite/eessi_mixin.py
@@ -43,6 +43,7 @@ class EESSI_Mixin(RegressionMixin):
     scale = parameter(SCALES.keys())
     bench_name = None
     bench_name_ci = None
+    num_tasks_per_compute_unit = 1
 
     # Create ReFrame variables for logging runtime environment information
     cvmfs_repo_name = variable(str, value='None')
@@ -118,7 +119,7 @@ def run_after_init(self):
         # Set scales as tags
         hooks.set_tag_scale(self)
 
-    @run_after('init')
+    @run_before('setup', always_last=True)
     def measure_mem_usage(self):
         if self.measure_memory_usage:
             hooks.measure_memory_usage(self)
@@ -163,7 +164,8 @@ def validate_setup(self):
     @run_after('setup')
     def assign_tasks_per_compute_unit(self):
         """Call hooks to assign tasks per compute unit, set OMP_NUM_THREADS, and set compact process binding"""
-        hooks.assign_tasks_per_compute_unit(test=self, compute_unit=self.compute_unit)
+        hooks.assign_tasks_per_compute_unit(test=self, compute_unit=self.compute_unit,
+                                            num_per=self.num_tasks_per_compute_unit)
 
         # Set OMP_NUM_THREADS environment variable
         hooks.set_omp_num_threads(self)
diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py
index 83bbd0f4..23ab1803 100644
--- a/eessi/testsuite/tests/apps/osu.py
+++ b/eessi/testsuite/tests/apps/osu.py
@@ -6,19 +6,20 @@
 non-GPU nodes. Otherwise those tests will FAIL.
 """
 import reframe as rfm
-from reframe.core.builtins import parameter, run_after  # added only to make the linter happy
+from reframe.core.builtins import parameter, run_after
 from reframe.utility import reframe
 
 from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark
 
-from eessi.testsuite import hooks, utils
-from eessi.testsuite.constants import *
+from eessi.testsuite.constants import COMPUTE_UNIT, CPU, DEVICE_TYPES, INVALID_SYSTEM, GPU, NODE, SCALES
+from eessi.testsuite.eessi_mixin import EESSI_Mixin
 from eessi.testsuite.utils import find_modules, log
 
 
 def filter_scales_pt2pt():
     """
     Filtering function for filtering scales for the pt2pt OSU test
+    returns all scales with either 2 cores, 1 full node, or 2 full nodes
     """
     return [
         k for (k, v) in SCALES.items()
@@ -30,7 +31,8 @@ def filter_scales_pt2pt():
 
 def filter_scales_coll():
     """
-    Filtering function for filtering scales for collective the OSU test
+    Filtering function for filtering scales for the collective OSU test
+    returns all scales with at least 2 cores
     """
     return [
         k for (k, v) in SCALES.items()
@@ -40,17 +42,13 @@ def filter_scales_coll():
 
 
 @rfm.simple_test
-class EESSI_OSU_Micro_Benchmarks_pt2pt(osu_benchmark):
-    ''' Run-only OSU test '''
-    scale = parameter(filter_scales_pt2pt())
-    valid_prog_environs = ['default']
-    valid_systems = ['*']
+class EESSI_OSU_Base(osu_benchmark, EESSI_Mixin):
+    """ base class for OSU tests """
     time_limit = '30m'
     module_name = parameter(find_modules('OSU-Micro-Benchmarks'))
-    # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both node types. To do this the default
-    # device type is set to GPU.
     device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]])
-    # unset num_tasks_per_node from the hpctestlib.
+
+    # reset num_tasks_per_node from the hpctestlib: we handle it ourselves
     num_tasks_per_node = None
 
     # Set num_warmup_iters to 5 to reduce execution time, especially on slower interconnects
@@ -58,6 +56,9 @@ class EESSI_OSU_Micro_Benchmarks_pt2pt(osu_benchmark):
     # Set num_iters to 10 to reduce execution time, especially on slower interconnects
     num_iters = 10
 
+    def required_mem_per_node(self):
+        return 1024
+
     @run_after('init')
     def filter_scales_2gpus(self):
         """Filter out scales with < 2 GPUs if running on GPUs"""
@@ -69,26 +70,6 @@ def filter_scales_2gpus(self):
             self.valid_systems = [INVALID_SYSTEM]
             log(f'valid_systems set to {self.valid_systems} for scale {self.scale} and device_type {self.device_type}')
 
-    @run_after('init')
-    def filter_benchmark_pt2pt(self):
-        """ Filter out all non-mpi.pt2pt benchmarks """
-        if not self.benchmark_info[0].startswith('mpi.pt2pt'):
-            self.valid_systems = [INVALID_SYSTEM]
-
-    @run_after('init')
-    def run_after_init(self):
-        """hooks to run after init phase"""
-
-        # Filter on which scales are supported by the partitions defined in the ReFrame configuration
-        hooks.filter_supported_scales(self)
-
-        hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)
-
-        hooks.set_modules(self)
-
-        # Set scales as tags
-        hooks.set_tag_scale(self)
-
     @run_after('init')
     def set_device_buffers(self):
         """
@@ -98,32 +79,46 @@ def set_device_buffers(self):
         """
         if self.device_type == DEVICE_TYPES[GPU]:
             self.device_buffers = 'cuda'
-
         else:
-            # If the device_type is CPU then device_buffers should always be CPU.
             self.device_buffers = 'cpu'
 
     @run_after('init')
-    def set_tag_ci(self):
-        """ Setting tests under CI tag. """
-        if (self.benchmark_info[0] in ['mpi.pt2pt.osu_latency', 'mpi.pt2pt.osu_bw']):
-            self.tags.add('CI')
-            log(f'tags set to {self.tags}')
+    def set_tags(self):
+        """ Setting custom tags """
+        self.bench_name = self.benchmark_info[0]
+        self.tags.add(self.bench_name.split('.')[-1])
+
+    @run_after('setup', always_last=True)
+    def skip_test_1gpu(self):
+        if self.device_type == DEVICE_TYPES[GPU]:
+            num_gpus = self.num_gpus_per_node * self.num_nodes
+            self.skip_if(num_gpus < 2, "Skipping GPU test : only 1 GPU available for this test case")
 
-        if (self.benchmark_info[0] == 'mpi.pt2pt.osu_bw'):
-            self.tags.add('osu_bw')
 
-        if (self.benchmark_info[0] == 'mpi.pt2pt.osu_latency'):
-            self.tags.add('osu_latency')
+@rfm.simple_test
+class EESSI_OSU_Micro_Benchmarks_pt2pt(EESSI_OSU_Base):
+    ''' point-to-point OSU test '''
+    scale = parameter(filter_scales_pt2pt())
+    compute_unit = COMPUTE_UNIT[NODE]
 
     @run_after('init')
-    def set_mem(self):
-        """ Setting an extra job option of memory. This test has only 4 possibilities: 1_node, 2_nodes, 2_cores and
-        1cpn_2nodes. This is implemented for all cases including full node cases. The requested memory may seem large
-        and the test requires at least 4.5 GB per core at the minimum for the full test when run with validation (-c
-        option for osu_bw or osu_latency). We run till message size 8 (-m 8) which significantly reduces memory
-        requirement."""
-        self.extra_resources = {'memory': {'size': '12GB'}}
+    def filter_benchmark_pt2pt(self):
+        """ Filter out all non-mpi.pt2pt benchmarks """
+        if not self.benchmark_info[0].startswith('mpi.pt2pt'):
+            self.valid_systems = [INVALID_SYSTEM]
+
+    @run_after('init')
+    def select_ci(self):
+        """ Select the CI variants: set bench_name_ci for the osu_latency and osu_bw benchmarks """
+        if (self.bench_name in ['mpi.pt2pt.osu_latency', 'mpi.pt2pt.osu_bw']):
+            self.bench_name_ci = self.bench_name
+
+    @run_after('init')
+    def set_num_tasks_per_compute_unit(self):
+        """ Set the number of tasks per compute unit: 2 tasks for scales with num_nodes == 1.
+        All other scales keep the class default (num_tasks_per_compute_unit = 1 in EESSI_Mixin)."""
+        if SCALES.get(self.scale).get('num_nodes') == 1:
+            self.num_tasks_per_compute_unit = 2
 
     @run_after('setup')
     def adjust_executable_opts(self):
@@ -132,134 +127,34 @@ def adjust_executable_opts(self):
         Therefore we must override it *after* the 'setup' phase
         """
         if self.device_type == DEVICE_TYPES[CPU]:
-            self.executable_opts = [ele for ele in self.executable_opts if ele != 'D']
-
-    @run_after('setup')
-    def set_num_tasks_per_node(self):
-        """ Setting number of tasks per node and cpus per task in this function. This function sets num_cpus_per_task
-        for 1 node and 2 node options where the request is for full nodes."""
-        if SCALES.get(self.scale).get('num_nodes') == 1:
-            hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], 2)
-        else:
-            hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE])
-
-    @run_after('setup')
-    def set_num_gpus_per_node(self):
-        """
-        Set number of GPUs per node for GPU-to-GPU tests
-        """
-        if self.device_type == DEVICE_TYPES[GPU]:
-            # Skip single-node tests with less than 2 GPU devices in the node
-            self.skip_if(
-                SCALES[self.scale]['num_nodes'] == 1 and self.default_num_gpus_per_node < 2,
-                "There are < 2 GPU devices present in the node."
-                f" Skipping tests with device_type={DEVICE_TYPES[GPU]} involving < 2 GPUs and 1 node."
-            )
-            if not self.num_gpus_per_node:
-                self.num_gpus_per_node = self.default_num_gpus_per_node
-                log(f'num_gpus_per_node set to {self.num_gpus_per_node} for partition {self.current_partition.name}')
+            self.executable_opts = [x for x in self.executable_opts if x != 'D']
 
 
 @rfm.simple_test
-class EESSI_OSU_Micro_Benchmarks_coll(osu_benchmark):
-    ''' Run-only OSU test '''
+class EESSI_OSU_Micro_Benchmarks_coll(EESSI_OSU_Base):
+    ''' collective OSU test '''
     scale = parameter(filter_scales_coll())
-    valid_prog_environs = ['default']
-    valid_systems = ['*']
-    time_limit = '30m'
-    module_name = parameter(utils.find_modules('OSU-Micro-Benchmarks'))
-    # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both node types. To do this the default
-    # device type is set to GPU.
-    device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]])
-    # Unset num_tasks_per_node from hpctestlib
-    num_tasks_per_node = None
-
-    # Set num_warmup_iters to 5 to reduce execution time, especially on slower interconnects
-    num_warmup_iters = 5
-    # Set num_iters to 10 to reduce execution time, especially on slower interconnects
-    num_iters = 10
 
     @run_after('init')
-    def run_after_init(self):
-        """hooks to run after init phase"""
-        # Note: device_buffers variable is inherited from the hpctestlib class and adds options to the launcher
-        # commands based on what device is set.
-        self.device_buffers = 'cpu'
-        # Filter on which scales are supported by the partitions defined in the ReFrame configuration
-        hooks.filter_supported_scales(self)
-        hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)
-        is_cuda_module = utils.is_cuda_required_module(self.module_name)
-        if is_cuda_module and self.device_type == DEVICE_TYPES[GPU]:
-            self.device_buffers = 'cuda'
-
-        # If the device_type is CPU then device buffer should always be CPU.
-        if self.device_type == DEVICE_TYPES[CPU]:
-            self.device_buffers = 'cpu'
-        # This part of the code removes the collective communication calls out of the run list since this test is only
-        # meant for collective.
+    def filter_benchmark_coll(self):
+        """ Filter out all non-mpi.collective benchmarks """
         if not self.benchmark_info[0].startswith('mpi.collective'):
-            self.valid_systems = []
-        hooks.set_modules(self)
-
-    @run_after('init')
-    def set_tag_ci(self):
-        if (self.benchmark_info[0] == 'mpi.collective.osu_allreduce'
-           or self.benchmark_info[0] == 'mpi.collective.osu_alltoall'):
-            self.tags.add('CI')
-        if (self.benchmark_info[0] == 'mpi.collective.osu_allreduce'):
-            self.tags.add('osu_allreduce')
-        if (self.benchmark_info[0] == 'mpi.collective.osu_alltoall'):
-            self.tags.add('osu_alltoall')
+            self.valid_systems = [INVALID_SYSTEM]
 
     @run_after('init')
-    def set_mem(self):
-        """ Setting an extra job option of memory. The alltoall operation takes maximum memory of 0.1 GB per core for a
-        message size of 8 and almost 0.5 GB per core for the maximum message size the test allows. But we limit the
-        message sizes to 8 and for a safety net we take 64 GB assuming dense nodes works for all the tests and node
-        types."""
-        self.extra_resources = {'memory': {'size': '64GB'}}
+    def select_ci(self):
+        """ Select the CI variants: set bench_name_ci for the osu_allreduce and osu_alltoall benchmarks """
+        if (self.bench_name in ['mpi.collective.osu_allreduce', 'mpi.collective.osu_alltoall']):
+            self.bench_name_ci = self.bench_name
 
     @run_after('init')
-    def set_num_tasks(self):
-        hooks.set_tag_scale(self)
-
-    @run_after('setup')
-    def set_num_tasks_per_node(self):
-        """ Setting number of tasks per node, cpus per task and gpus per node in this function. This function sets
-        num_cpus_per_task for 1 node and 2 node options where the request is for full nodes."""
-        max_avail_cpus_per_node = self.current_partition.processor.num_cpus
-        if self.device_buffers == 'cpu':
-            # Setting num_tasks and num_tasks_per_node for the CPU tests
-            if SCALES.get(self.scale).get('num_cpus_per_node', 0):
-                hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE],
-                                                    self.default_num_cpus_per_node)
-            elif SCALES.get(self.scale).get('node_part', 0):
-                pass_num_per = int(max_avail_cpus_per_node / SCALES.get(self.scale).get('node_part', 0))
-                if pass_num_per > 1:
-                    hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], pass_num_per)
-                else:
-                    self.skip(msg="Too few cores available for a collective operation.")
-
-            if FEATURES[GPU] in self.current_partition.features:
-                max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self)
-                # Setting number of GPU for a cpu test on a GPU node.
-                if SCALES.get(self.scale).get('num_nodes') == 1:
-                    self.num_gpus_per_node = 1
-                else:
-                    self.num_gpus_per_node = max_avail_gpus_per_node
-        elif self.device_buffers == 'cuda':
-            max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self)
-            # Setting num_tasks and num_tasks_per_node for the GPU tests
-            if max_avail_gpus_per_node == 1 and SCALES.get(self.scale).get('num_nodes') == 1:
-                self.skip(msg="There is only 1 device in the node. Skipping collective tests involving only 1 node.")
-            else:
-                if SCALES.get(self.scale).get('num_gpus_per_node', 0) * SCALES.get(self.scale).get('num_nodes', 0) > 1:
-                    hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(GPU, FEATURES[GPU]))
-                elif SCALES.get(self.scale).get('node_part', 0):
-                    pass_num_per = int(max_avail_gpus_per_node / SCALES.get(self.scale).get('node_part', 0))
-                    if pass_num_per > 1:
-                        hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(GPU, FEATURES[GPU]))
-                    else:
-                        self.skip(msg="Total GPUs (max_avail_gpus_per_node / node_part) is 1 less.")
-                else:
-                    self.skip(msg="Total GPUs (num_nodes * num_gpus_per_node) = 1")
+    def set_compute_unit(self):
+        """
+        Set the compute unit to which tasks will be assigned:
+        one task per core for CPU runs, and one task per GPU for GPU runs.
+        """
+        device_to_compute_unit = {
+            DEVICE_TYPES[CPU]: COMPUTE_UNIT[CPU],
+            DEVICE_TYPES[GPU]: COMPUTE_UNIT[GPU],
+        }
+        self.compute_unit = device_to_compute_unit.get(self.device_type)

From 29051545a95577855cb13adf5a5cfdad97fcca8f Mon Sep 17 00:00:00 2001
From: Samuel Moors <samuel.moors@vub.be>
Date: Mon, 30 Dec 2024 21:01:48 +0100
Subject: [PATCH 2/7] import run_before

---
 eessi/testsuite/eessi_mixin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eessi/testsuite/eessi_mixin.py b/eessi/testsuite/eessi_mixin.py
index 8753c8e1..cde5fbf0 100644
--- a/eessi/testsuite/eessi_mixin.py
+++ b/eessi/testsuite/eessi_mixin.py
@@ -1,4 +1,4 @@
-from reframe.core.builtins import parameter, run_after, variable
+from reframe.core.builtins import parameter, run_after, run_before, variable
 from reframe.core.exceptions import ReframeFatalError
 from reframe.core.pipeline import RegressionMixin
 from reframe.utility.sanity import make_performance_function

From 21de9d96279683cceefb6e7434b1992d0e3d5d6f Mon Sep 17 00:00:00 2001
From: Samuel Moors <samuel.moors@vub.be>
Date: Mon, 30 Dec 2024 22:03:26 +0100
Subject: [PATCH 3/7] initialize compute_unit in base class

---
 eessi/testsuite/tests/apps/osu.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py
index 23ab1803..5dbcaa9c 100644
--- a/eessi/testsuite/tests/apps/osu.py
+++ b/eessi/testsuite/tests/apps/osu.py
@@ -47,6 +47,7 @@ class EESSI_OSU_Base(osu_benchmark, EESSI_Mixin):
     time_limit = '30m'
     module_name = parameter(find_modules('OSU-Micro-Benchmarks'))
     device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]])
+    compute_unit = None
 
     # reset num_tasks_per_node from the hpctestlib: we handle it ourselves
     num_tasks_per_node = None

From 06e808af5f9b20a68a50e41b84af5f3271c2c337 Mon Sep 17 00:00:00 2001
From: Samuel Moors <samuel.moors@vub.be>
Date: Mon, 30 Dec 2024 22:14:46 +0100
Subject: [PATCH 4/7] don't inherit from mixin in base class

---
 eessi/testsuite/tests/apps/osu.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py
index 5dbcaa9c..ef9fd121 100644
--- a/eessi/testsuite/tests/apps/osu.py
+++ b/eessi/testsuite/tests/apps/osu.py
@@ -41,13 +41,11 @@ def filter_scales_coll():
     ]
 
 
-@rfm.simple_test
-class EESSI_OSU_Base(osu_benchmark, EESSI_Mixin):
+class EESSI_OSU_Base(osu_benchmark):
     """ base class for OSU tests """
     time_limit = '30m'
     module_name = parameter(find_modules('OSU-Micro-Benchmarks'))
     device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]])
-    compute_unit = None
 
     # reset num_tasks_per_node from the hpctestlib: we handle it ourselves
     num_tasks_per_node = None
@@ -97,7 +95,7 @@ def skip_test_1gpu(self):
 
 
 @rfm.simple_test
-class EESSI_OSU_Micro_Benchmarks_pt2pt(EESSI_OSU_Base):
+class EESSI_OSU_Micro_Benchmarks_pt2pt(EESSI_OSU_Base, EESSI_Mixin):
     ''' point-to-point OSU test '''
     scale = parameter(filter_scales_pt2pt())
     compute_unit = COMPUTE_UNIT[NODE]
@@ -132,7 +130,7 @@ def adjust_executable_opts(self):
 
 
 @rfm.simple_test
-class EESSI_OSU_Micro_Benchmarks_coll(EESSI_OSU_Base):
+class EESSI_OSU_Micro_Benchmarks_coll(EESSI_OSU_Base, EESSI_Mixin):
     ''' collective OSU test '''
     scale = parameter(filter_scales_coll())
 

From e905e28de412987736740db1c7d4824dc4eb2424 Mon Sep 17 00:00:00 2001
From: Samuel Moors <samuel.moors@vub.be>
Date: Thu, 9 Jan 2025 17:22:26 +0100
Subject: [PATCH 5/7] use 1GB memory per task

---
 eessi/testsuite/tests/apps/osu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py
index ef9fd121..8266fed2 100644
--- a/eessi/testsuite/tests/apps/osu.py
+++ b/eessi/testsuite/tests/apps/osu.py
@@ -56,7 +56,7 @@ class EESSI_OSU_Base(osu_benchmark):
     num_iters = 10
 
     def required_mem_per_node(self):
-        return 1024
+        return self.num_tasks_per_node * 1024
 
     @run_after('init')
     def filter_scales_2gpus(self):

From f27ba00b707d815dc33f714dff916abb9e202865 Mon Sep 17 00:00:00 2001
From: Samuel Moors <samuel.moors@vub.be>
Date: Sun, 12 Jan 2025 13:11:04 +0100
Subject: [PATCH 6/7] add partial node scales to the pt2pt gpu tests

---
 eessi/testsuite/tests/apps/osu.py | 61 ++++++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 14 deletions(-)

diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py
index 8266fed2..3ea452f1 100644
--- a/eessi/testsuite/tests/apps/osu.py
+++ b/eessi/testsuite/tests/apps/osu.py
@@ -16,9 +16,9 @@
 from eessi.testsuite.utils import find_modules, log
 
 
-def filter_scales_pt2pt():
+def filter_scales_pt2pt_cpu():
     """
-    Filtering function for filtering scales for the pt2pt OSU test
+    Filtering function for filtering scales for the pt2pt OSU test on CPUs
     returns all scales with either 2 cores, 1 full node, or 2 full nodes
     """
     return [
@@ -29,6 +29,19 @@ def filter_scales_pt2pt():
     ]
 
 
+def filter_scales_pt2pt_gpu():
+    """
+    Select the scales on which the pt2pt OSU test is run on GPUs
+    returns the names of all scales that span a partial node, 1 full node, or 2 full nodes
+    """
+    full_node = 1  # node_part value of a full node
+    return [
+        name for (name, spec) in SCALES.items()
+        if (spec['num_nodes'] == 1 and spec.get('node_part', 0) >= full_node)
+        or (spec['num_nodes'] == 2 and spec.get('node_part', 0) == full_node)
+    ]
+
+
 def filter_scales_coll():
     """
     Filtering function for filtering scales for the collective OSU test
@@ -45,7 +58,6 @@ class EESSI_OSU_Base(osu_benchmark):
     """ base class for OSU tests """
     time_limit = '30m'
     module_name = parameter(find_modules('OSU-Micro-Benchmarks'))
-    device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]])
 
     # reset num_tasks_per_node from the hpctestlib: we handle it ourselves
     num_tasks_per_node = None
@@ -87,17 +99,9 @@ def set_tags(self):
         self.bench_name = self.benchmark_info[0]
         self.tags.add(self.bench_name.split('.')[-1])
 
-    @run_after('setup', always_last=True)
-    def skip_test_1gpu(self):
-        if self.device_type == DEVICE_TYPES[GPU]:
-            num_gpus = self.num_gpus_per_node * self.num_nodes
-            self.skip_if(num_gpus < 2, "Skipping GPU test : only 1 GPU available for this test case")
-
 
-@rfm.simple_test
-class EESSI_OSU_Micro_Benchmarks_pt2pt(EESSI_OSU_Base, EESSI_Mixin):
-    ''' point-to-point OSU test '''
-    scale = parameter(filter_scales_pt2pt())
+class EESSI_OSU_pt2pt_Base(EESSI_OSU_Base):
+    ''' point-to-point OSU test base class '''
     compute_unit = COMPUTE_UNIT[NODE]
 
     @run_after('init')
@@ -130,9 +134,32 @@ def adjust_executable_opts(self):
 
 
 @rfm.simple_test
-class EESSI_OSU_Micro_Benchmarks_coll(EESSI_OSU_Base, EESSI_Mixin):
+class EESSI_OSU_pt2pt_CPU(EESSI_OSU_pt2pt_Base, EESSI_Mixin):
+    ''' point-to-point OSU test on CPUs'''
+    scale = parameter(filter_scales_pt2pt_cpu())
+    device_type = DEVICE_TYPES[CPU]
+
+
+@rfm.simple_test
+class EESSI_OSU_pt2pt_GPU(EESSI_OSU_pt2pt_Base, EESSI_Mixin):
+    ''' point-to-point OSU test on GPUs'''
+    scale = parameter(filter_scales_pt2pt_gpu())
+    device_type = DEVICE_TYPES[GPU]
+
+    @run_after('setup')
+    def skip_test_1gpu(self):
+        num_gpus = self.num_gpus_per_node * self.num_nodes
+        self.skip_if(
+            num_gpus != 2 and self.scale not in ['1_node', '2_nodes'],
+            f"Skipping test : {num_gpus} GPU(s) available for this test case, need exactly 2"
+        )
+
+
+@rfm.simple_test
+class EESSI_OSU_coll(EESSI_OSU_Base, EESSI_Mixin):
     ''' collective OSU test '''
     scale = parameter(filter_scales_coll())
+    device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]])
 
     @run_after('init')
     def filter_benchmark_coll(self):
@@ -157,3 +184,9 @@ def set_compute_unit(self):
             DEVICE_TYPES[GPU]: COMPUTE_UNIT[GPU],
         }
         self.compute_unit = device_to_compute_unit.get(self.device_type)
+
+    @run_after('setup')
+    def skip_test_1gpu(self):
+        """ Skip GPU runs of this test for which fewer than 2 GPUs are requested in total """
+        too_few_gpus = self.device_type == DEVICE_TYPES[GPU] and self.num_gpus_per_node * self.num_nodes < 2
+        self.skip_if(too_few_gpus, "Skipping GPU test : only 1 GPU available for this test case")

From c33114e88b962a602ef747cdc108c64de642fd5f Mon Sep 17 00:00:00 2001
From: Samuel Moors <samuel.moors@vub.be>
Date: Sun, 19 Jan 2025 15:57:53 +0100
Subject: [PATCH 7/7] always request gpus for the gpu-only pt2pt test

---
 eessi/testsuite/eessi_mixin.py    |  1 +
 eessi/testsuite/hooks.py          |  7 ++++++-
 eessi/testsuite/tests/apps/osu.py | 16 +++++++++++-----
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/eessi/testsuite/eessi_mixin.py b/eessi/testsuite/eessi_mixin.py
index 3dd5b224..feacfb96 100644
--- a/eessi/testsuite/eessi_mixin.py
+++ b/eessi/testsuite/eessi_mixin.py
@@ -45,6 +45,7 @@ class EESSI_Mixin(RegressionMixin):
     bench_name = None
     bench_name_ci = None
     num_tasks_per_compute_unit = 1
+    always_request_gpus = None
 
     # Create ReFrame variables for logging runtime environment information
     cvmfs_repo_name = variable(str, value='None')
diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py
index caa81fa7..8e4eb459 100644
--- a/eessi/testsuite/hooks.py
+++ b/eessi/testsuite/hooks.py
@@ -57,6 +57,8 @@ def _assign_default_num_gpus_per_node(test: rfm.RegressionTest):
         # no default set yet, so setting one
         test.default_num_gpus_per_node = math.ceil(test.max_avail_gpus_per_node / test.node_part)
 
+    log(f'default_num_gpus_per_node set to {test.default_num_gpus_per_node}')
+
 
 def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
     """
@@ -83,6 +85,8 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, n
     - assign_tasks_per_compute_unit(test, COMPUTE_UNIT[CPU_SOCKET]) will launch 2 tasks with 64 threads per task
 
     """
+    log(f'assign_tasks_per_compute_unit called with compute_unit: {compute_unit} and num_per: {num_per}')
+
     if num_per != 1 and compute_unit not in [COMPUTE_UNIT[NODE]]:
         raise NotImplementedError(
             f'Non-default num_per {num_per} is not implemented for compute_unit {compute_unit}.')
@@ -713,7 +717,8 @@ def _check_always_request_gpus(test: rfm.RegressionTest):
     """
     Make sure we always request enough GPUs if required for the current GPU partition (cluster-specific policy)
     """
-    if FEATURES[ALWAYS_REQUEST_GPUS] in test.current_partition.features and not test.num_gpus_per_node:
+    policy = FEATURES[ALWAYS_REQUEST_GPUS] in test.current_partition.features  # cluster-specific partition policy
+    if (policy or getattr(test, 'always_request_gpus', False)) and not test.num_gpus_per_node:  # attr is mixin-only
         test.num_gpus_per_node = test.default_num_gpus_per_node
         log(f'num_gpus_per_node set to {test.num_gpus_per_node} for partition {test.current_partition.name}')
 
diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py
index 3ea452f1..8e3ba85e 100644
--- a/eessi/testsuite/tests/apps/osu.py
+++ b/eessi/testsuite/tests/apps/osu.py
@@ -145,14 +145,20 @@ class EESSI_OSU_pt2pt_GPU(EESSI_OSU_pt2pt_Base, EESSI_Mixin):
     ''' point-to-point OSU test on GPUs'''
     scale = parameter(filter_scales_pt2pt_gpu())
     device_type = DEVICE_TYPES[GPU]
+    always_request_gpus = True
 
     @run_after('setup')
-    def skip_test_1gpu(self):
+    def skip_test_gpus(self):
         num_gpus = self.num_gpus_per_node * self.num_nodes
-        self.skip_if(
-            num_gpus != 2 and self.scale not in ['1_node', '2_nodes'],
-            f"Skipping test : {num_gpus} GPU(s) available for this test case, need exactly 2"
-        )
+        if self.scale not in ['1_node', '2_nodes']:
+            # On a partial node allocation, run this test only if exactly 2 GPUs are allocated
+            self.skip_if(
+                num_gpus != 2,
+                f"Skipping test : {num_gpus} GPU(s) available for this test case, need exactly 2"
+            )
+        elif self.scale == '1_node':
+            # Make sure there are at least 2 GPUs
+            self.skip_if(num_gpus < 2, "Skipping GPU test : only 1 GPU available for this test case")
 
 
 @rfm.simple_test