Merge branch 'main' of github.com:EESSI/test-suite
Caspar van Leeuwen committed Sep 21, 2024
2 parents fbfe08d + a38b593 commit 5020460
Showing 10 changed files with 86 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/scorecards.yml
@@ -41,7 +41,7 @@ jobs:
persist-credentials: false

- name: "Run analysis"
uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6
uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
with:
results_file: results.sarif
results_format: sarif
1 change: 0 additions & 1 deletion .github/workflows/test.yml
@@ -36,7 +36,6 @@ jobs:
# update $PYTHONPATH so 'import eessi.testsuite.utils' works
export PYTHONPATH=$PWD:$PYTHONPATH
echo $PYTHONPATH
python -c 'import eessi.testsuite.utils'
# show active ReFrame configuration,
# enable verbose output to help expose problems with configuration file (if any)
1 change: 1 addition & 0 deletions CI/README.md
@@ -36,6 +36,7 @@ It should define:
- `RFM_CHECK_SEARCH_PATH` (optional): the search path where ReFrame should search for tests to run in this CI pipeline. Default: `${TEMPDIR}/test-suite/eessi/testsuite/tests/`.
- `RFM_CHECK_SEARCH_RECURSIVE` (optional): whether ReFrame should search `RFM_CHECK_SEARCH_PATH` recursively. Default: `1`.
- `RFM_PREFIX` (optional): the prefix in which ReFrame stores all the files. Default: `${HOME}/reframe_CI_runs`.
- `REFRAME_TIMEOUT` (optional): the DURATION passed to the Unix `timeout` command. If the `reframe` command runs for longer than this, it is killed with SIGTERM, and the ReFrame runtime then cancels all scheduled (and running) jobs. This can be used to make sure jobs don't pile up, e.g. if the test suite runs daily but takes longer than one day to process all jobs (see the sketch below).
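
For example, the CI environment file could cap a daily run just under 24 hours (a sketch; the value mirrors the default added in `CI/run_reframe.sh` below):

```bash
# kill the `reframe` run after 23h50m, leaving slack before the next daily run
export REFRAME_TIMEOUT=1430m
```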

## Creating the `crontab` entry and specifying `EESSI_CI_SYSTEM_NAME`
This line depends on how often you want to run the tests, and on where exactly `run_reframe_wrapper.sh` is located. We also define `EESSI_CI_SYSTEM_NAME` in this entry, as cron jobs don't normally read your `.bashrc` (so we need a different way of specifying this environment variable).
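
A minimal sketch of such an entry (schedule, system name, and wrapper path are illustrative):

```bash
# run the EESSI test suite daily at 02:00; cron passes the command to sh,
# so the inline variable assignment and $HOME both work
0 2 * * * EESSI_CI_SYSTEM_NAME=aws_mc $HOME/test-suite/CI/run_reframe_wrapper.sh
```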
17 changes: 13 additions & 4 deletions CI/run_reframe.sh
@@ -68,6 +68,11 @@ fi
if [ -z "${RFM_PREFIX}" ]; then
export RFM_PREFIX="${HOME}/reframe_CI_runs"
fi
if [ -z "${REFRAME_TIMEOUT}" ]; then
# 10 minutes short of 1 day, since typically the test suite will be run daily.
# This will prevent multiple ReFrame runs from piling up and exceeding the quota on our Magic Castle clusters
export REFRAME_TIMEOUT=1430m
fi

# Create virtualenv for ReFrame using system python
python3 -m venv "${TEMPDIR}"/reframe_venv
@@ -76,11 +81,15 @@ python3 -m pip install --upgrade pip
python3 -m pip install reframe-hpc=="${REFRAME_VERSION}"

# Clone reframe repo to have the hpctestlib:
git clone "${REFRAME_URL}" --branch "${REFRAME_BRANCH}" "${TEMPDIR}"/reframe
REFRAME_CLONE_ARGS="${REFRAME_URL} --branch ${REFRAME_BRANCH} ${TEMPDIR}/reframe"
echo "Cloning ReFrame repo: git clone ${REFRAME_CLONE_ARGS}"
git clone ${REFRAME_CLONE_ARGS}
export PYTHONPATH="${PYTHONPATH}":"${TEMPDIR}"/reframe

# Clone test suite repo
git clone "${EESSI_TESTSUITE_URL}" --branch "${EESSI_TESTSUITE_BRANCH}" "${TEMPDIR}"/test-suite
EESSI_CLONE_ARGS="${EESSI_TESTSUITE_URL} --branch ${EESSI_TESTSUITE_BRANCH} ${TEMPDIR}/test-suite"
echo "Cloning EESSI repo: git clone ${EESSI_CLONE_ARGS}"
git clone ${EESSI_CLONE_ARGS}
export PYTHONPATH="${PYTHONPATH}":"${TEMPDIR}"/test-suite/

# Start the EESSI environment
@@ -100,7 +109,7 @@ echo ""
echo "TEMPDIR: ${TEMPDIR}"
echo "PYTHONPATH: ${PYTHONPATH}"
echo "EESSI test suite URL: ${EESSI_TESTSUITE_URL}"
echo "EESSI test suite version: ${EESSI_TESTSUITE_VERSION}"
echo "EESSI test suite version: ${EESSI_TESTSUITE_BRANCH}"
echo "HPCtestlib from ReFrame URL: ${REFRAME_URL}"
echo "HPCtestlib from ReFrame branch: ${REFRAME_BRANCH}"
echo "ReFrame executable: $(which reframe)"
@@ -118,7 +127,7 @@ reframe ${REFRAME_ARGS} --list

# Run
echo "Run tests:"
reframe ${REFRAME_ARGS} --run
timeout -v --preserve-status -s SIGTERM ${REFRAME_TIMEOUT} reframe ${REFRAME_ARGS} --run

# Cleanup
rm -rf "${TEMPDIR}"
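
The `timeout` wrapper added above can be sanity-checked in isolation. With `--preserve-status`, the exit code reflects the signal that killed the command (128 + 15 for SIGTERM) rather than `timeout`'s own 124, and `-v` reports which signal was sent (a minimal sketch):

```bash
# `sleep 10` is killed after 2 seconds; stderr shows the signal sent
timeout -v --preserve-status -s SIGTERM 2s sleep 10
echo "exit code: $?"  # prints 143 (128 + SIGTERM), not timeout's default 124
```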
6 changes: 6 additions & 0 deletions config/aws_mc.py
@@ -105,6 +105,12 @@
# steps inherit environment. It doesn't hurt to define this even if srun is not used
'export SLURM_EXPORT_ENV=ALL'
],
'resources': [
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'extras': {
# Node types have somewhat varying amounts of memory, but we'll make it easy on ourselves
# All should _at least_ have this amount (30GB * 1E9 / (1024*1024) = 28610 MiB)
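
With a `memory` resource defined, ReFrame fills the `{size}` placeholder from a test's `extra_resources` and passes the resulting option to the scheduler. A minimal sketch of the expansion (values illustrative):

```python
# Mirrors how ReFrame formats resource option templates with the values a
# test supplies via extra_resources = {'memory': {'size': ...}}
options = ['--mem={size}']
request = {'size': '28610M'}
print([opt.format(**request) for opt in options])  # ['--mem=28610M']
```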
36 changes: 26 additions & 10 deletions config/azure_mc.py
@@ -33,11 +33,32 @@
'name': 'x86_64-amd-zen4-node',
'access': ['--partition=x86-64-amd-zen4-node', '--export=NONE'],
'descr': 'Zen4, 16 cores, 30 GB',
'prepare_cmds': [
'export EESSI_SOFTWARE_SUBDIR_OVERRIDE=x86_64/amd/zen4',
common_eessi_init(),
# Required when using srun as launcher with --export=NONE in partition access,
# in order to ensure job steps inherit environment. It doesn't hurt to define
# this even if srun is not used
'export SLURM_EXPORT_ENV=ALL'
],
'extras': {
'mem_per_node': 768000
},
},
{
'name': 'aarch64-neoverse-N1-16c-62gb',
'access': ['--partition=aarch64-neoverse-n1-node', '--export=NONE'],
'descr': 'Neoverse N1, 16 cores, 62 GiB',
'prepare_cmds': [
common_eessi_init(),
# Required when using srun as launcher with --export=NONE in partition access,
# in order to ensure job steps inherit environment. It doesn't hurt to define
# this even if srun is not used
'export SLURM_EXPORT_ENV=ALL'
],
'extras': {
'mem_per_node': 64000
},
},
]
},
@@ -69,17 +90,12 @@
'features': [
FEATURES['CPU']
] + list(SCALES.keys()),
'prepare_cmds': [
common_eessi_init(),
# Required when using srun as launcher with --export=NONE in partition access, in order to ensure job
# steps inherit environment. It doesn't hurt to define this even if srun is not used
'export SLURM_EXPORT_ENV=ALL'
'resources': [
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'extras': {
# Node types have strongly varying amounts of memory, but we'll make it easy on ourselves
# All should _at least_ have this amount
'mem_per_node': 64000
},
}
for system in site_configuration['systems']:
for partition in system['partitions']:
6 changes: 6 additions & 0 deletions config/it4i_karolina.py
@@ -60,6 +60,12 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'resources': [
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
8 changes: 8 additions & 0 deletions config/settings_example.py
@@ -56,6 +56,11 @@
'options': ['--mem={size}'],
}
],
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 229376 # in MiB
},
# list(SCALES.keys()) adds all the scales from eessi.testsuite.constants as valid for this partition
# Can be modified if not all scales can run on this partition, see e.g. the surf_snellius.py config
'features': [FEATURES[CPU]] + list(SCALES.keys()),
@@ -98,6 +103,9 @@
FEATURES[GPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 229376, # in MiB
GPU_VENDOR: GPU_VENDORS[NVIDIA],
},
},
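
The "round down" advice in these comments matters because nodes advertise memory in base-10 GB while Slurm's `--mem` works in MiB. A sketch of the conversion used in the `aws_mc.py` comment above (the 240 GB node is hypothetical):

```python
import math

def mem_per_node_mib(mem_gb: float) -> int:
    """Convert advertised node memory (GB, base-10) to MiB, rounding down
    so a job never requests more memory than the node actually has."""
    return math.floor(mem_gb * 1e9 / (1024 * 1024))

print(mem_per_node_mib(30))   # 28610, matching the aws_mc.py comment
print(mem_per_node_mib(240))  # 228881 for a hypothetical 240 GB node
```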
14 changes: 7 additions & 7 deletions eessi/testsuite/common_config.py
@@ -1,5 +1,6 @@
import os
import warnings

import reframe.core.logging as rlog

perflog_format = '|'.join([
'%(check_job_completion_time)s',
@@ -96,12 +97,11 @@ def common_eessi_init(eessi_version=None):
eessi_cvmfs_repo = os.getenv('EESSI_CVMFS_REPO', None)

if eessi_cvmfs_repo is None:
warn_msg = '\n' + '\n'.join([
"EESSI WARNING: Environment variable 'EESSI_CVMFS_REPO' was not found.",
"EESSI WARNING: If you do not intend to use the EESSI software stack, this is perfectly fine.",
"EESSI WARNING: To use EESSI, initialize the EESSI environment before running the test suite.",
])
warnings.warn(warn_msg)
rlog.getlogger().warning('\n'.join([
"Environment variable 'EESSI_CVMFS_REPO' is not defined.",
"If you do not intend to use the EESSI software stack, this is perfectly fine.",
"To use EESSI, initialize the EESSI environment before running the test suite.",
]))
return ''

eessi_init = []
18 changes: 18 additions & 0 deletions eessi/testsuite/hooks.py
@@ -525,6 +525,24 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
msg += f" but {app_mem_req} MiB is needed"
test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg)

# Check if a resource with the name 'memory' was set in the ReFrame config file. If not, warn the user
# and return from this hook (as setting test.extra_resources will be ignored in that case according to
# https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#reframe.core.pipeline.RegressionTest.extra_resources
if 'memory' not in test.current_partition.resources:
logger = rflog.getlogger()
msg = "Your ReFrame configuration file does not specify any resource called 'memory' for this partition "
msg += f" ({test.current_partition.name})."
msg += " Without this, an explicit memory request cannot be made from the scheduler. This test will run,"
msg += " but it may result in an out of memory error."
msg += " Please specify how to requst memory (per node) from your resource scheduler by defining a resource"
msg += " 'memory' in your ReFrame configuration file for this partition."
msg += " For a SLURM system, one would e.g. define:"
msg += " 'resources': [{'name': 'memory', 'options': ['--mem={size}']}]"
logger.warning(msg)
# We return, as setting a test.extra_resources is pointless - it would be ignored anyway
# This way, we also don't add any lines to the log that a specific amount of memory was requested
return

# Compute what is higher: the requested memory, or the memory available proportional to requested CPUs
# Fraction of CPU cores requested
check_proc_attribute_defined(test, 'num_cpus')
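
A sketch of how a test would call this hook (the hook and its signature are from the diff above; the test body is illustrative):

```python
import reframe as rfm
import reframe.utility.sanity as sn

from eessi.testsuite import hooks


@rfm.simple_test
class MemoryHookExample(rfm.RunOnlyRegressionTest):
    valid_systems = ['*']
    valid_prog_environs = ['*']
    executable = 'true'
    sanity_patterns = sn.assert_true(True)

    @run_after('setup')
    def request_memory(self):
        # Ask for 4096 MiB per node. With a 'memory' resource configured this
        # becomes a scheduler request (e.g. --mem); without one, the hook now
        # logs a warning and returns instead of silently doing nothing.
        hooks.req_memory_per_node(self, app_mem_req=4096)
```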
