From 3b45e609c2aba4ca83c47b9f9ed4af5cba109970 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 4 Sep 2024 20:49:15 +0200 Subject: [PATCH 01/14] Make sure that zen4 is actually used in the test runs --- config/azure_mc.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/config/azure_mc.py b/config/azure_mc.py index 3afcbddb..471e4ea4 100644 --- a/config/azure_mc.py +++ b/config/azure_mc.py @@ -33,11 +33,30 @@ 'name': 'x86_64-amd-zen4-node', 'access': ['--partition=x86-64-amd-zen4-node', '--export=NONE'], 'descr': 'Zen4, 16 cores, 30 GB', - }, + 'prepare_cmds': [ + 'export EESSI_SOFTWARE_SUBDIR_OVERRIDE=x86_64/amd/zen4', + common_eessi_init(), + # Required when using srun as launcher with --export=NONE in partition access, in order to ensure job + # steps inherit environment. It doesn't hurt to define this even if srun is not used + 'export SLURM_EXPORT_ENV=ALL' + ] + 'extras': { + 'mem_per_node': 768000 + }, + }, { 'name': 'aarch64-neoverse-N1-16c-62gb', 'access': ['--partition=aarch64-neoverse-n1-node', '--export=NONE'], 'descr': 'Neoverse N1, 16 cores, 62 GiB', + 'prepare_cmds': [ + common_eessi_init(), + # Required when using srun as launcher with --export=NONE in partition access, in order to ensure job + # steps inherit environment. It doesn't hurt to define this even if srun is not used + 'export SLURM_EXPORT_ENV=ALL' + ], + 'extras': { + 'mem_per_node': 64000 + }, }, ] }, @@ -69,17 +88,6 @@ 'features': [ FEATURES['CPU'] ] + list(SCALES.keys()), - 'prepare_cmds': [ - common_eessi_init(), - # Required when using srun as launcher with --export=NONE in partition access, in order to ensure job - # steps inherit environment. It doesn't hurt to define this even if srun is not used - 'export SLURM_EXPORT_ENV=ALL' - ], - 'extras': { - # Node types have strongly varying amounts of memory, but we'll make it easy on ourselves - # All should _at least_ have this amount - 'mem_per_node': 64000 - }, } for system in site_configuration['systems']: for partition in system['partitions']: From 1974922ead486544e62fae239ff5977f74210dfb Mon Sep 17 00:00:00 2001 From: Sam Moors Date: Thu, 5 Sep 2024 08:35:20 +0200 Subject: [PATCH 02/14] add missing comma --- config/azure_mc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/azure_mc.py b/config/azure_mc.py index 471e4ea4..14d8a772 100644 --- a/config/azure_mc.py +++ b/config/azure_mc.py @@ -39,7 +39,7 @@ # Required when using srun as launcher with --export=NONE in partition access, in order to ensure job # steps inherit environment. It doesn't hurt to define this even if srun is not used 'export SLURM_EXPORT_ENV=ALL' - ] + ], 'extras': { 'mem_per_node': 768000 }, From f1955442db5770128405e26ec13cb942544f6bae Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 5 Sep 2024 11:41:12 +0200 Subject: [PATCH 03/14] Fix style issues --- config/azure_mc.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/config/azure_mc.py b/config/azure_mc.py index 471e4ea4..2058b87c 100644 --- a/config/azure_mc.py +++ b/config/azure_mc.py @@ -36,12 +36,13 @@ 'prepare_cmds': [ 'export EESSI_SOFTWARE_SUBDIR_OVERRIDE=x86_64/amd/zen4', common_eessi_init(), - # Required when using srun as launcher with --export=NONE in partition access, in order to ensure job - # steps inherit environment. It doesn't hurt to define this even if srun is not used + # Required when using srun as launcher with --export=NONE in partition access, + # in order to ensure job steps inherit environment. It doesn't hurt to define + # this even if srun is not used 'export SLURM_EXPORT_ENV=ALL' ] 'extras': { - 'mem_per_node': 768000 + 'mem_per_node': 768000 }, }, { @@ -50,12 +51,13 @@ 'descr': 'Neoverse N1, 16 cores, 62 GiB', 'prepare_cmds': [ common_eessi_init(), - # Required when using srun as launcher with --export=NONE in partition access, in order to ensure job - # steps inherit environment. It doesn't hurt to define this even if srun is not used + # Required when using srun as launcher with --export=NONE in partition access, + # in order to ensure job steps inherit environment. It doesn't hurt to define + # this even if srun is not used 'export SLURM_EXPORT_ENV=ALL' ], 'extras': { - 'mem_per_node': 64000 + 'mem_per_node': 64000 }, }, ] From 747fe67a76fb4508062cec1cb9c9bf6ac8dc23af Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 5 Sep 2024 11:46:25 +0200 Subject: [PATCH 04/14] Fix more style issues --- config/azure_mc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/azure_mc.py b/config/azure_mc.py index 949c8077..f8b9ccd1 100644 --- a/config/azure_mc.py +++ b/config/azure_mc.py @@ -43,8 +43,8 @@ ], 'extras': { 'mem_per_node': 768000 - }, - }, + }, + }, { 'name': 'aarch64-neoverse-N1-16c-62gb', 'access': ['--partition=aarch64-neoverse-n1-node', '--export=NONE'], @@ -58,7 +58,7 @@ ], 'extras': { 'mem_per_node': 64000 - }, + }, }, ] }, From 180e1380648fcb3ea7b8b83e49f8b18598f15320 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Mon, 9 Sep 2024 13:50:13 +0200 Subject: [PATCH 05/14] use reframe warning --- eessi/testsuite/common_config.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/eessi/testsuite/common_config.py b/eessi/testsuite/common_config.py index 94a6849e..e3298506 100644 --- a/eessi/testsuite/common_config.py +++ b/eessi/testsuite/common_config.py @@ -1,5 +1,6 @@ import os -import warnings + +import reframe.core.logging as rlog perflog_format = '|'.join([ '%(check_job_completion_time)s', @@ -96,12 +97,11 @@ def common_eessi_init(eessi_version=None): eessi_cvmfs_repo = os.getenv('EESSI_CVMFS_REPO', None) if eessi_cvmfs_repo is None: - warn_msg = '\n' + '\n'.join([ - "EESSI WARNING: Environment variable 'EESSI_CVMFS_REPO' was not found.", - "EESSI WARNING: If you do not intend to use the EESSI software stack, this is perfectly fine.", - "EESSI WARNING: To use EESSI, initialize the EESSI environment before running the test suite.", - ]) - warnings.warn(warn_msg) + rlog.getlogger().warning('\n'.join([ + "Environment variable 'EESSI_CVMFS_REPO' is not defined.", + "If you do not intend to use the EESSI software stack, this is perfectly fine.", + "To use EESSI, initialize the EESSI environment before running the test suite.", + ])) return '' eessi_init = [] From 8dd0689d786897c9274361087dff9a9663fea86c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 17 Sep 2024 13:26:26 +0200 Subject: [PATCH 06/14] Make sure ReFrame command only runs for 24h minus 10 minutes. That way, if it's on a daily schedule, jobs will never overlap, as this caused us to exceed vCPU limits on AWS --- CI/run_reframe.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CI/run_reframe.sh b/CI/run_reframe.sh index 497c2a9b..3341146e 100755 --- a/CI/run_reframe.sh +++ b/CI/run_reframe.sh @@ -68,6 +68,11 @@ fi if [ -z "${RFM_PREFIX}" ]; then export RFM_PREFIX="${HOME}/reframe_CI_runs" fi +if [ -z "${REFRAME_TIMEOUT}" ]; then + # 10 minutes short of 1 day, since typically the test suite will be run daily. + # This will prevent multiple ReFrame runs from piling up and exceeding the quota on our Magic Castle clusters + export REFRAME_TIMEOUT=1430m +fi # Create virtualenv for ReFrame using system python python3 -m venv "${TEMPDIR}"/reframe_venv @@ -118,7 +123,7 @@ reframe ${REFRAME_ARGS} --list # Run echo "Run tests:" -reframe ${REFRAME_ARGS} --run +timeout -v --preserve-status -s SIGTERM ${REFRAME_TIMEOUT} reframe ${REFRAME_ARGS} --run # Cleanup rm -rf "${TEMPDIR}" From 57ed3de6aadad31dac66804c5a6c0ba922a878da Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 17 Sep 2024 13:30:34 +0200 Subject: [PATCH 07/14] Document new variable --- CI/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CI/README.md b/CI/README.md index 161c4a48..4ee16a50 100644 --- a/CI/README.md +++ b/CI/README.md @@ -36,6 +36,7 @@ It should define: - `RFM_CHECK_SEARCH_PATH` (optional): the search path where ReFrame should search for tests to run in this CI pipeline. Default: `${TEMPDIR}/test-suite/eessi/testsuite/tests/`. - `RFM_CHECK_SEARCH_RECURSIVE` (optional): whether ReFrame should search `RFM_CHECK_SEARCH_PATH` recursively. Default: `1`. - `RFM_PREFIX` (optional): the prefix in which ReFrame stores all the files. Default: `${HOME}/reframe_CI_runs`. +- `REFRAME_TIMEOUT` (optional): DURATION as passed to the `timeout` command in Unix. If the `reframe` commands runs for longer than this, it will be killed by SIGTERM. The ReFrame runtime will then cancel all scheduled (and running) jobs. Can be used to make sure jobs don't pile up, e.g. if the test suite runs daily, but it takes longer than one day to process all jobs. ## Creating the `crontab` entry and specifying `EESSI_CI_SYSTEM_NAME` This line depends on how often you want to run the tests, and where the `run_reframe_wrapper.sh` is located exactly. We also define the EESSI_CI_SYSTEM_NAME in this entry, as cronjobs don't normally read your `.bashrc` (and thus we need a different way of specifying this environment variable). From 9ef6a625419c991eaba3d8e115d0dafd3f372c10 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 17 Sep 2024 15:12:04 +0200 Subject: [PATCH 08/14] Make git clone commands more verbose, so that it is clearer what is being used in terms of version --- CI/run_reframe.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/CI/run_reframe.sh b/CI/run_reframe.sh index 3341146e..b50bd8c8 100755 --- a/CI/run_reframe.sh +++ b/CI/run_reframe.sh @@ -81,11 +81,15 @@ python3 -m pip install --upgrade pip python3 -m pip install reframe-hpc=="${REFRAME_VERSION}" # Clone reframe repo to have the hpctestlib: -git clone "${REFRAME_URL}" --branch "${REFRAME_BRANCH}" "${TEMPDIR}"/reframe +REFRAME_CLONE_ARGS="${REFRAME_URL} --branch ${REFRAME_BRANCH} ${TEMPDIR}/reframe" +echo "Cloning ReFrame repo: git clone ${REFRAME_CLONE_ARGS}" +git clone ${REFRAME_CLONE_ARGS} export PYTHONPATH="${PYTHONPATH}":"${TEMPDIR}"/reframe # Clone test suite repo -git clone "${EESSI_TESTSUITE_URL}" --branch "${EESSI_TESTSUITE_BRANCH}" "${TEMPDIR}"/test-suite +EESSI_CLONE_ARGS="${EESSI_TESTSUITE_URL} --branch ${EESSI_TESTSUITE_BRANCH} ${TEMPDIR}/test-suite" +echo "Cloning EESSI repo: git clone ${EESSI_CLONE_ARGS}" +git clone ${EESSI_CLONE_ARGS} export PYTHONPATH="${PYTHONPATH}":"${TEMPDIR}"/test-suite/ # Start the EESSI environment @@ -105,7 +109,7 @@ echo "" echo "TEMPDIR: ${TEMPDIR}" echo "PYTHONPATH: ${PYTHONPATH}" echo "EESSI test suite URL: ${EESSI_TESTSUITE_URL}" -echo "EESSI test suite version: ${EESSI_TESTSUITE_VERSION}" +echo "EESSI test suite version: ${EESSI_TESTSUITE_BRANCH}" echo "HPCtestlib from ReFrame URL: ${REFRAME_URL}" echo "HPCtestlib from ReFrame branch: ${REFRAME_BRANCH}" echo "ReFrame executable: $(which reframe)" From 3bc0b311a9e31f0c87928519588b64280925f3a3 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 17 Sep 2024 15:43:47 +0200 Subject: [PATCH 09/14] Make sure that we set 'memory' as resource in all configs. This needs to be specified in order for memory to be requested by the hook --- config/aws_mc.py | 6 ++++++ config/azure_mc.py | 6 ++++++ config/it4i_karolina.py | 6 ++++++ config/settings_example.py | 10 ++++++++++ 4 files changed, 28 insertions(+) diff --git a/config/aws_mc.py b/config/aws_mc.py index f90600d1..1ff95d94 100644 --- a/config/aws_mc.py +++ b/config/aws_mc.py @@ -105,6 +105,12 @@ # steps inherit environment. It doesn't hurt to define this even if srun is not used 'export SLURM_EXPORT_ENV=ALL' ], + 'resources': [ + { + 'name': 'memory', + 'options': ['--mem={size}'], + } + ], 'extras': { # Node types have somewhat varying amounts of memory, but we'll make it easy on ourselves # All should _at least_ have this amount (30GB * 1E9 / (1024*1024) = 28610 MiB) diff --git a/config/azure_mc.py b/config/azure_mc.py index f8b9ccd1..5300fac9 100644 --- a/config/azure_mc.py +++ b/config/azure_mc.py @@ -90,6 +90,12 @@ 'features': [ FEATURES['CPU'] ] + list(SCALES.keys()), + 'resources': [ + { + 'name': 'memory', + 'options': ['--mem={size}'], + } + ], } for system in site_configuration['systems']: for partition in system['partitions']: diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py index c57bb889..f33e636c 100644 --- a/config/it4i_karolina.py +++ b/config/it4i_karolina.py @@ -60,6 +60,12 @@ 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'resources': [ + { + 'name': 'memory', + 'options': ['--mem={size}'], + } + ], 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node diff --git a/config/settings_example.py b/config/settings_example.py index 83fedcce..26173d82 100644 --- a/config/settings_example.py +++ b/config/settings_example.py @@ -56,6 +56,11 @@ 'options': ['--mem={size}'], } ], + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 229376 # in MiB + }, # list(SCALES.keys()) adds all the scales from eessi.testsuite.constants as valid for thi partition # Can be modified if not all scales can run on this partition, see e.g. the surf_snellius.py config 'features': [FEATURES[CPU]] + list(SCALES.keys()), @@ -87,6 +92,11 @@ 'options': ['--mem={size}'], } ], + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 229376 # in MiB + }, 'devices': [ { 'type': DEVICE_TYPES[GPU], From 095cc2bb3375e216a863e1b90bd591c8011b5e08 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 17 Sep 2024 16:00:28 +0200 Subject: [PATCH 10/14] Fix duplicate extras entry --- config/settings_example.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/config/settings_example.py b/config/settings_example.py index 26173d82..969453ba 100644 --- a/config/settings_example.py +++ b/config/settings_example.py @@ -92,11 +92,6 @@ 'options': ['--mem={size}'], } ], - 'extras': { - # Make sure to round down, otherwise a job might ask for more mem than is available - # per node - 'mem_per_node': 229376 # in MiB - }, 'devices': [ { 'type': DEVICE_TYPES[GPU], @@ -108,6 +103,9 @@ FEATURES[GPU], ] + list(SCALES.keys()), 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 229376, # in MiB GPU_VENDOR: GPU_VENDORS[NVIDIA], }, }, From bb8a09381e13e362803ce0c3c52e6b983d0d37e7 Mon Sep 17 00:00:00 2001 From: casparvl casparvl Date: Tue, 17 Sep 2024 14:19:43 +0000 Subject: [PATCH 11/14] Make hook print a warning --- eessi/testsuite/hooks.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 3211f273..2e9f247a 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -525,6 +525,25 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float): msg += f" but {app_mem_req} MiB is needed" test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg) + # Check if a resource with the name 'memory' was set in the ReFrame config file. If not, warn the user + # and return from this hook (as setting test.extra_resources will be ignored in that case according to + # https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#reframe.core.pipeline.RegressionTest.extra_resources + if 'memory' not in test.current_partition.resources: + logger = rflog.getlogger() + msg = "Your ReFrame configuration file does not specify any resource called 'memory' for this partition " + msg += f" ({test.current_partition.name})." + msg += "Without this, an explicit memory request cannot be made from the scheduler. This test will run" + msg += " but it may result in an out of memory error." + msg += "Please specify how to requst memory (per node) from your resource scheduler by defining a resource" + msg += "'memory' in your ReFrame configuration file for this partition." + msg += "For a SLURM system, one would e.g. define:" + msg += "'resources': [{'name': 'memory', 'options': ['--mem={size}']}]" + logger.warning(msg) + # We return, as setting a test.extra_resources is pointless - it would be ignored anyway + # This way, we also don't add any lines to the log that a specific amount of memory was requested + return + + # Compute what is higher: the requested memory, or the memory available proportional to requested CPUs # Fraction of CPU cores requested check_proc_attribute_defined(test, 'num_cpus') From 031758a61bfea422371c7bff579c13c84dd8c69b Mon Sep 17 00:00:00 2001 From: casparvl casparvl Date: Tue, 17 Sep 2024 19:40:19 +0000 Subject: [PATCH 12/14] Remove extra blank line --- eessi/testsuite/hooks.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 2e9f247a..ca09d999 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -532,18 +532,17 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float): logger = rflog.getlogger() msg = "Your ReFrame configuration file does not specify any resource called 'memory' for this partition " msg += f" ({test.current_partition.name})." - msg += "Without this, an explicit memory request cannot be made from the scheduler. This test will run" + msg += " Without this, an explicit memory request cannot be made from the scheduler. This test will run," msg += " but it may result in an out of memory error." - msg += "Please specify how to requst memory (per node) from your resource scheduler by defining a resource" - msg += "'memory' in your ReFrame configuration file for this partition." - msg += "For a SLURM system, one would e.g. define:" - msg += "'resources': [{'name': 'memory', 'options': ['--mem={size}']}]" + msg += " Please specify how to requst memory (per node) from your resource scheduler by defining a resource" + msg += " 'memory' in your ReFrame configuration file for this partition." + msg += " For a SLURM system, one would e.g. define:" + msg += " 'resources': [{'name': 'memory', 'options': ['--mem={size}']}]" logger.warning(msg) # We return, as setting a test.extra_resources is pointless - it would be ignored anyway # This way, we also don't add any lines to the log that a specific amount of memory was requested return - # Compute what is higher: the requested memory, or the memory available proportional to requested CPUs # Fraction of CPU cores requested check_proc_attribute_defined(test, 'num_cpus') From f1d9d315f50f83cc987341a574a26bd1cbb82030 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 17 Sep 2024 22:10:31 +0200 Subject: [PATCH 13/14] This import is not supposed to work. It indirectly imports reframe. However, this is only possible if running with the reframe command, since that command adds the 'external' directory in the reframe installation prefix to the sys.path. In other words: ReFrame is never supposed to be imported directly, that simply does not work (by design). --- .github/workflows/test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 73471776..0127f2de 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -36,7 +36,6 @@ jobs: # update $PYTHONPATH so 'import eessi.testsuite.utils' works export PYTHONPATH=$PWD:$PYTHONPATH echo $PYTHONPATH - python -c 'import eessi.testsuite.utils' # show active ReFrame configuration, # enable verbose output to help expose problems with configuration file (if any) From 0a49e974f4ca80dc7a3f1b1cf28a68233eab08f1 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 17 Sep 2024 22:37:12 +0200 Subject: [PATCH 14/14] Update scorecards action --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 39cbade2..92770210 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -41,7 +41,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6 + uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 with: results_file: results.sarif results_format: sarif