diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 39cbade2..92770210 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -41,7 +41,7 @@ jobs:
           persist-credentials: false
 
       - name: "Run analysis"
-        uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6
+        uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
         with:
           results_file: results.sarif
           results_format: sarif
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 73471776..0127f2de 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -36,7 +36,6 @@ jobs:
           # update $PYTHONPATH so 'import eessi.testsuite.utils' works
           export PYTHONPATH=$PWD:$PYTHONPATH
           echo $PYTHONPATH
-          python -c 'import eessi.testsuite.utils'
 
           # show active ReFrame configuration,
           # enable verbose output to help expose problems with configuration file (if any)
diff --git a/CI/README.md b/CI/README.md
index 161c4a48..4ee16a50 100644
--- a/CI/README.md
+++ b/CI/README.md
@@ -36,6 +36,7 @@ It should define:
 - `RFM_CHECK_SEARCH_PATH` (optional): the search path where ReFrame should search for tests to run in this CI pipeline. Default: `${TEMPDIR}/test-suite/eessi/testsuite/tests/`.
 - `RFM_CHECK_SEARCH_RECURSIVE` (optional): whether ReFrame should search `RFM_CHECK_SEARCH_PATH` recursively. Default: `1`.
 - `RFM_PREFIX` (optional): the prefix in which ReFrame stores all the files. Default: `${HOME}/reframe_CI_runs`.
+- `REFRAME_TIMEOUT` (optional): DURATION as passed to the Unix `timeout` command. If the `reframe` command runs for longer than this, it will be killed by SIGTERM. The ReFrame runtime will then cancel all scheduled (and running) jobs. Can be used to make sure jobs don't pile up, e.g. if the test suite runs daily but takes longer than one day to process all jobs.
 
 ## Creating the `crontab` entry and specifying `EESSI_CI_SYSTEM_NAME`
 This line depends on how often you want to run the tests, and where the `run_reframe_wrapper.sh` is located exactly. We also define the EESSI_CI_SYSTEM_NAME in this entry, as cronjobs don't normally read your `.bashrc` (and thus we need a different way of specifying this environment variable).
diff --git a/CI/run_reframe.sh b/CI/run_reframe.sh
index 497c2a9b..b50bd8c8 100755
--- a/CI/run_reframe.sh
+++ b/CI/run_reframe.sh
@@ -68,6 +68,11 @@ fi
 if [ -z "${RFM_PREFIX}" ]; then
     export RFM_PREFIX="${HOME}/reframe_CI_runs"
 fi
+if [ -z "${REFRAME_TIMEOUT}" ]; then
+    # 10 minutes short of 1 day, since typically the test suite will be run daily.
+    # This will prevent multiple ReFrame runs from piling up and exceeding the quota on our Magic Castle clusters
+    export REFRAME_TIMEOUT=1430m
+fi
 
 # Create virtualenv for ReFrame using system python
 python3 -m venv "${TEMPDIR}"/reframe_venv
@@ -76,11 +81,15 @@ python3 -m pip install --upgrade pip
 python3 -m pip install reframe-hpc=="${REFRAME_VERSION}"
 
 # Clone reframe repo to have the hpctestlib:
-git clone "${REFRAME_URL}" --branch "${REFRAME_BRANCH}" "${TEMPDIR}"/reframe
+REFRAME_CLONE_ARGS="${REFRAME_URL} --branch ${REFRAME_BRANCH} ${TEMPDIR}/reframe"
+echo "Cloning ReFrame repo: git clone ${REFRAME_CLONE_ARGS}"
+git clone ${REFRAME_CLONE_ARGS}
 export PYTHONPATH="${PYTHONPATH}":"${TEMPDIR}"/reframe
 
 # Clone test suite repo
-git clone "${EESSI_TESTSUITE_URL}" --branch "${EESSI_TESTSUITE_BRANCH}" "${TEMPDIR}"/test-suite
+EESSI_CLONE_ARGS="${EESSI_TESTSUITE_URL} --branch ${EESSI_TESTSUITE_BRANCH} ${TEMPDIR}/test-suite"
+echo "Cloning EESSI repo: git clone ${EESSI_CLONE_ARGS}"
+git clone ${EESSI_CLONE_ARGS}
 export PYTHONPATH="${PYTHONPATH}":"${TEMPDIR}"/test-suite/
 
 # Start the EESSI environment
@@ -100,7 +109,7 @@ echo ""
 echo "TEMPDIR: ${TEMPDIR}"
 echo "PYTHONPATH: ${PYTHONPATH}"
 echo "EESSI test suite URL: ${EESSI_TESTSUITE_URL}"
-echo "EESSI test suite version: ${EESSI_TESTSUITE_VERSION}"
+echo "EESSI test suite version: ${EESSI_TESTSUITE_BRANCH}"
 echo "HPCtestlib from ReFrame URL: ${REFRAME_URL}"
 echo "HPCtestlib from ReFrame branch: ${REFRAME_BRANCH}"
 echo "ReFrame executable: $(which reframe)"
@@ -118,7 +127,7 @@ reframe ${REFRAME_ARGS} --list
 
 # Run
 echo "Run tests:"
-reframe ${REFRAME_ARGS} --run
+timeout -v --preserve-status -s SIGTERM ${REFRAME_TIMEOUT} reframe ${REFRAME_ARGS} --run
 
 # Cleanup
 rm -rf "${TEMPDIR}"
diff --git a/config/aws_mc.py b/config/aws_mc.py
index f90600d1..1ff95d94 100644
--- a/config/aws_mc.py
+++ b/config/aws_mc.py
@@ -105,6 +105,12 @@
         # steps inherit environment. It doesn't hurt to define this even if srun is not used
         'export SLURM_EXPORT_ENV=ALL'
     ],
+    'resources': [
+        {
+            'name': 'memory',
+            'options': ['--mem={size}'],
+        }
+    ],
     'extras': {
         # Node types have somewhat varying amounts of memory, but we'll make it easy on ourselves
         # All should _at least_ have this amount (30GB * 1E9 / (1024*1024) = 28610 MiB)
diff --git a/config/azure_mc.py b/config/azure_mc.py
index 3afcbddb..5300fac9 100644
--- a/config/azure_mc.py
+++ b/config/azure_mc.py
@@ -33,11 +33,32 @@
                     'name': 'x86_64-amd-zen4-node',
                     'access': ['--partition=x86-64-amd-zen4-node', '--export=NONE'],
                     'descr': 'Zen4, 16 cores, 30 GB',
+                    'prepare_cmds': [
+                        'export EESSI_SOFTWARE_SUBDIR_OVERRIDE=x86_64/amd/zen4',
+                        common_eessi_init(),
+                        # Required when using srun as launcher with --export=NONE in partition access,
+                        # in order to ensure job steps inherit environment. It doesn't hurt to define
+                        # this even if srun is not used
+                        'export SLURM_EXPORT_ENV=ALL'
+                    ],
+                    'extras': {
+                        'mem_per_node': 768000
+                    },
                 },
                 {
                     'name': 'aarch64-neoverse-N1-16c-62gb',
                     'access': ['--partition=aarch64-neoverse-n1-node', '--export=NONE'],
                     'descr': 'Neoverse N1, 16 cores, 62 GiB',
+                    'prepare_cmds': [
+                        common_eessi_init(),
+                        # Required when using srun as launcher with --export=NONE in partition access,
+                        # in order to ensure job steps inherit environment. It doesn't hurt to define
+                        # this even if srun is not used
+                        'export SLURM_EXPORT_ENV=ALL'
+                    ],
+                    'extras': {
+                        'mem_per_node': 64000
+                    },
                 },
             ]
         },
@@ -69,17 +90,12 @@
     'features': [
         FEATURES['CPU']
     ] + list(SCALES.keys()),
-    'prepare_cmds': [
-        common_eessi_init(),
-        # Required when using srun as launcher with --export=NONE in partition access, in order to ensure job
-        # steps inherit environment. It doesn't hurt to define this even if srun is not used
-        'export SLURM_EXPORT_ENV=ALL'
+    'resources': [
+        {
+            'name': 'memory',
+            'options': ['--mem={size}'],
+        }
     ],
-    'extras': {
-        # Node types have strongly varying amounts of memory, but we'll make it easy on ourselves
-        # All should _at least_ have this amount
-        'mem_per_node': 64000
-    },
 }
 for system in site_configuration['systems']:
     for partition in system['partitions']:
diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py
index c57bb889..f33e636c 100644
--- a/config/it4i_karolina.py
+++ b/config/it4i_karolina.py
@@ -60,6 +60,12 @@
                     'features': [
                         FEATURES[CPU],
                     ] + list(SCALES.keys()),
+                    'resources': [
+                        {
+                            'name': 'memory',
+                            'options': ['--mem={size}'],
+                        }
+                    ],
                     'extras': {
                         # Make sure to round down, otherwise a job might ask for more mem than is available
                         # per node
diff --git a/config/settings_example.py b/config/settings_example.py
index 83fedcce..969453ba 100644
--- a/config/settings_example.py
+++ b/config/settings_example.py
@@ -56,6 +56,11 @@
                             'options': ['--mem={size}'],
                         }
                     ],
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 229376  # in MiB
+                    },
                     # list(SCALES.keys()) adds all the scales from eessi.testsuite.constants as valid for thi partition
                     # Can be modified if not all scales can run on this partition, see e.g. the surf_snellius.py config
                     'features': [FEATURES[CPU]] + list(SCALES.keys()),
@@ -98,6 +103,9 @@
                         FEATURES[GPU],
                     ] + list(SCALES.keys()),
                     'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 229376,  # in MiB
                         GPU_VENDOR: GPU_VENDORS[NVIDIA],
                     },
                 },
diff --git a/eessi/testsuite/common_config.py b/eessi/testsuite/common_config.py
index 94a6849e..e3298506 100644
--- a/eessi/testsuite/common_config.py
+++ b/eessi/testsuite/common_config.py
@@ -1,5 +1,6 @@
 import os
-import warnings
+
+import reframe.core.logging as rlog
 
 perflog_format = '|'.join([
     '%(check_job_completion_time)s',
@@ -96,12 +97,11 @@ def common_eessi_init(eessi_version=None):
 
     eessi_cvmfs_repo = os.getenv('EESSI_CVMFS_REPO', None)
     if eessi_cvmfs_repo is None:
-        warn_msg = '\n' + '\n'.join([
-            "EESSI WARNING: Environment variable 'EESSI_CVMFS_REPO' was not found.",
-            "EESSI WARNING: If you do not intend to use the EESSI software stack, this is perfectly fine.",
-            "EESSI WARNING: To use EESSI, initialize the EESSI environment before running the test suite.",
-        ])
-        warnings.warn(warn_msg)
+        rlog.getlogger().warning('\n'.join([
+            "Environment variable 'EESSI_CVMFS_REPO' is not defined.",
+            "If you do not intend to use the EESSI software stack, this is perfectly fine.",
+            "To use EESSI, initialize the EESSI environment before running the test suite.",
+        ]))
         return ''
 
     eessi_init = []
diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py
index 3211f273..ca09d999 100644
--- a/eessi/testsuite/hooks.py
+++ b/eessi/testsuite/hooks.py
@@ -525,6 +525,24 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
     msg += f" but {app_mem_req} MiB is needed"
     test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg)
 
+    # Check if a resource with the name 'memory' was set in the ReFrame config file. If not, warn the user
+    # and return from this hook (as setting test.extra_resources will be ignored in that case according to
+    # https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#reframe.core.pipeline.RegressionTest.extra_resources
+    if 'memory' not in test.current_partition.resources:
+        logger = rflog.getlogger()
+        msg = "Your ReFrame configuration file does not specify any resource called 'memory' for this partition"
+        msg += f" ({test.current_partition.name})."
+        msg += " Without this, an explicit memory request cannot be made from the scheduler. This test will run,"
+        msg += " but it may result in an out-of-memory error."
+        msg += " Please specify how to request memory (per node) from your resource scheduler by defining a resource"
+        msg += " 'memory' in your ReFrame configuration file for this partition."
+        msg += " For a SLURM system, one would e.g. define:"
+        msg += " 'resources': [{'name': 'memory', 'options': ['--mem={size}']}]"
+        logger.warning(msg)
+        # We return, as setting test.extra_resources would be pointless - it would be ignored anyway.
+        # This way, we also don't add any lines to the log claiming that a specific amount of memory was requested.
+        return
+
     # Compute what is higher: the requested memory, or the memory available proportional to requested CPUs
     # Fraction of CPU cores requested
     check_proc_attribute_defined(test, 'num_cpus')
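Taken together, the config and hook changes above mean a partition needs two things for req_memory_per_node to issue an explicit memory request: a scheduler resource named 'memory' and a 'mem_per_node' value in 'extras'. A minimal sketch of such a partition entry, assuming a Slurm scheduler (the partition name and memory size are illustrative, not taken from the patch):

# Minimal sketch of a ReFrame partition entry combining the two pieces used by
# req_memory_per_node: the 'memory' resource (so '--mem={size}' can be passed to
# Slurm) and 'extras'/'mem_per_node' (so tests can be skipped on nodes that are
# too small). Name and memory size below are illustrative only.
{
    'name': 'example-node',
    'scheduler': 'slurm',
    'resources': [
        {
            'name': 'memory',
            'options': ['--mem={size}'],
        }
    ],
    'extras': {
        # Make sure to round down, otherwise a job might ask for more mem than is available per node
        'mem_per_node': 28610,  # in MiB
    },
},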