Merge branch 'main' of github.com:EESSI/test-suite
Caspar van Leeuwen committed Sep 21, 2024
2 parents fbfe08d + a38b593 commit 5020460
Showing 10 changed files with 86 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/scorecards.yml
@@ -41,7 +41,7 @@ jobs:
persist-credentials: false

- name: "Run analysis"
uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # v2.0.6
uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
with:
results_file: results.sarif
results_format: sarif
1 change: 0 additions & 1 deletion .github/workflows/test.yml
@@ -36,7 +36,6 @@ jobs:
# update $PYTHONPATH so 'import eessi.testsuite.utils' works
export PYTHONPATH=$PWD:$PYTHONPATH
echo $PYTHONPATH
python -c 'import eessi.testsuite.utils'
# show active ReFrame configuration,
# enable verbose output to help expose problems with configuration file (if any)
1 change: 1 addition & 0 deletions CI/README.md
@@ -36,6 +36,7 @@ It should define:
- `RFM_CHECK_SEARCH_PATH` (optional): the search path where ReFrame should search for tests to run in this CI pipeline. Default: `${TEMPDIR}/test-suite/eessi/testsuite/tests/`.
- `RFM_CHECK_SEARCH_RECURSIVE` (optional): whether ReFrame should search `RFM_CHECK_SEARCH_PATH` recursively. Default: `1`.
- `RFM_PREFIX` (optional): the prefix in which ReFrame stores all the files. Default: `${HOME}/reframe_CI_runs`.
- `REFRAME_TIMEOUT` (optional): the DURATION passed to the Unix `timeout` command. If the `reframe` command runs for longer than this, it is killed with SIGTERM, and the ReFrame runtime then cancels all scheduled (and running) jobs. This can be used to make sure jobs don't pile up, e.g. if the test suite runs daily but takes longer than one day to process all jobs (see the sketch below).
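
For example, the CI environment file could cap a daily run just under 24 hours (a sketch; the value mirrors the default added in `CI/run_reframe.sh` below):

```bash
# kill the `reframe` run after 23h50m, leaving slack before the next daily run
export REFRAME_TIMEOUT=1430m
```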

## Creating the `crontab` entry and specifying `EESSI_CI_SYSTEM_NAME`
This line depends on how often you want to run the tests, and on where exactly `run_reframe_wrapper.sh` is located. We also define `EESSI_CI_SYSTEM_NAME` in this entry, as cron jobs don't normally read your `.bashrc` (so we need a different way of specifying this environment variable).
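
A minimal sketch of such an entry (schedule, system name, and wrapper path are illustrative):

```bash
# run the EESSI test suite daily at 02:00; cron passes the command to sh,
# so the inline variable assignment and $HOME both work
0 2 * * * EESSI_CI_SYSTEM_NAME=aws_mc $HOME/test-suite/CI/run_reframe_wrapper.sh
```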
17 changes: 13 additions & 4 deletions CI/run_reframe.sh
@@ -68,6 +68,11 @@ fi
if [ -z "${RFM_PREFIX}" ]; then
export RFM_PREFIX="${HOME}/reframe_CI_runs"
fi
if [ -z "${REFRAME_TIMEOUT}" ]; then
# 10 minutes short of 1 day, since typically the test suite will be run daily.
# This will prevent multiple ReFrame runs from piling up and exceeding the quota on our Magic Castle clusters
export REFRAME_TIMEOUT=1430m
fi

# Create virtualenv for ReFrame using system python
python3 -m venv "${TEMPDIR}"/reframe_venv
@@ -76,11 +81,15 @@ python3 -m pip install --upgrade pip
python3 -m pip install reframe-hpc=="${REFRAME_VERSION}"

# Clone reframe repo to have the hpctestlib:
git clone "${REFRAME_URL}" --branch "${REFRAME_BRANCH}" "${TEMPDIR}"/reframe
REFRAME_CLONE_ARGS="${REFRAME_URL} --branch ${REFRAME_BRANCH} ${TEMPDIR}/reframe"
echo "Cloning ReFrame repo: git clone ${REFRAME_CLONE_ARGS}"
git clone ${REFRAME_CLONE_ARGS}
export PYTHONPATH="${PYTHONPATH}":"${TEMPDIR}"/reframe

# Clone test suite repo
git clone "${EESSI_TESTSUITE_URL}" --branch "${EESSI_TESTSUITE_BRANCH}" "${TEMPDIR}"/test-suite
EESSI_CLONE_ARGS="${EESSI_TESTSUITE_URL} --branch ${EESSI_TESTSUITE_BRANCH} ${TEMPDIR}/test-suite"
echo "Cloning EESSI repo: git clone ${EESSI_CLONE_ARGS}"
git clone ${EESSI_CLONE_ARGS}
export PYTHONPATH="${PYTHONPATH}":"${TEMPDIR}"/test-suite/

# Start the EESSI environment
@@ -100,7 +109,7 @@ echo ""
echo "TEMPDIR: ${TEMPDIR}"
echo "PYTHONPATH: ${PYTHONPATH}"
echo "EESSI test suite URL: ${EESSI_TESTSUITE_URL}"
echo "EESSI test suite version: ${EESSI_TESTSUITE_VERSION}"
echo "EESSI test suite version: ${EESSI_TESTSUITE_BRANCH}"
echo "HPCtestlib from ReFrame URL: ${REFRAME_URL}"
echo "HPCtestlib from ReFrame branch: ${REFRAME_BRANCH}"
echo "ReFrame executable: $(which reframe)"
@@ -118,7 +127,7 @@ reframe ${REFRAME_ARGS} --list

# Run
echo "Run tests:"
reframe ${REFRAME_ARGS} --run
timeout -v --preserve-status -s SIGTERM ${REFRAME_TIMEOUT} reframe ${REFRAME_ARGS} --run

# Cleanup
rm -rf "${TEMPDIR}"
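
The `timeout` wrapper added above can be sanity-checked in isolation. With `--preserve-status`, the exit code reflects the signal that killed the command (128 + 15 for SIGTERM) rather than `timeout`'s own 124, and `-v` reports which signal was sent (a minimal sketch):

```bash
# `sleep 10` is killed after 2 seconds; stderr shows the signal sent
timeout -v --preserve-status -s SIGTERM 2s sleep 10
echo "exit code: $?"  # prints 143 (128 + SIGTERM), not timeout's default 124
```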
6 changes: 6 additions & 0 deletions config/aws_mc.py
@@ -105,6 +105,12 @@
# steps inherit environment. It doesn't hurt to define this even if srun is not used
'export SLURM_EXPORT_ENV=ALL'
],
'resources': [
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'extras': {
# Node types have somewhat varying amounts of memory, but we'll make it easy on ourselves
# All should _at least_ have this amount (30GB * 1E9 / (1024*1024) = 28610 MiB)
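
With a `memory` resource defined, ReFrame fills the `{size}` placeholder from a test's `extra_resources` and passes the resulting option to the scheduler. A minimal sketch of the expansion (values illustrative):

```python
# Mirrors how ReFrame formats resource option templates with the values a
# test supplies via extra_resources = {'memory': {'size': ...}}
options = ['--mem={size}']
request = {'size': '28610M'}
print([opt.format(**request) for opt in options])  # ['--mem=28610M']
```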
36 changes: 26 additions & 10 deletions config/azure_mc.py
@@ -33,11 +33,32 @@
'name': 'x86_64-amd-zen4-node',
'access': ['--partition=x86-64-amd-zen4-node', '--export=NONE'],
'descr': 'Zen4, 16 cores, 30 GB',
'prepare_cmds': [
'export EESSI_SOFTWARE_SUBDIR_OVERRIDE=x86_64/amd/zen4',
common_eessi_init(),
# Required when using srun as launcher with --export=NONE in partition access,
# in order to ensure job steps inherit environment. It doesn't hurt to define
# this even if srun is not used
'export SLURM_EXPORT_ENV=ALL'
],
'extras': {
'mem_per_node': 768000
},
},
{
'name': 'aarch64-neoverse-N1-16c-62gb',
'access': ['--partition=aarch64-neoverse-n1-node', '--export=NONE'],
'descr': 'Neoverse N1, 16 cores, 62 GiB',
'prepare_cmds': [
common_eessi_init(),
# Required when using srun as launcher with --export=NONE in partition access,
# in order to ensure job steps inherit environment. It doesn't hurt to define
# this even if srun is not used
'export SLURM_EXPORT_ENV=ALL'
],
'extras': {
'mem_per_node': 64000
},
},
]
},
@@ -69,17 +90,12 @@
'features': [
FEATURES['CPU']
] + list(SCALES.keys()),
'prepare_cmds': [
common_eessi_init(),
# Required when using srun as launcher with --export=NONE in partition access, in order to ensure job
# steps inherit environment. It doesn't hurt to define this even if srun is not used
'export SLURM_EXPORT_ENV=ALL'
'resources': [
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'extras': {
# Node types have strongly varying amounts of memory, but we'll make it easy on ourselves
# All should _at least_ have this amount
'mem_per_node': 64000
},
}
for system in site_configuration['systems']:
for partition in system['partitions']:
6 changes: 6 additions & 0 deletions config/it4i_karolina.py
@@ -60,6 +60,12 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'resources': [
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
8 changes: 8 additions & 0 deletions config/settings_example.py
@@ -56,6 +56,11 @@
'options': ['--mem={size}'],
}
],
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 229376 # in MiB
},
# list(SCALES.keys()) adds all the scales from eessi.testsuite.constants as valid for this partition
# Can be modified if not all scales can run on this partition, see e.g. the surf_snellius.py config
'features': [FEATURES[CPU]] + list(SCALES.keys()),
@@ -98,6 +103,9 @@
FEATURES[GPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 229376, # in MiB
GPU_VENDOR: GPU_VENDORS[NVIDIA],
},
},
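
The "round down" advice in these comments matters because nodes advertise memory in base-10 GB while Slurm's `--mem` works in MiB. A sketch of the conversion used in the `aws_mc.py` comment above (the 240 GB node is hypothetical):

```python
import math

def mem_per_node_mib(mem_gb: float) -> int:
    """Convert advertised node memory (GB, base-10) to MiB, rounding down
    so a job never requests more memory than the node actually has."""
    return math.floor(mem_gb * 1e9 / (1024 * 1024))

print(mem_per_node_mib(30))   # 28610, matching the aws_mc.py comment
print(mem_per_node_mib(240))  # 228881 for a hypothetical 240 GB node
```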
14 changes: 7 additions & 7 deletions eessi/testsuite/common_config.py
@@ -1,5 +1,6 @@
import os
import warnings

import reframe.core.logging as rlog

perflog_format = '|'.join([
'%(check_job_completion_time)s',
@@ -96,12 +97,11 @@ def common_eessi_init(eessi_version=None):
eessi_cvmfs_repo = os.getenv('EESSI_CVMFS_REPO', None)

if eessi_cvmfs_repo is None:
warn_msg = '\n' + '\n'.join([
"EESSI WARNING: Environment variable 'EESSI_CVMFS_REPO' was not found.",
"EESSI WARNING: If you do not intend to use the EESSI software stack, this is perfectly fine.",
"EESSI WARNING: To use EESSI, initialize the EESSI environment before running the test suite.",
])
warnings.warn(warn_msg)
rlog.getlogger().warning('\n'.join([
"Environment variable 'EESSI_CVMFS_REPO' is not defined.",
"If you do not intend to use the EESSI software stack, this is perfectly fine.",
"To use EESSI, initialize the EESSI environment before running the test suite.",
]))
return ''

eessi_init = []
18 changes: 18 additions & 0 deletions eessi/testsuite/hooks.py
@@ -525,6 +525,24 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float):
msg += f" but {app_mem_req} MiB is needed"
test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg)

# Check if a resource with the name 'memory' was set in the ReFrame config file. If not, warn the user
# and return from this hook (as setting test.extra_resources will be ignored in that case according to
# https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#reframe.core.pipeline.RegressionTest.extra_resources
if 'memory' not in test.current_partition.resources:
logger = rflog.getlogger()
msg = "Your ReFrame configuration file does not specify any resource called 'memory' for this partition "
msg += f" ({test.current_partition.name})."
msg += " Without this, an explicit memory request cannot be made from the scheduler. This test will run,"
msg += " but it may result in an out of memory error."
msg += " Please specify how to requst memory (per node) from your resource scheduler by defining a resource"
msg += " 'memory' in your ReFrame configuration file for this partition."
msg += " For a SLURM system, one would e.g. define:"
msg += " 'resources': [{'name': 'memory', 'options': ['--mem={size}']}]"
logger.warning(msg)
# We return, as setting a test.extra_resources is pointless - it would be ignored anyway
# This way, we also don't add any lines to the log that a specific amount of memory was requested
return

# Compute what is higher: the requested memory, or the memory available proportional to requested CPUs
# Fraction of CPU cores requested
check_proc_attribute_defined(test, 'num_cpus')
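
A sketch of how a test would call this hook (the hook and its signature are from the diff above; the test body is illustrative):

```python
import reframe as rfm
import reframe.utility.sanity as sn

from eessi.testsuite import hooks


@rfm.simple_test
class MemoryHookExample(rfm.RunOnlyRegressionTest):
    valid_systems = ['*']
    valid_prog_environs = ['*']
    executable = 'true'
    sanity_patterns = sn.assert_true(True)

    @run_after('setup')
    def request_memory(self):
        # Ask for 4096 MiB per node. With a 'memory' resource configured this
        # becomes a scheduler request (e.g. --mem); without one, the hook now
        # logs a warning and returns instead of silently doing nothing.
        hooks.req_memory_per_node(self, app_mem_req=4096)
```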
