Add a Docker Compose based Slurm pseudo cluster for scheduler backend tests.

- .github/pseudo-cluster: compose file (munge key generator, slurm-master,
  three compute nodes and a ReFrame frontend), frontend image and entrypoint.
- .github/workflows/test-schedulers.yaml: runs the unit tests on pull requests
  once per scheduler backend (pbs, squeue, torque).
- Jenkinsfile: removed.
- config/cscs-ci.py: moved to ci-scripts/configs/ next to the new ci-cluster.py.

diff --git a/.github/pseudo-cluster/docker-compose.yml b/.github/pseudo-cluster/docker-compose.yml
new file mode 100644
index 0000000000..a6adb767f4
--- /dev/null
+++ b/.github/pseudo-cluster/docker-compose.yml
@@ -0,0 +1,114 @@
+services:
+  munge-key-generator:
+    image: ghcr.io/reframe-hpc/munge-ubuntu:20.04
+    hostname: munge-host
+    healthcheck:
+      test: ["CMD-SHELL", "test -f /scratch/munge.key"]
+      interval: 10s
+      timeout: 10s
+      retries: 5
+    volumes:
+      - shared-scratch:/scratch
+
+  frontend:
+    image: slurm-reframe
+    container_name: frontend
+    build:
+      dockerfile: .github/pseudo-cluster/reframe/Dockerfile
+      context: ../../
+    hostname: login
+    user: admin
+    init: true
+    volumes:
+      - shared-home:/home/admin:rw
+      - shared-scratch:/scratch:rw
+    links:
+      - slurm-master
+    depends_on:
+      munge-key-generator:
+        condition: service_healthy
+      slurm-master:
+        condition: service_started
+      node0:
+        condition: service_started
+      node1:
+        condition: service_started
+      node2:
+        condition: service_started
+    environment:
+      - SLURM_CPUS_ON_NODE=1
+      - BACKEND=${BACKEND:-squeue}
+
+  slurm-master:
+    image: ghcr.io/reframe-hpc/slurm-master-ubuntu:20.04
+    hostname: slurm-master
+    user: admin
+    volumes:
+      - shared-home:/home/admin
+      - shared-scratch:/scratch:rw
+    depends_on:
+      munge-key-generator:
+        condition: service_healthy
+    environment:
+      - SLURM_CPUS_ON_NODE=1
+
+  node0:
+    image: ghcr.io/reframe-hpc/slurm-node-ubuntu:20.04
+    hostname: nid00
+    container_name: slurm-node0
+    user: admin
+    volumes:
+      - shared-home:/home/admin
+      - shared-scratch:/scratch:rw
+    environment:
+      - SLURM_NODENAME=nid00
+      - SLURM_CPUS_ON_NODE=1
+    depends_on:
+      munge-key-generator:
+        condition: service_healthy
+      slurm-master:
+        condition: service_started
+    links:
+      - slurm-master
+
+  node1:
+    image: ghcr.io/reframe-hpc/slurm-node-ubuntu:20.04
+    hostname: nid01
+    container_name: slurm-node1
+    user: admin
+    volumes:
+      - shared-home:/home/admin
+      - shared-scratch:/scratch:rw
+    environment:
+      - SLURM_NODENAME=nid01
+      - SLURM_CPUS_ON_NODE=1
+    depends_on:
+      munge-key-generator:
+        condition: service_healthy
+      slurm-master:
+        condition: service_started
+    links:
+      - slurm-master
+
+  node2:
+    image: ghcr.io/reframe-hpc/slurm-node-ubuntu:20.04
+    hostname: nid02
+    container_name: slurm-node2
+    user: admin
+    volumes:
+      - shared-home:/home/admin
+      - shared-scratch:/scratch:rw
+    environment:
+      - SLURM_NODENAME=nid02
+      - SLURM_CPUS_ON_NODE=1
+    depends_on:
+      munge-key-generator:
+        condition: service_healthy
+      slurm-master:
+        condition: service_started
+    links:
+      - slurm-master
+
+volumes:
+  shared-home:
+  shared-scratch:
diff --git a/.github/pseudo-cluster/reframe/Dockerfile b/.github/pseudo-cluster/reframe/Dockerfile
new file mode 100644
index 0000000000..afeac49f98
--- /dev/null
+++ b/.github/pseudo-cluster/reframe/Dockerfile
@@ -0,0 +1,42 @@
+FROM ubuntu:20.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN apt update -y && \
+    apt install -y \
+    build-essential \
+    clang jq libomp-dev tree vim \
+    git \
+    mariadb-client \
+    munge \
+    slurm-client \
+    slurm-wlm-torque \
+    sudo \
+    python3 \
+    python3-pip \
+    wget \
+    curl \
+    mpich \
+    libmpich-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN useradd -m admin -s /usr/bin/bash -d /home/admin && \
+    echo "admin:admin" | chpasswd && adduser admin sudo && \
+    echo "admin ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
+
+COPY .github/pseudo-cluster/reframe/slurm.conf /etc/slurm-llnl/
+COPY .github/pseudo-cluster/reframe/cgroup.conf /etc/slurm-llnl/
+COPY .github/pseudo-cluster/reframe/docker-entrypoint.sh /etc/slurm-llnl/
+COPY . /usr/local/share/reframe
+
+RUN mkdir /scratch && \
+    chown -R admin:admin /scratch
+
+RUN chmod +rx /etc/slurm-llnl/docker-entrypoint.sh
+
+WORKDIR /home/admin
+
+ENV USER=admin
+ENV SHELL=bash
+
+ENTRYPOINT ["/etc/slurm-llnl/docker-entrypoint.sh"]
diff --git a/.github/pseudo-cluster/reframe/cgroup.conf b/.github/pseudo-cluster/reframe/cgroup.conf
new file mode 120000
index 0000000000..c99f23956f
--- /dev/null
+++ b/.github/pseudo-cluster/reframe/cgroup.conf
@@ -0,0 +1 @@
+../../../examples/tutorial/dockerfiles/slurm-cluster/reframe/cgroup.conf
\ No newline at end of file
diff --git a/.github/pseudo-cluster/reframe/docker-entrypoint.sh b/.github/pseudo-cluster/reframe/docker-entrypoint.sh
new file mode 100755
index 0000000000..665f23ff8d
--- /dev/null
+++ b/.github/pseudo-cluster/reframe/docker-entrypoint.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+trap 'exit 0' INT
+
+while [ ! -f /scratch/munge.key ]
+do
+    sleep 1
+done
+
+sudo cp /scratch/munge.key /etc/munge/munge.key
+sudo service munge start
+sudo sed -i "s/REPLACE_IT/CPUs=${SLURM_CPUS_ON_NODE}/g" /etc/slurm-llnl/slurm.conf
+
+# Needs to be copied in the shared home directory
+cp -r /usr/local/share/reframe .
+cd reframe
+./bootstrap.sh
+
+echo "Running unittests with backend scheduler: ${BACKEND:-squeue}"
+
+tempdir=$(mktemp -d -p /scratch)
+TMPDIR=$tempdir ./test_reframe.py -v \
+    --rfm-user-config=ci-scripts/configs/ci-cluster.py \
+    --rfm-user-system=pseudo-cluster:compute-${BACKEND:-squeue}
diff --git a/.github/pseudo-cluster/reframe/slurm.conf b/.github/pseudo-cluster/reframe/slurm.conf
new file mode 120000
index 0000000000..39eb5d7b6f
--- /dev/null
+++ b/.github/pseudo-cluster/reframe/slurm.conf
@@ -0,0 +1 @@
+../../../examples/tutorial/dockerfiles/slurm-cluster/reframe/slurm.conf
\ No newline at end of file
diff --git a/.github/workflows/test-schedulers.yaml b/.github/workflows/test-schedulers.yaml
new file mode 100644
index 0000000000..83285e0988
--- /dev/null
+++ b/.github/workflows/test-schedulers.yaml
@@ -0,0 +1,24 @@
+name: ReFrame CI / Scheduler backend tests
+on:
+  pull_request: []
+
+jobs:
+  scheduler-test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        scheduler: ['pbs', 'squeue', 'torque']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Build Images
+        run: |
+          docker compose -f .github/pseudo-cluster/docker-compose.yml build
+      - name: Run Unittests with ${{ matrix.scheduler }} scheduler
+        run: |
+          BACKEND=${{ matrix.scheduler }} docker compose -f .github/pseudo-cluster/docker-compose.yml up --abort-on-container-exit --exit-code-from frontend
diff --git a/Jenkinsfile b/Jenkinsfile
deleted file mode 100644
index 1553a9b1b1..0000000000
--- a/Jenkinsfile
+++ /dev/null
@@ -1,169 +0,0 @@
-#!/usr/bin/env groovy
-
-def dirPrefix = 'reframe-ci'
-def loginBash = '#!/bin/bash -l'
-def bashScript = 'ci-scripts/ci-runner.bash'
-def machinesList = params.machines.split()
-def machinesToRun = machinesList
-def runTests = true
-def uniqueID
-
-stage('Initialization') {
-    node('master') {
-        catchError(stageResult: 'FAILURE') {
-            uniqueID = "${env.ghprbActualCommit[0..6]}-${env.BUILD_ID}"
-            echo 'Environment Variables:'
-            echo sh(script: 'env|sort', returnStdout: true)
-
-            def githubComment = env.ghprbCommentBody
-            if (githubComment == 'null' || !githubComment.trim().startsWith('@jenkins-cscs')) {
-                machinesToRun = machinesList
-                currentBuild.result = 'SUCCESS'
-                return
-            }
-
-            def splittedComment = githubComment.split()
-            if (splittedComment.size() < 3) {
-                println 'No machines were found. Aborting...'
-                currentBuild.result = 'ABORTED'
-                return
-            }
-            if (splittedComment[1] != 'retry') {
-                println "Invalid command ${splittedComment[1]}. Aborting..."
-                currentBuild.result = 'ABORTED'
-                return
-            }
-            if (splittedComment[2] == 'all') {
-                machinesToRun = machinesList
-                currentBuild.result = 'SUCCESS'
-                return
-            }
-            else if (splittedComment[2] == 'none') {
-                runTests = false
-                currentBuild.result = 'SUCCESS'
-                return
-            }
-
-            machinesRequested = []
-            for (i = 2; i < splittedComment.size(); i++) {
-                machinesRequested.add(splittedComment[i])
-            }
-
-            machinesToRun = machinesRequested.findAll({it in machinesList})
-            if (!machinesToRun) {
-                println 'No machines were found. Aborting...'
-                currentBuild.result = 'ABORTED'
-                return
-            }
-            currentBuild.result = 'SUCCESS'
-        }
-    }
-}
-
-if (!runTests) {
-    println "Won't execute any test (${currentBuild.result}). Exiting..."
-    return
-}
-
-if (currentBuild.result != 'SUCCESS') {
-    println "Initialization failed (${currentBuild.result}). Exiting..."
-    return
-}
-
-def builds = [:]
-stage('Unittest') {
-    for (mach in machinesToRun) {
-        def machineName = mach
-        builds[machineName] = {
-            node(machineName) {
-                def scratch = sh(returnStdout: true,
-                                 script: """${loginBash}
-                                            echo \$SCRATCH""").trim()
-                def reframeDir = "${scratch}/${dirPrefix}-${machineName}-${uniqueID}"
-                dir(reframeDir) {
-                    checkout scm
-                    sh("""${loginBash}
-                          bash ${reframeDir}/${bashScript} -f ${reframeDir} -i ''""")
-                }
-            }
-        }
-    }
-
-    catchError(stageResult: 'FAILURE') {
-        parallel builds
-    }
-}
-
-builds = [:]
-stage('Tutorial Check') {
-    if (currentBuild.result != 'SUCCESS') {
-        println 'Not executing "Tutorial Check" Stage'
-        return
-    }
-    else {
-        catchError(stageResult: 'FAILURE') {
-            if (!('daint' in machinesToRun)) {
-                return
-            }
-            node('daint') {
-                def scratch = sh(returnStdout: true,
-                                 script: """${loginBash}
-                                            echo \$SCRATCH""").trim()
-                def reframeDir = "${scratch}/${dirPrefix}-daint-${uniqueID}"
-                dir(reframeDir) {
-                    sh("""${loginBash}
-                          bash ${reframeDir}/${bashScript} -f ${reframeDir} -i '' -t""")
-                }
-            }
-        }
-    }
-}
-
-builds = [:]
-stage('Cleanup') {
-    if (currentBuild.result != 'SUCCESS') {
-        println 'Not executing "Cleanup" Stage'
-        return
-    }
-    else {
-        for (mach in machinesToRun) {
-            def machineName = mach
-            builds[machineName] = {
-                node(machineName) {
-                    def scratch = sh(returnStdout: true,
-                                     script: """$loginBash
-                                                echo \$SCRATCH""").trim()
-                    def reframeDir = "${scratch}/${dirPrefix}-${machineName}-${uniqueID}"
-                    sh("""${loginBash}
-                          rm -rf ${reframeDir}
-                          date""")
-
-                }
-            }
-        }
-        catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
-            parallel builds
-        }
-    }
-}
-
-def staleCleanupInterval = 3
-builds = [:]
-stage('Cleanup Stale') {
-    for (mach in machinesToRun) {
-        def machineName = mach
-        builds[machineName] = {
-            node(machineName) {
-                def scratch = sh(returnStdout: true,
-                                 script: """${loginBash}
-                                            echo \$SCRATCH""").trim()
-                sh("""${loginBash}
-                      find ${scratch} -maxdepth 1 -name 'reframe-ci*' -ctime +${staleCleanupInterval} -type d -exec printf 'Removing %s\\n' {} +
-                      find ${scratch} -maxdepth 1 -name 'reframe-ci*' -ctime +${staleCleanupInterval} -type d -exec rm -rf {} +""")
-            }
-        }
-    }
-    catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
-        parallel builds
-    }
-}
diff --git a/ci-scripts/ci-runner.bash b/ci-scripts/ci-runner.bash
index 50130481b9..08e0062c9d 100644
--- a/ci-scripts/ci-runner.bash
+++ b/ci-scripts/ci-runner.bash
@@ -170,7 +170,7 @@ else
         for backend in slurm pbs torque; do
             echo "[INFO] Running unit tests with ${backend}"
             TMPDIR=$tempdir checked_exec ./test_reframe.py ${parallel_opts} \
-                                 --rfm-user-config=config/cscs-ci.py \
+                                 --rfm-user-config=ci-scripts/configs/cscs-ci.py \
                                  -W=error::reframe.core.warnings.ReframeDeprecationWarning \
                                  --rfm-user-system=dom:${backend} -ra
         done
@@ -178,7 +178,7 @@ else
     else
         echo "[INFO] Running unit tests"
         TMPDIR=$tempdir checked_exec ./test_reframe.py ${parallel_opts} \
-                                 --rfm-user-config=config/cscs-ci.py \
+                                 --rfm-user-config=ci-scripts/configs/cscs-ci.py \
                                  -W=error::reframe.core.warnings.ReframeDeprecationWarning -ra
     fi
 
diff --git a/ci-scripts/configs/ci-cluster.py b/ci-scripts/configs/ci-cluster.py
new file mode 100644
index 0000000000..3d30203fbd
--- /dev/null
+++ b/ci-scripts/configs/ci-cluster.py
@@ -0,0 +1,73 @@
+# Copyright 2016-2024 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+site_configuration = {
+    'systems': [
+        {
+            'name': 'pseudo-cluster',
+            'descr': 'CI Slurm-based pseudo cluster',
+            'hostnames': ['login'],
+            'partitions': [
+                {
+                    'name': 'login',
+                    'descr': 'Login nodes',
+                    'scheduler': 'local',
+                    'launcher': 'local',
+                    'environs': ['gnu', 'clang']
+                },
+                {
+                    'name': 'compute-squeue',
+                    'descr': 'Squeue compute nodes',
+                    'scheduler': 'squeue',
+                    'launcher': 'srun',
+                    'access': ['-p all'],
+                    'environs': ['gnu', 'clang']
+                },
+                {
+                    'name': 'compute-torque',
+                    'descr': 'Torque compute nodes',
+                    'scheduler': 'torque',
+                    'launcher': 'mpiexec',
+                    'access': ['-p all'],
+                    'environs': ['gnu', 'clang']
+                },
+                {
+                    'name': 'compute-pbs',
+                    'descr': 'PBS compute nodes',
+                    'scheduler': 'pbs',
+                    'launcher': 'mpiexec',
+                    'access': ['-p all'],
+                    'environs': ['gnu', 'clang']
+                }
+            ]
+        },
+    ],
+    'environments': [
+        {
+            'name': 'baseline',
+            'features': ['stream']
+        },
+        {
+            'name': 'gnu',
+            'cc': 'gcc',
+            'cxx': 'g++',
+            'features': ['openmp'],
+            'extras': {'omp_flag': '-fopenmp'}
+        },
+        {
+            'name': 'clang',
+            'cc': 'clang',
+            'cxx': 'clang++',
+            'features': ['openmp'],
+            'extras': {'omp_flag': '-fopenmp'}
+        }
+    ],
+    'modes': [
+        {
+            'name': 'singlethread',
+            'options': ['-E num_threads==1']
+        }
+    ]
+}
diff --git a/config/cscs-ci.py b/ci-scripts/configs/cscs-ci.py
similarity index 100%
rename from config/cscs-ci.py
rename to ci-scripts/configs/cscs-ci.py