From 0fe71830794715f31dc11601ab482d6c9dc02b42 Mon Sep 17 00:00:00 2001 From: Ioan Hadade Date: Mon, 20 Jan 2025 13:49:35 +0000 Subject: [PATCH 1/6] Open input files with read-only rather than readwrite to prevent lock storms --- src/ecwam/wvopenbathy.F90 | 2 +- src/ecwam/wvopensubbathy.F90 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ecwam/wvopenbathy.F90 b/src/ecwam/wvopenbathy.F90 index 2f4594fb..e5de1fb9 100644 --- a/src/ecwam/wvopenbathy.F90 +++ b/src/ecwam/wvopenbathy.F90 @@ -78,7 +78,7 @@ SUBROUTINE WVOPENBATHY (IU06, IU07, KGRIB_HANDLE) IF ( KGRIB_HANDLE < 0 ) THEN WRITE(IU06,*) ' BINARY INPUT OF MODEL BATHYMETRY ' WRITE(IU06,*) '' - IU07 = IWAM_GET_UNIT(IU06, FILENAME(1:LFILE), 'r', 'u',0,'READWRITE') + IU07 = IWAM_GET_UNIT(IU06, FILENAME(1:LFILE), 'r', 'u',0,'READ') CALL FLUSH(IU06) ENDIF diff --git a/src/ecwam/wvopensubbathy.F90 b/src/ecwam/wvopensubbathy.F90 index 2877db85..1fbec9dd 100644 --- a/src/ecwam/wvopensubbathy.F90 +++ b/src/ecwam/wvopensubbathy.F90 @@ -105,7 +105,7 @@ SUBROUTINE WVOPENSUBBATHY (IREAD, NPR, FILENAME, KFILE_HANDLE, KGRIB_HANDLE) IF ( KGRIB_HANDLE < 0 ) THEN WRITE(IU06,*) ' BINARY INPUT OF MODEL SUBGRID BATHYMETRY ' WRITE(IU06,*) '' - IU08(IPROPAGS) = IWAM_GET_UNIT(IU06, FILENAME(1:LFILE), 'r', 'u',0,'READWRITE') + IU08(IPROPAGS) = IWAM_GET_UNIT(IU06, FILENAME(1:LFILE), 'r', 'u',0,'READ') CALL FLUSH(IU06) ENDIF From 48abcefdc6511685553a2ea51ffb4fb159f8b82f Mon Sep 17 00:00:00 2001 From: Ahmad Nawab <113430901+awnawab@users.noreply.github.com> Date: Thu, 6 Feb 2025 08:11:39 +0000 Subject: [PATCH 2/6] Run hpc-ci on PRs filed from forks (#61) * Enable HPC-CI to be run on PRs filed from forks --- .github/workflows/build-hpc.yml | 17 ++++++++++++----- .github/workflows/label-public-pr.yml | 11 +++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/label-public-pr.yml diff --git a/.github/workflows/build-hpc.yml b/.github/workflows/build-hpc.yml index 721360d4..543ecf35 100644 --- a/.github/workflows/build-hpc.yml +++ b/.github/workflows/build-hpc.yml @@ -9,6 +9,10 @@ on: # Allow workflow to be dispatched on demand workflow_dispatch: ~ + # Trigger after public PR approved for CI + pull_request_target: + types: [labeled] + env: ECWAM_TOOLS: ${{ github.workspace }}/.github/tools CTEST_PARALLEL_LEVEL: 1 @@ -17,6 +21,7 @@ env: jobs: ci-hpc: name: ci-hpc + if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }} strategy: fail-fast: false # false: try to complete all jobs @@ -119,15 +124,17 @@ jobs: popd {% endfor %} - mkdir -p ${{ github.repository }} - pushd ${{ github.repository }} + REPO=${{ github.event.pull_request.head.repo.full_name || github.repository }} + SHA=${{ github.event.pull_request.head.sha || github.sha }} + mkdir -p $REPO + pushd $REPO git init - git remote add origin ${{ github.server_url }}/${{ github.repository }} - git fetch origin ${{ github.sha }} + git remote add origin ${{ github.server_url }}/$REPO + git fetch origin $SHA git reset --hard FETCH_HEAD popd - cmake -G Ninja -S ${{ github.repository }} -B build \ + cmake -G Ninja -S $REPO -B build \ {% for name in dependencies %} {% set org, proj = name.split('/') %} -D{{proj}}_ROOT=$BASEDIR/{{name}}/installation \ diff --git a/.github/workflows/label-public-pr.yml b/.github/workflows/label-public-pr.yml new file mode 100644 index 00000000..1027b04a --- /dev/null +++ b/.github/workflows/label-public-pr.yml @@ -0,0 +1,11 @@ +# Manage labels of pull requests that originate from forks +name: label-public-pr + +on: + pull_request_target: + types: [opened, synchronize] + +jobs: + label: + uses: ecmwf-actions/reusable-workflows/.github/workflows/label-pr.yml@v2 + From c486e336484a95da5f3b34b01c709017234e9cf3 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Mon, 20 Jan 2025 09:30:42 +0100 Subject: [PATCH 3/6] STATIC_LINKING: remove hard-coded static linking for GPU builds --- .github/workflows/build-hpc.yml | 1 + README.md | 6 +++--- package/bundle/bundle.yml | 4 ++++ src/ecwam/CMakeLists.txt | 8 -------- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-hpc.yml b/.github/workflows/build-hpc.yml index 543ecf35..b5f50130 100644 --- a/.github/workflows/build-hpc.yml +++ b/.github/workflows/build-hpc.yml @@ -67,6 +67,7 @@ jobs: - -DENABLE_CUDA=ON - -DENABLE_GPU_AWARE_MPI=ON - -DENABLE_SINGLE_PRECISION=ON + - -DBUILD_SHARED_LIBS=OFF dependencies: ecmwf/eccodes: version: develop diff --git a/README.md b/README.md index 76849d36..9ed01b39 100644 --- a/README.md +++ b/README.md @@ -232,9 +232,9 @@ NB: GPU offload is not supported for ecWAM 1.4.0. Building -------- The recommended option for building the GPU enabled ecWAM is to use the provided bundle, and pass the -`--with-loki --with-acc` options. Different Loki transformations can also be chosen at build-time via the following -bundle option: `--loki-mode=`. Direct GPU-to-GPU MPI communications can be enabled by passing the -`--with-gpu-aware-mpi` option. CPU to GPU data transfers can be accelerated (via pinning of host-side allocations) +`--with-loki --with-acc --with-static-linking` options. Different Loki transformations can also be chosen at +build-time via the following bundle option: `--loki-mode=`. Direct GPU-to-GPU MPI communications can be enabled by +passing the `--with-gpu-aware-mpi` option. CPU to GPU data transfers can be accelerated (via pinning of host-side allocations) by building with the `--with-cuda` option. The ecwam-bundle also provides appropriate arch files for the nvhpc suite on the ECMWF ATOS system. diff --git a/package/bundle/bundle.yml b/package/bundle/bundle.yml index fa5a19ae..a61386dc 100644 --- a/package/bundle/bundle.yml +++ b/package/bundle/bundle.yml @@ -97,3 +97,7 @@ options : - with-fckit : help : Build fckit with Python virtual environment containing fypp and yaml parser cmake : BUILD_fckit=ON + + - with-static-linking : + help : Build static libraries by default + cmake : ECWAM_BUILD_SHARED_LIBS=OFF diff --git a/src/ecwam/CMakeLists.txt b/src/ecwam/CMakeLists.txt index 9514524c..dfcc838d 100644 --- a/src/ecwam/CMakeLists.txt +++ b/src/ecwam/CMakeLists.txt @@ -431,16 +431,8 @@ if( HAVE_SINGLE_PRECISION ) list(APPEND ECWAM_DEFINITIONS WAM_HAVE_SINGLE_PRECISION ) endif() -# Using dynamic linking creates undefined references to the device -# copies of module global variables -set( LIBRARY_TYPE SHARED ) -if( HAVE_ACC ) - set( LIBRARY_TYPE STATIC ) -endif() - ecbuild_add_library( TARGET ${ecwam} - TYPE ${LIBRARY_TYPE} SOURCES ${ecwam_srcs} PUBLIC_LIBS fiat parkind_${prec} ${ecwam}_intfb ${MPI_Fortran_LIBRARIES} From 5d7ce85f3cad3db1895a0cfe332fecce6834aa01 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Thu, 6 Feb 2025 20:43:50 +0100 Subject: [PATCH 4/6] Static linking flag now applies to entire bundle --- package/bundle/bundle.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/bundle/bundle.yml b/package/bundle/bundle.yml index a61386dc..b3c46d89 100644 --- a/package/bundle/bundle.yml +++ b/package/bundle/bundle.yml @@ -100,4 +100,4 @@ options : - with-static-linking : help : Build static libraries by default - cmake : ECWAM_BUILD_SHARED_LIBS=OFF + cmake : BUILD_SHARED_LIBS=OFF From 30d59efc2883c42205c4a598fd095fdf3682e4fb Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Thu, 6 Feb 2025 20:58:38 +0100 Subject: [PATCH 5/6] HPC-CI: add CPU build --- .github/workflows/build-hpc.yml | 60 ++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build-hpc.yml b/.github/workflows/build-hpc.yml index b5f50130..02c29fb5 100644 --- a/.github/workflows/build-hpc.yml +++ b/.github/workflows/build-hpc.yml @@ -29,6 +29,7 @@ jobs: matrix: name: - ac-gpu nvhpc sp + - ac-cpu intel sp include: - name: ac-gpu nvhpc sp @@ -41,6 +42,36 @@ jobs: #SBATCH --gpus-per-task=1 #SBATCH --mem=0 #SBATCH --qos=dg + modules: + - cmake + - fcm + - ninja + - ecbuild + - prgenv/nvidia + - hpcx-openmpi/2.14.0-cuda + - python3 + gpu: 1 + + - name: ac-cpu intel sp + site: ac-batch + sbatch_options: | + #SBATCH --time=00:20:00 + #SBATCH --nodes=1 + #SBATCH --ntasks=4 + #SBATCH --cpus-per-task=32 + #SBATCH --hint=nomultithread + #SBATCH --mem=60GB + #SBATCH --qos=np + modules: + - cmake + - fcm + - ninja + - ecbuild + - prgenv/intel + - intel/2021.4.0 + - hpcx-openmpi/2.9.0 + - python3 + gpu: 0 runs-on: [self-hosted, linux, hpc] env: @@ -52,22 +83,14 @@ jobs: troika_user: ${{ secrets.HPC_CI_SSH_USER }} sbatch_options: ${{ matrix.sbatch_options }} template_data: | - modules: - - cmake - - fcm - - ninja - - ecbuild - - prgenv/nvidia - - hpcx-openmpi/2.14.0-cuda - - python3 cmake_options: - -DENABLE_MPI=ON - - -DENABLE_LOKI=ON - - -DENABLE_ACC=ON - - -DENABLE_CUDA=ON - - -DENABLE_GPU_AWARE_MPI=ON + - -DENABLE_LOKI=${{ matrix.gpu }} + - -DENABLE_ACC=${{ matrix.gpu }} + - -DENABLE_CUDA=${{ matrix.gpu }} + - -DENABLE_GPU_AWARE_MPI=${{ matrix.gpu }} - -DENABLE_SINGLE_PRECISION=ON - - -DBUILD_SHARED_LIBS=OFF + - -DBUILD_SHARED_LIBS=${{ !matrix.gpu }} dependencies: ecmwf/eccodes: version: develop @@ -75,31 +98,36 @@ jobs: - -DENABLE_MEMFS=ON - -DENABLE_JPG=OFF - -DENABLE_PNG=OFF + - -DBUILD_SHARED_LIBS=${{ !matrix.gpu }} ecmwf/fckit: version: 0.13.0 cmake_options: - -DENABLE_TESTS=OFF - -DENABLE_FCKIT_VENV=ON + - -DBUILD_SHARED_LIBS=${{ !matrix.gpu }} ecmwf-ifs/fiat: version: 1.4.1 cmake_options: - -DENABLE_MPI=ON - -DENABLE_SINGLE_PRECISION=ON - -DENABLE_DOUBLE_PRECISION=OFF + - -DBUILD_SHARED_LIBS=${{ !matrix.gpu }} ecmwf-ifs/field_api: version: v0.3.1 cmake_options: - -DENABLE_TESTS=OFF - - -DENABLE_ACC=ON - - -DENABLE_CUDA=ON + - -DENABLE_ACC=${{ matrix.gpu }} + - -DENABLE_CUDA=${{ matrix.gpu }} - -DENABLE_SINGLE_PRECISION=ON - -DENABLE_DOUBLE_PRECISION=OFF + - -DBUILD_SHARED_LIBS=${{ !matrix.gpu }} ecmwf-ifs/loki: version: v0.2.9 cmake_options: + - -DENABLE_NO_INSTALL=${{ !matrix.gpu }} - -DENABLE_TESTS=OFF template: | - {% for module in modules %} + {% for module in "${{ join(matrix.modules, ',') }}".split(',') %} module load {{module}} {% endfor %} From 5f7f330a291860b5ed4009588211a9e641797fe2 Mon Sep 17 00:00:00 2001 From: Jean Bidlot Date: Thu, 30 Jan 2025 10:07:34 +0000 Subject: [PATCH 6/6] fix for infinite loop in getspec when reading input data on a different resolution than the model one --- src/ecwam/getspec.F90 | 125 ++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 67 deletions(-) diff --git a/src/ecwam/getspec.F90 b/src/ecwam/getspec.F90 index e2e73348..c5cb3853 100644 --- a/src/ecwam/getspec.F90 +++ b/src/ecwam/getspec.F90 @@ -290,52 +290,43 @@ SUBROUTINE GETSPEC(FL1, BLK2GLO, BLK2LOC, WVENVI, NBLKS, NBLKE, IREAD) GOTO 1021 ELSEIF (LLRESIZING .AND. IRET == JPGRIB_END_OF_FILE) THEN ! WE SHOULD HAVE THE MAXIMUM SIZE NECESSARY, START ALL OVER. + WRITE(IU06,*) '' + WRITE(IU06,*) '* GETSPEC: WE SHOULD HAVE THE MAXIMUM SIZE NECESSARY, START ALL OVER.' + WRITE(IU06,*) '' + CALL FLUSH(IU06) DEALLOCATE(INGRIB) LLRESIZING=.FALSE. CALL IGRIB_CLOSE_FILE(KFILE_HANDLE) CALL IGRIB_OPEN_FILE(KFILE_HANDLE,FILENAME(1:LFILE),'r') ISIZE=NBIT IF (.NOT.ALLOCATED(INGRIB)) ALLOCATE(INGRIB(ISIZE)) +! READ AGAIN UNTIL THE FIRST TIME WE ENCOUNTERED JPGRIB_BUFFER_TOO_SMALL + DO IBREAD=1,NBREAD_AGAIN + KBYTES=ISIZE*NPRECI + CALL IGRIB_READ_FROM_FILE(KFILE_HANDLE,INGRIB,KBYTES,IRET) + IF (IRET == JPGRIB_BUFFER_TOO_SMALL) THEN + WRITE(IU06,*) '****************************************************' + WRITE(IU06,*) '* GETSPEC: JPGRIB_BUFFER_TOO_SMALL SHOULD NOT HAPPEN' + WRITE(NULERR,*) '* GETSPEC: JPGRIB_BUFFER_TOO_SMALL SHOULD NOT HAPPEN' + WRITE(IU06,*) '****************************************************' + CALL ABORT1 + ELSEIF (IRET == JPGRIB_END_OF_FILE) THEN + WRITE(IU06,*) '**********************************' + WRITE(IU06,*) '* GETSPEC: END OF FILE ENCOUNTED' + WRITE(NULERR,*) '* GETSPEC: END OF FILE ENCOUNTED' + WRITE(IU06,*) '**********************************' + CALL ABORT1 + ELSEIF (IRET /= JPGRIB_SUCCESS) THEN + WRITE(IU06,*) '**********************************' + WRITE(IU06,*) '* GETSPEC: FILE HANDLING ERROR' + WRITE(NULERR,*) '* GETSPEC: FILE HANDLING ERROR' + WRITE(IU06,*) '**********************************' + CALL ABORT1 + ENDIF + ENDDO + NBREAD=IBREAD-1 + NBREAD_AGAIN=0 - NBREAD=NBREAD+1 - - CALL IGRIB_READ_FROM_FILE(KFILE_HANDLE,INGRIB,KBYTES,IRET) - IF (IRET == JPGRIB_BUFFER_TOO_SMALL) THEN - IF (.NOT.LLRESIZING) NBREAD_AGAIN=NBREAD - CALL KGRIBSIZE(IU06, KBYTES, NBIT, 'GETSPEC') - DEALLOCATE(INGRIB) - LLRESIZING=.TRUE. - GOTO 1021 - ELSEIF (LLRESIZING .AND. IRET /= JPGRIB_END_OF_FILE) THEN -! LOOP UNTIL YOU HAVE EXPLORE THE SIZE FOR THE WHOLE FILE. - DEALLOCATE(INGRIB) - GOTO 1021 - ELSEIF (LLRESIZING .AND. IRET == JPGRIB_END_OF_FILE) THEN -! WE SHOULD HAVE THE MAXIMUM SIZE NECESSARY, START ALL OVER. - DEALLOCATE(INGRIB) - LLRESIZING=.FALSE. - CALL IGRIB_CLOSE_FILE(KFILE_HANDLE) - CALL IGRIB_OPEN_FILE(KFILE_HANDLE,FILENAME(1:LFILE),'r') - ISIZE=NBIT - IF (.NOT.ALLOCATED(INGRIB)) ALLOCATE(INGRIB(ISIZE)) - DO IBREAD=1,NBREAD_AGAIN - KBYTES=ISIZE*NPRECI - CALL IGRIB_READ_FROM_FILE(KFILE_HANDLE,INGRIB,KBYTES,IRET) - ENDDO - NBREAD=IBREAD-1 - NBREAD_AGAIN=0 - - ELSEIF (IRET == JPGRIB_END_OF_FILE) THEN - WRITE(IU06,*) '**********************************' - WRITE(IU06,*) '* GETSPEC: END OF FILE ENCOUNTED' - WRITE(IU06,*) '**********************************' - CALL ABORT1 - ELSEIF (IRET /= JPGRIB_SUCCESS) THEN - WRITE(IU06,*) '**********************************' - WRITE(IU06,*) '* GETSPEC: FILE HANDLING ERROR' - WRITE(IU06,*) '**********************************' - CALL ABORT1 - ENDIF ENDIF ENDIF @@ -414,40 +405,40 @@ SUBROUTINE GETSPEC(FL1, BLK2GLO, BLK2LOC, WVENVI, NBLKS, NBLKE, IREAD) CALL IGRIB_RELEASE(KGRIB_HANDLE) IF (CDATE /= CDTPRO) THEN - WRITE(IU06,*)'**********************************' - WRITE(IU06,*)'* *' - WRITE(IU06,*)'* FATAL ERROR IN SUB GETSPEC *' - WRITE(IU06,*)'* =========================== *' - WRITE(IU06,*)'* *' - WRITE(IU06,*)'* REQUESTED DATE IS NOT EQUAL TO *' - WRITE(IU06,*)'* RETRIEVED DATE. *' - WRITE(IU06,*)'* IN FILE: ',FILENAME - WRITE(IU06,*)'* CDATE = ',CDATE - WRITE(IU06,*)'* CDTPRO = ',CDTPRO - WRITE(IU06,*)'* *' - WRITE(IU06,*)'**********************************' + WRITE(NULERR,*)'**********************************' + WRITE(NULERR,*)'* *' + WRITE(NULERR,*)'* FATAL ERROR IN SUB GETSPEC *' + WRITE(NULERR,*)'* =========================== *' + WRITE(NULERR,*)'* *' + WRITE(NULERR,*)'* REQUESTED DATE IS NOT EQUAL TO *' + WRITE(NULERR,*)'* RETRIEVED DATE. *' + WRITE(NULERR,*)'* IN FILE: ',FILENAME + WRITE(NULERR,*)'* CDATE = ',CDATE + WRITE(NULERR,*)'* CDTPRO = ',CDTPRO + WRITE(NULERR,*)'* *' + WRITE(NULERR,*)'**********************************' CALL ABORT1 ENDIF IF (K /= KK) THEN - WRITE(IU06,*) '************************************' - WRITE(IU06,*) '* FATAL ERROR IN SUB. GETSPEC *' - WRITE(IU06,*) '* REQUESTED AND DECODED DIRECTIONAL*' - WRITE(IU06,*) '* INDEX ARE DIFFERENT : *' - WRITE(IU06,*) '* REQUESTED : ',K - WRITE(IU06,*) '* DECODED : ',KK - WRITE(IU06,*) '* *' - WRITE(IU06,*) '************************************' + WRITE(NULERR,*) '************************************' + WRITE(NULERR,*) '* FATAL ERROR IN SUB. GETSPEC *' + WRITE(NULERR,*) '* REQUESTED AND DECODED DIRECTIONAL*' + WRITE(NULERR,*) '* INDEX ARE DIFFERENT : *' + WRITE(NULERR,*) '* REQUESTED : ',K + WRITE(NULERR,*) '* DECODED : ',KK + WRITE(NULERR,*) '* *' + WRITE(NULERR,*) '************************************' CALL ABORT1 ENDIF IF (M /= MM) THEN - WRITE(IU06,*) '************************************' - WRITE(IU06,*) '* FATAL ERROR IN SUB. GETSPEC *' - WRITE(IU06,*) '* REQUESTED AND DECODED FREQUENCY *' - WRITE(IU06,*) '* INDEX ARE DIFFERENT : *' - WRITE(IU06,*) '* REQUESTED : ',M - WRITE(IU06,*) '* DECODED : ',MM - WRITE(IU06,*) '* *' - WRITE(IU06,*) '************************************' + WRITE(NULERR,*) '************************************' + WRITE(NULERR,*) '* FATAL ERROR IN SUB. GETSPEC *' + WRITE(NULERR,*) '* REQUESTED AND DECODED FREQUENCY *' + WRITE(NULERR,*) '* INDEX ARE DIFFERENT : *' + WRITE(NULERR,*) '* REQUESTED : ',M + WRITE(NULERR,*) '* DECODED : ',MM + WRITE(NULERR,*) '* *' + WRITE(NULERR,*) '************************************' CALL ABORT1 ENDIF