diff --git a/env.sh b/env.sh
index 6fff86847..6c5d9b1f5 100755
--- a/env.sh
+++ b/env.sh
@@ -54,7 +54,12 @@ elif [[ $LOGINHOST =~ ^m[A-Za-z0-9]+\.hpc\.dkrz\.de$ ]]; then
    STRATEGY="mistral.dkrz.de"
 elif [[ $LOGINHOST =~ ^levante ]] || [[ $LOGINHOST =~ ^l[:alnum:]+\.lvt\.dkrz\.de$ ]]; then
    STRATEGY="levante.dkrz.de"
-elif [[ $LOGINHOST =~ ^ollie[0-9]$ ]] || [[ $LOGINHOST =~ ^prod-[0-9]{4}$ ]]; then
+   # The following regex only matches a two-word input such as levante.nvhpc; this enables using different shells for a machine directly
+   compid_regex="^([[:alnum:]]+)\.([[:alnum:]]+)$"
+   if [[ $LOGINHOST =~ $compid_regex ]]; then
+     COMPILERID="${BASH_REMATCH[2]}"
+   fi
+elif [[ $LOGINHOST =~ ^ollie[0-9]$ ]] || [[ $LOGINHOST =~ ^prod-[0-9]{4}$ ]]; then
    STRATEGY="ollie"
 elif [[ $LOGINHOST =~ ^albedo[0-9]$ ]] || [[ $LOGINHOST =~ ^prod-[0-9]{4}$ ]]; then
    STRATEGY="albedo"
diff --git a/env/levante.dkrz.de/shell.nvhpc b/env/levante.dkrz.de/shell.nvhpc
index eb2b776f6..f0ae54531 100755
--- a/env/levante.dkrz.de/shell.nvhpc
+++ b/env/levante.dkrz.de/shell.nvhpc
@@ -5,9 +5,10 @@ export CPU_MODEL=AMD_EPYC_ZEN3
 module --force purge
 # module load intel-oneapi-compilers/2022.0.1-gcc-11.2.0
 # module load openmpi/4.1.2-intel-2021.5.0
-module load nvhpc/22.5-gcc-11.2.0
-module load openmpi/.4.1.4-nvhpc-22.5
+module load nvhpc/23.9-gcc-11.2.0
+module load openmpi/4.1.6-nvhpc-23.9
 export FC=mpif90 CC=mpicc CXX=mpicxx;
+# export LD_LIBRARY_PATH=/sw/spack-levante/intel-oneapi-mkl-2022.0.1-ttdktf/mkl/2022.0.1/lib/intel64:$LD_LIBRARY_PATH
 
 module load netcdf-c/4.8.1-openmpi-4.1.2-intel-2021.5.0
 module load netcdf-fortran/4.5.3-openmpi-4.1.2-intel-2021.5.0
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 509e899a4..ae6283a09 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -58,12 +58,13 @@ endif()
 option(ENABLE_OPENACC "compile with OpenACC support" OFF)
 message(STATUS "ENABLE_OPENACC: ${ENABLE_OPENACC}")
-
-set(NV_GPU_ARCH "cc80" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")
+option(DISABLE_OPENACC_ATOMICS "disable kernels using atomic statement for reproducible results" ON)
+set(GPU_COMPUTE_CAPABILITY "cc80" CACHE STRING "GPU arch for nvfortran compiler (cc35,cc50,cc60,cc70,cc80,...)")
+set(GPU_FLAGS "cuda12.2,${GPU_COMPUTE_CAPABILITY}" CACHE STRING "Comma-separated sub-options passed to nvfortran -gpu= (CUDA toolkit version and compute capability)")
 
 option(ENABLE_OPENMP "build FESOM with OpenMP" OFF)
 message(STATUS "ENABLE_OPENMP: ${ENABLE_OPENMP}")
-if(ENABLE_OPENMP)
+if(${ENABLE_OPENMP})
    find_package(OpenMP REQUIRED COMPONENTS Fortran)
 endif()
 
@@ -184,7 +185,7 @@ target_link_libraries(${PROJECT_NAME} PRIVATE parms) #metis
 target_link_libraries(${PROJECT_NAME} PRIVATE MPI::MPI_Fortran)
 set_target_properties(${PROJECT_NAME} PROPERTIES LINKER_LANGUAGE Fortran)
 
-if(ENABLE_OPENMP)
+if(${ENABLE_OPENMP})
    target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_Fortran)
 endif()
 
@@ -256,7 +257,7 @@ elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU )
 elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL Cray )
    #target_compile_options(${PROJECT_NAME} PRIVATE -c -emf -hbyteswapio -hflex_mp=conservative -hfp1 -hadd_paren -Ounroll0 -hipa0 -r am -s real64 -N 1023 -g -G2 -O3)
    target_compile_options(${PROJECT_NAME} PRIVATE -c -emf -hbyteswapio -hflex_mp=conservative -hfp1 -hadd_paren -Ounroll0 -hipa0 -r am -s real64 -N 1023 -g -G2 -O2 -hnoacc -M878) #-hnoacc is a workaround for cray automatically activate -hacc, -M878 is to suppress ftn-878 warning
-   if(ENABLE_OPENMP)
+   if(${ENABLE_OPENMP})
       target_compile_options(${PROJECT_NAME} PRIVATE -homp)
    else()
       target_compile_options(${PROJECT_NAME} PRIVATE -hnoomp)
@@ -269,13 +270,19 @@ elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL Cray )
    endif()
 elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL NVHPC )
    target_compile_definitions(${PROJECT_NAME} PRIVATE ENABLE_NVHPC_WORKAROUNDS)
-   target_compile_options(${PROJECT_NAME} PRIVATE -fast -fastsse -O3 -Mallocatable=95 -Mr8 -pgf90libs)
-   if(ENABLE_OPENACC)
+   #target_compile_options(${PROJECT_NAME} PRIVATE -fast -fastsse -O3 -Mallocatable=95 -Mr8 -pgf90libs)
+   target_compile_options(${PROJECT_NAME} PRIVATE -Mnofma -Mallocatable=95 -Mr8 -pgf90libs)
+   if(${ENABLE_OPENACC})
       # additional compiler settings
-      target_compile_options(${PROJECT_NAME} PRIVATE -acc -ta=tesla:${NV_GPU_ARCH} -Minfo=accel)
-      set(CMAKE_EXE_LINKER_FLAGS "-acc -ta=tesla:${NV_GPU_ARCH}")
+      message(STATUS "Taking ENABLE_OPENACC = ON")
+      target_compile_options(${PROJECT_NAME} PRIVATE -acc -O2 -gpu=${GPU_FLAGS} -Minfo=accel)
+      # set(CMAKE_EXE_LINKER_FLAGS "-acc -gpu=${GPU_FLAGS}")
+      if(${DISABLE_OPENACC_ATOMICS})
+         message(STATUS "Taking DISABLE_OPENACC_ATOMICS = ON")
+         target_compile_definitions(${PROJECT_NAME} PRIVATE DISABLE_OPENACC_ATOMICS)
+      endif()
    endif()
-   if(ENABLE_OPENMP)
+   if(${ENABLE_OPENMP})
       target_compile_options(${PROJECT_NAME} PRIVATE -Mipa=fast)
    else()
       target_compile_options(${PROJECT_NAME} PRIVATE -Mipa=fast,inline)
diff --git a/src/fesom_module.F90 b/src/fesom_module.F90
index 843152e4e..4ca4ac633 100755
--- a/src/fesom_module.F90
+++ b/src/fesom_module.F90
@@ -382,6 +382,9 @@ subroutine fesom_init(fesom_total_nsteps)
     !$ACC CREATE (f%tracers%work%adv_flux_hor, f%tracers%work%adv_flux_ver, f%tracers%work%fct_LO) &
     !$ACC CREATE (f%tracers%work%del_ttf_advvert, f%tracers%work%del_ttf_advhoriz, f%tracers%work%edge_up_dn_grad) &
     !$ACC CREATE (f%tracers%work%del_ttf)
+
+    !! Creating variables in GPU memory for init_tracers_AB module
+    !$ACC ENTER DATA CREATE(tr_xy, tr_z, relax2clim, Sclim, Tclim)
 
   end subroutine
 
@@ -632,6 +635,9 @@ subroutine fesom_finalize()
     !$ACC EXIT DATA DELETE (f%dynamics%w, f%dynamics%w_e, f%dynamics%uv)
     !$ACC EXIT DATA DELETE (f%dynamics, f%tracers)
 
+    !! Deleting init_tracers_AB values
+    !$ACC EXIT DATA DELETE (tr_xy, tr_z, relax2clim, Sclim, Tclim)
+
     !delete mesh and partit data.
     !$ACC EXIT DATA DELETE (f%mesh%coriolis_node, f%mesh%nn_num, f%mesh%nn_pos)
     !$ACC EXIT DATA DELETE (f%mesh%ssh_stiff, f%mesh%ssh_stiff%rowptr)
diff --git a/src/ice_fct.F90 b/src/ice_fct.F90
index d60bf352d..df7fe3948 100755
--- a/src/ice_fct.F90
+++ b/src/ice_fct.F90
@@ -1122,7 +1122,9 @@ subroutine ice_fem_fct(tr_array_id, ice, partit, mesh)
     call exchange_nod(ice_temp, partit, luse_g2g = .true.)
 #endif
 
+#ifdef ENABLE_OPENACC
 !$ACC END DATA
+#endif
 !$OMP BARRIER
 
 end subroutine ice_fem_fct
diff --git a/src/oce_ale_tracer.F90 b/src/oce_ale_tracer.F90
index 1a0deaf5e..187339ede 100644
--- a/src/oce_ale_tracer.F90
+++ b/src/oce_ale_tracer.F90
@@ -148,6 +148,9 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh)
 subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh)
     use g_config
     use o_PARAM, only: SPP, Fer_GM
+    !tr_xy and tr_z are needed because we are writing them on the GPU in the init_tracers_AB subroutine
+    !and updating them so that the HOST can have access to them
+    use o_arrays, only: tr_xy, tr_z
     use mod_mesh
     USE MOD_PARTIT
     USE MOD_PARSUP
@@ -219,15 +222,20 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh)
 
         ! do tracer AB (Adams-Bashfort) interpolation only for advectiv part
         ! needed
         if (flag_debug .and. mype==0) print *, achar(27)//'[37m'//' --> call init_tracers_AB'//achar(27)//'[0m'
+        !$ACC UPDATE DEVICE(tracers%data(tr_num)%values, tracers%data(tr_num)%valuesAB)
         call init_tracers_AB(tr_num, tracers, partit, mesh)
 
+        !$ACC UPDATE HOST(tr_xy, tr_z)
+
         ! advect tracers
         if (flag_debug .and. mype==0) print *, achar(27)//'[37m'//' --> call adv_tracers_ale'//achar(27)//'[0m'
         !here update only those initialized in the init_tracers. (values, valuesAB, edge_up_dn_grad, ...)
-        !$ACC UPDATE DEVICE(tracers%data(tr_num)%values, tracers%data(tr_num)%valuesAB) &
-        !$ACC DEVICE(tracers%work%edge_up_dn_grad) !!&
+        !!!! UPDATE from hpc_tracer !!!!
+        !we don't have to update values/valuesAB here because they are updated before init_tracers_AB
+        !!$ACC UPDATE DEVICE(tracers%data(tr_num)%values, tracers%data(tr_num)%valuesAB) &
+        !$ACC UPDATE DEVICE(tracers%work%edge_up_dn_grad) !!&
         ! it will update del_ttf with contributions from horizontal and vertical advection parts (del_ttf_advhoriz and del_ttf_advvert)
         !$ACC wait(1)
         call do_oce_adv_tra(dt, UV, Wvel, Wvel_i, Wvel_e, tr_num, dynamics, tracers, partit, mesh)
 
diff --git a/src/oce_tracer_mod.F90 b/src/oce_tracer_mod.F90
index 944be884b..7bedf0b27 100755
--- a/src/oce_tracer_mod.F90
+++ b/src/oce_tracer_mod.F90
@@ -25,7 +25,10 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
     type(t_tracer), intent(inout), target :: tracers
     integer                               :: n,nz
 
+#ifndef ENABLE_OPENACC
+#else
 !$ACC parallel loop collapse(2) default(present) !!!async(1)
+#endif
     do n=1, partit%myDim_nod2D+partit%eDim_nod2D
        do nz=1, mesh%nl-1
           ! del_ttf will contain all advection / diffusion contributions for this tracer. Set it to 0 at the beginning!
@@ -34,44 +37,95 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
           tracers%work%del_ttf_advvert (nz, n) = 0.0_WP
        end do
     end do
+#ifndef ENABLE_OPENACC
+#else
 !$ACC end parallel loop
+#endif
+
+    ! AB interpolation
+    if (tracers%data(tr_num)%AB_order==2) then
+#ifndef ENABLE_OPENACC
 !$OMP PARALLEL DO
+#else
+!$ACC parallel loop collapse(2)
+#endif
     do n=1, partit%myDim_nod2D+partit%eDim_nod2D
-       ! AB interpolation
-       if (tracers%data(tr_num)%AB_order==2) then
-           tracers%data(tr_num)%valuesAB(:, n) =-(0.5_WP+epsilon)*tracers%data(tr_num)%valuesold(1, :, n)+(1.5_WP+epsilon)*tracers%data(tr_num)%values(:, n)
-       elseif (tracers%data(tr_num)%AB_order==3) then
-           tracers%data(tr_num)%valuesAB(:, n) =5.0_WP*tracers%data(tr_num)%valuesold(2, :, n)-16.0_WP*tracers%data(tr_num)%valuesold(1, :, n)+23.0_WP*tracers%data(tr_num)%values(:, n)
-           tracers%data(tr_num)%valuesAB(:, n) =tracers%data(tr_num)%valuesAB(:, n)/12.0_WP
-       end if
+       do nz = 1, mesh%nl-1
+           tracers%data(tr_num)%valuesAB(nz, n) =-(0.5_WP+epsilon)*tracers%data(tr_num)%valuesold(1, nz, n)+(1.5_WP+epsilon)*tracers%data(tr_num)%values(nz, n)
+       end do
     end do
+#ifndef ENABLE_OPENACC
 !$OMP END PARALLEL DO
+#else
+!$ACC end parallel loop
+#endif
+    ! AB interpolation contd
+    elseif (tracers%data(tr_num)%AB_order==3) then
+#ifndef ENABLE_OPENACC
+!$OMP PARALLEL DO
+#else
+!$ACC parallel loop collapse(2)
+#endif
+    do n=1, partit%myDim_nod2D+partit%eDim_nod2D
+       do nz = 1, mesh%nl-1
+           tracers%data(tr_num)%valuesAB(nz, n) =5.0_WP*tracers%data(tr_num)%valuesold(2, nz, n)-16.0_WP*tracers%data(tr_num)%valuesold(1, nz, n)+23.0_WP*tracers%data(tr_num)%values(nz, n)
+           tracers%data(tr_num)%valuesAB(nz, n) =tracers%data(tr_num)%valuesAB(nz, n)/12.0_WP
+       end do
+    end do
+#ifndef ENABLE_OPENACC
+!$OMP END PARALLEL DO
+#else
+!$ACC end parallel loop
+#endif
+    end if
+
     if (tracers%data(tr_num)%AB_order==2) then
+#ifndef ENABLE_OPENACC
 !$OMP PARALLEL DO
+#else
+!$ACC parallel loop collapse(2)
+#endif
         do n=1, partit%myDim_nod2d+partit%eDim_nod2D
-           tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n)
+           do nz = 1, mesh%nl-1
+              tracers%data(tr_num)%valuesold(1, nz, n)=tracers%data(tr_num)%values(nz, n)
+           end do
         end do
+#ifndef ENABLE_OPENACC
 !$OMP END PARALLEL DO
+#else
+!$ACC end parallel loop
+#endif
+
     elseif (tracers%data(tr_num)%AB_order==3) then
+#ifndef ENABLE_OPENACC
 !$OMP PARALLEL DO
+#else
+!$ACC parallel loop collapse(2)
+#endif
         do n=1, partit%myDim_nod2d+partit%eDim_nod2D
-           tracers%data(tr_num)%valuesold(2, :, n)=tracers%data(tr_num)%valuesold(1, :, n)
-           tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n)
+           do nz = 1, mesh%nl-1
+              tracers%data(tr_num)%valuesold(2, nz, n)=tracers%data(tr_num)%valuesold(1, nz, n)
+              tracers%data(tr_num)%valuesold(1, nz, n)=tracers%data(tr_num)%values(nz, n)
+           end do
         end do
+#ifndef ENABLE_OPENACC
 !$OMP END PARALLEL DO
+#else
+!$ACC end parallel loop
+#endif
     end if
 
     if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_elements'//achar(27)//'[0m'
     call tracer_gradient_elements(tracers%data(tr_num)%valuesAB, partit, mesh)
-    call exchange_elem_begin(tr_xy, partit)
+    call exchange_elem_begin(tr_xy, partit, luse_g2g = .true.)
 
     if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_z'//achar(27)//'[0m'
     call tracer_gradient_z(tracers%data(tr_num)%values, partit, mesh) !WHY NOT AB HERE? DSIDOREN!
     call exchange_elem_end(partit) ! tr_xy used in fill_up_dn_grad
 
 !$OMP BARRIER
 
-    call exchange_nod_begin(tr_z, partit) ! not used in fill_up_dn_grad
+    call exchange_nod_begin(tr_z, partit, luse_g2g = .true.) ! not used in fill_up_dn_grad
 
     if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call fill_up_dn_grad'//achar(27)//'[0m'
     call fill_up_dn_grad(tracers%work, partit, mesh)
@@ -79,7 +133,7 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
 
     if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_elements'//achar(27)//'[0m'
     call tracer_gradient_elements(tracers%data(tr_num)%values, partit, mesh) !redefine tr_arr to the current timestep
-    call exchange_elem(tr_xy, partit)
+    call exchange_elem(tr_xy, partit, luse_g2g = .true.)
 
 END SUBROUTINE init_tracers_AB
 !
@@ -105,7 +159,12 @@ SUBROUTINE tracer_gradient_elements(ttf, partit, mesh)
 #include "associate_mesh_def.h"
 #include "associate_part_ass.h"
 #include "associate_mesh_ass.h"
+#ifndef ENABLE_OPENACC
 !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(elem, elnodes, nz, nzmin, nzmax)
+#else
+!$ACC UPDATE DEVICE(gradient_sca)
+!$ACC parallel loop private(elnodes)
+#endif
     DO elem=1, myDim_elem2D
        elnodes=elem2D_nodes(:,elem)
        nzmin = ulevels(elem)
@@ -116,7 +175,11 @@ SUBROUTINE tracer_gradient_elements(ttf, partit, mesh)
           tr_xy(2,nz, elem)=sum(gradient_sca(4:6,elem)*ttf(nz,elnodes))
        END DO
     END DO
+#ifndef ENABLE_OPENACC
 !$OMP END PARALLEL DO
+#else
+!$ACC end parallel loop
+#endif
 END SUBROUTINE tracer_gradient_elements
 !
 !
@@ -141,7 +204,12 @@ SUBROUTINE tracer_gradient_z(ttf, partit, mesh)
 #include "associate_mesh_def.h"
 #include "associate_part_ass.h"
 #include "associate_mesh_ass.h"
+#ifndef ENABLE_OPENACC
 !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nz, nzmin, nzmax, dz)
+#else
+!$ACC UPDATE DEVICE(hnode_new)
+!$ACC parallel loop
+#endif
     DO n=1, myDim_nod2D+eDim_nod2D
        !!PS nlev=nlevels_nod2D(n)
        nzmax=nlevels_nod2D(n)
@@ -156,7 +224,11 @@ SUBROUTINE tracer_gradient_z(ttf, partit, mesh)
        tr_z(nzmin, n)=0.0_WP
        tr_z(nzmax, n)=0.0_WP
     END DO
+#ifndef ENABLE_OPENACC
 !$OMP END PARALLEL DO
+#else
+!$ACC end parallel loop
+#endif
 END SUBROUTINE tracer_gradient_z
 !
 !
@@ -184,7 +256,12 @@ SUBROUTINE relax_to_clim(tr_num, tracers, partit, mesh)
 
     trarr=>tracers%data(tr_num)%values(:,:)
     if ((clim_relax>1.0e-8_WP).and.(tracers%data(tr_num)%ID==1)) then
+#ifndef ENABLE_OPENACC
 !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nzmin, nzmax)
+#else
+!$ACC UPDATE DEVICE(relax2clim, Tclim)
+!$ACC parallel loop
+#endif
        DO n=1, myDim_nod2D
           nzmin = ulevels_nod2D(n)
           nzmax = nlevels_nod2D(n)
@@ -193,17 +270,30 @@ SUBROUTINE relax_to_clim(tr_num, tracers, partit, mesh)
           trarr(nzmin:nzmax-1,n)=trarr(nzmin:nzmax-1,n)+&
                 relax2clim(n)*dt*(Tclim(nzmin:nzmax-1,n)-trarr(nzmin:nzmax-1,n))
        END DO
+#ifndef ENABLE_OPENACC
 !$OMP END PARALLEL DO
+#else
+!$ACC end parallel loop
+#endif
     END if
     if ((clim_relax>1.0e-8_WP).and.(tracers%data(tr_num)%ID==2)) then
+#ifndef ENABLE_OPENACC
 !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nzmin, nzmax)
+#else
+!$ACC UPDATE DEVICE(relax2clim, Sclim)
+!$ACC parallel loop
+#endif
        DO n=1, myDim_nod2D
           nzmin = ulevels_nod2D(n)
           nzmax = nlevels_nod2D(n)
           trarr(nzmin:nzmax-1,n)=trarr(nzmin:nzmax-1,n)+&
                 relax2clim(n)*dt*(Sclim(nzmin:nzmax-1,n)-trarr(nzmin:nzmax-1,n))
        END DO
+#ifndef ENABLE_OPENACC
 !$OMP END PARALLEL DO
+#else
+!$ACC end parallel loop
+#endif
     END IF
 END SUBROUTINE relax_to_clim
 END MODULE o_tracers