diff --git a/.github/workflows/bittree.yml b/.github/workflows/bittree.yml index c12fbedc58f..687bf07c00d 100644 --- a/.github/workflows/bittree.yml +++ b/.github/workflows/bittree.yml @@ -52,7 +52,7 @@ jobs: mpiexec -n 2 ./main2d.gnu.TEST.MPI.ex inputs_bittree amr.plot_int=1000 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-15 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -104,7 +104,7 @@ jobs: mpiexec -n 2 ./main3d.gnu.TEST.MPI.ex inputs_bittree max_step=10 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-15 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index ec469bb5de6..a343832b510 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -59,7 +59,7 @@ jobs: make test_install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -117,7 +117,7 @@ jobs: make -j 2 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -159,7 +159,7 @@ jobs: make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" diff --git a/.github/workflows/dependencies/dependencies_codeplay.sh b/.github/workflows/dependencies/dependencies_codeplay.sh index dfd2e5a03f7..bf692d3c2c7 100755 --- a/.github/workflows/dependencies/dependencies_codeplay.sh +++ b/.github/workflows/dependencies/dependencies_codeplay.sh @@ -6,10 +6,24 @@ set -eu -o pipefail -curl -o oneapi_nvidia.sh -L "https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=nvidia&filters[]=linux&aat=$1" -chmod +x oneapi_nvidia.sh -sudo ./oneapi_nvidia.sh --yes +# `man apt.conf`: +# Number of retries to perform. If this is non-zero APT will retry +# failed files the given number of times. +echo 'Acquire::Retries "3";' | sudo tee /etc/apt/apt.conf.d/80-retries -curl -o oneapi_amd.sh -L "https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=amd&filters[]=linux&aat=$1" -chmod +x oneapi_amd.sh -sudo ./oneapi_amd.sh --yes +# https://developer.codeplay.com/apt/index.html +sudo wget -qO - https://developer.codeplay.com/apt/public.key | gpg --dearmor | sudo tee /usr/share/keyrings/codeplay-keyring.gpg > /dev/null +echo "deb [signed-by=/usr/share/keyrings/codeplay-keyring.gpg] https://developer.codeplay.com/apt all main" | sudo tee /etc/apt/sources.list.d/codeplay.list + +sudo apt-get update + +# try apt install up to five times, to avoid connection splits +status=1 +for itry in {1..5} +do + sudo apt-get install -y --no-install-recommends \ + $1 \ + && { sudo apt-get clean; status=0; break; } \ + || { sleep 10; } +done +if [[ ${status} -ne 0 ]]; then exit 1; fi diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh index 852342e4ac1..4673a7caed5 100755 --- a/.github/workflows/dependencies/dependencies_hip.sh +++ b/.github/workflows/dependencies/dependencies_hip.sh @@ -45,6 +45,9 @@ sudo apt-get install -y --no-install-recommends \ rocrand-dev \ rocprim-dev +# hiprand-dev is a new package that does not exist in old versions +sudo apt-get install -y --no-install-recommends hiprand-dev || true + # activate # source /etc/profile.d/rocm.sh diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index afc2044bdd7..aca7a9c8723 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -55,7 +55,7 @@ jobs: make test_install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -107,7 +107,7 @@ jobs: cmake --build build -j 2 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -155,7 +155,7 @@ jobs: cmake --build build -j 2 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -204,7 +204,7 @@ jobs: cmake --build build -j 2 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -263,7 +263,7 @@ jobs: # Let's not use clang-tidy for this test because it wants to use C++20. # ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - # make -j2 -f clang-tidy-ccache-misses.mak \ + # make -j2 -k -f clang-tidy-ccache-misses.mak \ # CLANG_TIDY=clang-tidy-12 \ # CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -320,7 +320,7 @@ jobs: make -j 2 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -384,7 +384,7 @@ jobs: make -j 2 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -426,7 +426,7 @@ jobs: make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -466,7 +466,7 @@ jobs: make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-15 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -506,7 +506,7 @@ jobs: make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -546,7 +546,7 @@ jobs: make install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -585,7 +585,7 @@ jobs: CCACHE=ccache ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -630,7 +630,7 @@ jobs: make -j 2 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-12 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" diff --git a/.github/workflows/hypre.yml b/.github/workflows/hypre.yml index 50423f3942a..871224fc79c 100644 --- a/.github/workflows/hypre.yml +++ b/.github/workflows/hypre.yml @@ -100,7 +100,7 @@ jobs: mpiexec -n 2 ./main3d.gnu.MPI.ex inputs.hypre ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" @@ -148,7 +148,7 @@ jobs: mpiexec -n 2 ./main2d.gnu.MPI.ex inputs.2d ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index d86035d916e..aa1036ddf94 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -44,7 +44,8 @@ jobs: -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which icpx) \ -DCMAKE_Fortran_COMPILER=$(which ifx) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=2 cmake --build build --parallel 2 ccache -s @@ -86,7 +87,8 @@ jobs: -DAMReX_GPU_BACKEND=SYCL \ -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which icpx) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=2 cmake --build build --parallel 2 ccache -s @@ -95,19 +97,16 @@ jobs: tests-oneapi-sycl-eb-nvidia: name: oneAPI SYCL for Nvidia GPUs [tests w/ EB] runs-on: ubuntu-latest - env: - CODEPLAYTOKEN: ${{ secrets.CODEPLAYTOKEN }} steps: - uses: actions/checkout@v3 - name: Dependencies - if: ${{ env.CODEPLAYTOKEN != '' }} run: | - .github/workflows/dependencies/dependencies_nvcc.sh + .github/workflows/dependencies/ubuntu_free_disk_space.sh .github/workflows/dependencies/dependencies_dpcpp.sh - .github/workflows/dependencies/dependencies_codeplay.sh ${{ env.CODEPLAYTOKEN }} + .github/workflows/dependencies/dependencies_nvcc.sh 12.0 + .github/workflows/dependencies/dependencies_codeplay.sh oneapi-nvidia-12.0 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - if: ${{ env.CODEPLAYTOKEN != '' }} uses: actions/cache@v3 with: path: ~/.cache/ccache @@ -115,9 +114,7 @@ jobs: restore-keys: | ccache-${{ github.workflow }}-${{ github.job }}-git- - name: Build & Install - if: ${{ env.CODEPLAYTOKEN != '' }} - # clang currently supports CUDA up to version 11.5 and a warning is issued with newer versions - env: {CXXFLAGS: "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-unknown-cuda-version"} + env: {CXXFLAGS: "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --offload-arch=sm_80 -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} run: | export CCACHE_COMPRESS=1 export CCACHE_COMPRESSLEVEL=10 @@ -136,7 +133,8 @@ jobs: -DAMReX_GPU_BACKEND=SYCL \ -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which clang++) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=2 cmake --build build --parallel 2 ccache -s @@ -145,20 +143,16 @@ jobs: no-tests-oneapi-sycl-amd: name: oneAPI SYCL for AMD GPUs runs-on: ubuntu-20.04 - env: - CODEPLAYTOKEN: ${{ secrets.CODEPLAYTOKEN }} steps: - uses: actions/checkout@v3 - name: Dependencies - if: ${{ env.CODEPLAYTOKEN != '' }} run: | .github/workflows/dependencies/ubuntu_free_disk_space.sh - .github/workflows/dependencies/dependencies_hip.sh 5.4.6 .github/workflows/dependencies/dependencies_dpcpp.sh - .github/workflows/dependencies/dependencies_codeplay.sh ${{ env.CODEPLAYTOKEN }} + .github/workflows/dependencies/dependencies_hip.sh 5.4.3 + .github/workflows/dependencies/dependencies_codeplay.sh oneapi-amd-5.4.3 .github/workflows/dependencies/dependencies_ccache.sh - name: Set Up Cache - if: ${{ env.CODEPLAYTOKEN != '' }} uses: actions/cache@v3 with: path: ~/.cache/ccache @@ -166,7 +160,6 @@ jobs: restore-keys: | ccache-${{ github.workflow }}-${{ github.job }}-git- - name: Build & Install - if: ${{ env.CODEPLAYTOKEN != '' }} env: {CXXFLAGS: "-fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a -fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} run: | export CCACHE_COMPRESS=1 @@ -186,7 +179,8 @@ jobs: -DAMReX_SYCL_SUB_GROUP_SIZE=64 \ -DCMAKE_C_COMPILER=$(which icx) \ -DCMAKE_CXX_COMPILER=$(which clang++) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_PARALLEL_LINK_JOBS=2 cmake --build build --parallel 2 ccache -s diff --git a/.github/workflows/petsc.yml b/.github/workflows/petsc.yml index 6d0b92b1343..eaddf1c2489 100644 --- a/.github/workflows/petsc.yml +++ b/.github/workflows/petsc.yml @@ -50,7 +50,7 @@ jobs: mpiexec -n 2 ./main2d.gnu.TEST.MPI.ex inputs.rt.2d.petsc ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml index 080a17fd984..d907b485261 100644 --- a/.github/workflows/smoke.yml +++ b/.github/workflows/smoke.yml @@ -47,7 +47,7 @@ jobs: make test_install ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-15 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" diff --git a/.github/workflows/sundials.yml b/.github/workflows/sundials.yml index 12dfd42c159..a890e10fad3 100644 --- a/.github/workflows/sundials.yml +++ b/.github/workflows/sundials.yml @@ -60,7 +60,7 @@ jobs: cmake --build build -j 2 ${{github.workspace}}/Tools/C_scripts/mmclt.py --input ${{github.workspace}}/ccache.log.txt - make -j2 -f clang-tidy-ccache-misses.mak \ + make -j2 -k -f clang-tidy-ccache-misses.mak \ CLANG_TIDY=clang-tidy-14 \ CLANG_TIDY_ARGS="--config-file=${{github.workspace}}/.clang-tidy --warnings-as-errors=*" diff --git a/CHANGES b/CHANGES index 82b28a03e87..cad5363b99e 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,53 @@ +# 24.01 + + -- MLMG: Use free functions instead of MF member functions (#3681) + + -- Add a few free functions for MLMG (#3680) + + -- Eliminating Matrix operations in MLMG CG bottom solver if initial vector is zero (#3668) + + -- Add a for loop that is unrolled at compile time (#3674) + + -- Add PTD version of getParticleCell (#3675) + + -- Improve ParIter docs (#3676) + + -- Fix CI for ROCm 6.0 (#3673) + + -- PureSoA IdCpu fixes (#3671) + + -- CMake: AMReX_PARALLEL_LINK_JOBS (#3628) + + -- Clang-Tidy in CI: Keep Going after Errors (#3667) + + -- Delete empty below comments on classes and functions (#3669) + + -- Documentation for Profiling: Hot Spots and Load Balance (#3622) + + -- Fix warnings in SortParticlesForDeposition (#3664) + + -- Fix Resize Issue of Fab with the Async Arena (#3663) + + -- Fix SuperParticle `push_back` (#3661) + + -- Pure SoA Particle: Separate Array for IdCPU (#3585) + + -- Limit the scope of gpu_rand_generator (#3659) + + -- Fix a typo in doxygen for NonLocalBC::FillBoundary (#3658) + + -- GNU Make: Fix name collision for aurora (#3656) + + -- two separate fixes -- particle_compare and ref_ratio=1 (#3655) + + -- Clarify documentation on setEBDirchlet() and fix link to AMReX-Hydro (#3652) + + -- Robustify the Cache Cleanup Scripts (#3650) + + -- Disable CodeQL scheduled jobs on forks (#3649) + + -- Work around compiler bug in nvcc 12.2 by using functor instead of lambda (#3653) + # 23.12 -- solve_cg: avoid use of MF `z` (#3637) diff --git a/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst b/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst index 8726f51a2ba..cdd774488e4 100644 --- a/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst +++ b/Docs/sphinx_documentation/source/AMReX_Profiling_Tools.rst @@ -93,6 +93,47 @@ it is also recommended to wrap any ``BL_PROFILE_TINY_FLUSH();`` calls in informative ``amrex::Print()`` lines to ensure accurate identification of each set of timers. +Hot Spots and Load Balance +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The output of TinyProfiler can help us to identify hot spots. For example, +the following output shows the top three hot spots of a linear solver test +running on 4 MPI processes. + +.. highlight:: console + +:: + + -------------------------------------------------------------------------------------------- + Name NCalls Excl. Min Excl. Avg Excl. Max Max % + -------------------------------------------------------------------------------------------- + MLPoisson::Fsmooth() 560 0.4775 0.4793 0.4815 34.97% + MLPoisson::Fapply() 114 0.1103 0.113 0.1167 8.48% + FabArray::Xpay() 109 0.1 0.1013 0.1038 7.54% + +In this test, there are 16 boxes evenly distributed among 4 MPI processes. The +output above shows that the load is perfectly balanced. However, if the load +is not balanced, the results can be very different and sometimes +misleading. For example, if we put 2, 2, 6 and 6 boxes on processes 0, 1, 2 +and 3, respectively, the top three hot spots now include two MPI +communication functions, ``FillBoundary`` and ``ParallelCopy``. + +.. highlight:: console + +:: + + -------------------------------------------------------------------------------------------- + Name NCalls Excl. Min Excl. Avg Excl. Max Max % + -------------------------------------------------------------------------------------------- + FillBoundary_finish() 607 0.01568 0.3367 0.6574 41.97% + MLPoisson::Fsmooth() 560 0.2133 0.4047 0.5973 38.13% + FabArray::ParallelCopy_finish() 231 0.002977 0.09748 0.1895 12.10% + +The reason that the MPI communication appears slow is that the lightly +loaded processes have to wait for messages sent by the heavily loaded +processes. See also :ref:`sec:profopts` for a diagnostic option that may +provide more insight on the load imbalance. + .. _sec:full:profiling: Full Profiling diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst index aff060e9166..08297cb3e2a 100644 --- a/Docs/sphinx_documentation/source/GPU.rst +++ b/Docs/sphinx_documentation/source/GPU.rst @@ -217,7 +217,7 @@ variables to configure the build +------------------------------+-------------------------------------------------+-------------+-----------------+ | SYCL_SUB_GROUP_SIZE | Specify subgroup size | 32 | 64, 32, 16 | +------------------------------+-------------------------------------------------+-------------+-----------------+ - | SYCL_MAX_PARALLEL_LINK_JOBS | Number of parallel jobs in device link | 1 | 1, 2, 3, etc. | + | SYCL_PARALLEL_LINK_JOBS | Number of parallel jobs in device link | 1 | 1, 2, 3, etc. | +------------------------------+-------------------------------------------------+-------------+-----------------+ .. raw:: latex @@ -428,22 +428,24 @@ Below is an example configuration for SYCL: .. table:: AMReX SYCL-specific build options - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | Variable Name | Description | Default | Possible values | - +==============================+=================================================+=============+=================+ - | AMReX_SYCL_AOT | Enable SYCL ahead-of-time compilation | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_AOT_GRF_MODE | Specify AOT register file mode | Default | Default, Large, | - | | | | AutoLarge | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMREX_INTEL_ARCH | Specify target if AOT is enabled | None | pvc, etc. | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_SPLIT_KERNEL | Enable SYCL kernel splitting | YES | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_ONEDPL | Enable SYCL's oneDPL algorithms | NO | YES, NO | - +------------------------------+-------------------------------------------------+-------------+-----------------+ - | AMReX_SYCL_SUB_GROUP_SIZE | Specify subgroup size | 32 | 64, 32, 16 | - +------------------------------+-------------------------------------------------+-------------+-----------------+ + +-------------------------------+----------------------------------------------+-------------+------------------+ + | Variable Name | Description | Default | Possible values | + +===============================+==============================================+=============+==================+ + | AMReX_SYCL_AOT | Enable SYCL ahead-of-time compilation | NO | YES, NO | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_AOT_GRF_MODE | Specify AOT register file mode | Default | Default, Large, | + | | | | AutoLarge | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMREX_INTEL_ARCH | Specify target if AOT is enabled | None | pvc, etc. | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_SPLIT_KERNEL | Enable SYCL kernel splitting | YES | YES, NO | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_ONEDPL | Enable SYCL's oneDPL algorithms | NO | YES, NO | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_SYCL_SUB_GROUP_SIZE | Specify subgroup size | 32 | 64, 32, 16 | + +-------------------------------+----------------------------------------------+-------------+------------------+ + | AMReX_PARALLEL_LINK_JOBS | Specify number of parallel link jobs | 1 | positive integer | + +-------------------------------+----------------------------------------------+-------------+------------------+ .. raw:: latex \end{center} diff --git a/Docs/sphinx_documentation/source/Particle.rst b/Docs/sphinx_documentation/source/Particle.rst index be8292c772b..e3a28591a72 100644 --- a/Docs/sphinx_documentation/source/Particle.rst +++ b/Docs/sphinx_documentation/source/Particle.rst @@ -86,7 +86,8 @@ tracked as the particle positions change. To do this, we provide the :: - ParticleContainer<3, 2, 4, 4> mypc; + using MyParticleContainer = ParticleContainer<3, 2, 4, 4>; + MyParticleContainer mypc; Like the :cpp:`Particle` class itself, the :cpp:`ParticleContainer` class is templated. The first two template parameters have the same meaning as @@ -375,8 +376,8 @@ example, to iterate over all the AoS data: :: - using MyParIter = ConstParIter<2*BL_SPACEDIM>; - for (MyParIter pti(pc, lev); pti.isValid(); ++pti) { + using MyParConstIter = MyParticleContainer::ParConstIterType; + for (MyParConstIter pti(pc, lev); pti.isValid(); ++pti) { const auto& particles = pti.GetArrayOfStructs(); for (const auto& p : particles) { // do stuff with p... @@ -392,7 +393,7 @@ skipped. You can also access the SoA data using the :math:`ParIter` as follows: :: - using MyParIter = ParIter<0, 0, 2, 2>; + using MyParIter = MyParticleContainer::ParIterType; for (MyParIter pti(pc, lev); pti.isValid(); ++pti) { auto& particle_attributes = pti.GetStructOfArrays(); RealVector& real_comp0 = particle_attributes.GetRealData(0); diff --git a/Src/Amr/AMReX_Amr.H b/Src/Amr/AMReX_Amr.H index a7173fd105a..bb18ec9d160 100644 --- a/Src/Amr/AMReX_Amr.H +++ b/Src/Amr/AMReX_Amr.H @@ -30,7 +30,6 @@ class AmrInSituBridge; * not belong on a single level, like establishing and updating the hierarchy * of levels, global timestepping, and managing the different AmrLevels */ - class Amr : public AmrCore { diff --git a/Src/Amr/AMReX_AmrLevel.H b/Src/Amr/AMReX_AmrLevel.H index d4ac6c7c70d..8abb00b5475 100644 --- a/Src/Amr/AMReX_AmrLevel.H +++ b/Src/Amr/AMReX_AmrLevel.H @@ -34,7 +34,6 @@ class TagBoxArray; * AmrLevel functions both as a container for state data on a level * and also manages the advancement of data in time. */ - class AmrLevel { friend class Amr; diff --git a/Src/Amr/AMReX_Derive.H b/Src/Amr/AMReX_Derive.H index e1a7310a7b2..1e0cceb7894 100644 --- a/Src/Amr/AMReX_Derive.H +++ b/Src/Amr/AMReX_Derive.H @@ -100,7 +100,6 @@ class DescriptorList; * from the state data contained in AmrLevel and its derivatives. Some * examples might be kinetic energy, vorticity, concentration gradients ... */ - class DeriveRec { friend class DeriveList; @@ -339,7 +338,6 @@ private: * * DeriveList manages and provides access to the list of DeriveRecs. */ - class DeriveList { public: diff --git a/Src/Amr/AMReX_LevelBld.H b/Src/Amr/AMReX_LevelBld.H index 8b421265bfc..bb79184ca5e 100644 --- a/Src/Amr/AMReX_LevelBld.H +++ b/Src/Amr/AMReX_LevelBld.H @@ -18,7 +18,6 @@ namespace amrex { * Abstract base class specifying an interface for building problem-specific * AmrLevels. */ - class LevelBld { public: diff --git a/Src/Amr/AMReX_StateData.H b/Src/Amr/AMReX_StateData.H index 251e6482a45..e6edb486c41 100644 --- a/Src/Amr/AMReX_StateData.H +++ b/Src/Amr/AMReX_StateData.H @@ -29,7 +29,6 @@ class StateDataPhysBCFunct; * * StateData holds state data on a level for the current and previous time step. */ - class StateData { friend class StateDataPhysBCFunct; diff --git a/Src/Amr/AMReX_StateDescriptor.H b/Src/Amr/AMReX_StateDescriptor.H index 6cd6c92cdd3..2830b955705 100644 --- a/Src/Amr/AMReX_StateDescriptor.H +++ b/Src/Amr/AMReX_StateDescriptor.H @@ -29,7 +29,6 @@ namespace amrex { /** * \brief Attributes of StateData. */ - class StateDescriptor { friend class DescriptorList; @@ -434,7 +433,6 @@ private: * * A container class for StateDescriptors. */ - class DescriptorList { public: diff --git a/Src/AmrCore/AMReX_AmrCore.H b/Src/AmrCore/AMReX_AmrCore.H index 20428b40930..2969b986a75 100644 --- a/Src/AmrCore/AMReX_AmrCore.H +++ b/Src/AmrCore/AMReX_AmrCore.H @@ -20,7 +20,6 @@ class AmrParGDB; * virtual functions to allocate, initialize and delete data. It also * requires the derived class to tag cells for refinement. */ - class AmrCore : public AmrMesh { diff --git a/Src/AmrCore/AMReX_Cluster.H b/Src/AmrCore/AMReX_Cluster.H index 5bbf5c796bc..7d60131e6cf 100644 --- a/Src/AmrCore/AMReX_Cluster.H +++ b/Src/AmrCore/AMReX_Cluster.H @@ -20,7 +20,6 @@ class ClusterList; * * Utility class for tagging error cells. */ - class Cluster { public: @@ -138,7 +137,6 @@ private: * * A container class for Cluster. */ - class ClusterList { public: diff --git a/Src/AmrCore/AMReX_ErrorList.H b/Src/AmrCore/AMReX_ErrorList.H index 9ab1a978966..ab4395d8ce8 100644 --- a/Src/AmrCore/AMReX_ErrorList.H +++ b/Src/AmrCore/AMReX_ErrorList.H @@ -102,7 +102,6 @@ extern "C" * actual error tagging will be through derivation, so provision is made * for this as well. */ - class ErrorRec { public: @@ -348,7 +347,6 @@ private: * * Container class for ErrorRecs. */ - class ErrorList { public: diff --git a/Src/AmrCore/AMReX_FillPatcher.H b/Src/AmrCore/AMReX_FillPatcher.H index d36b3529efd..5ff1c9550d3 100644 --- a/Src/AmrCore/AMReX_FillPatcher.H +++ b/Src/AmrCore/AMReX_FillPatcher.H @@ -68,7 +68,6 @@ namespace amrex { * See AmrLevel::RK for an example of using the RungeKutta functions and * FillPatcher together. */ - template class FillPatcher { diff --git a/Src/AmrCore/AMReX_FluxRegister.H b/Src/AmrCore/AMReX_FluxRegister.H index 4178eb289ba..f5983e18872 100644 --- a/Src/AmrCore/AMReX_FluxRegister.H +++ b/Src/AmrCore/AMReX_FluxRegister.H @@ -14,7 +14,6 @@ namespace amrex { * * Stores and manipulates fluxes at coarse-fine interfaces. */ - class FluxRegister : public BndryRegister diff --git a/Src/AmrCore/AMReX_InterpFaceRegister.H b/Src/AmrCore/AMReX_InterpFaceRegister.H index a63c2c23e4c..c54879bcaf6 100644 --- a/Src/AmrCore/AMReX_InterpFaceRegister.H +++ b/Src/AmrCore/AMReX_InterpFaceRegister.H @@ -12,7 +12,6 @@ namespace amrex { * \brief InterpFaceRegister is a coarse/fine boundary register for * interpolation of face data at the coarse/fine boundary. */ - class InterpFaceRegister { public: diff --git a/Src/AmrCore/AMReX_Interpolater.H b/Src/AmrCore/AMReX_Interpolater.H index d2fe66b0cbd..e1210a83329 100644 --- a/Src/AmrCore/AMReX_Interpolater.H +++ b/Src/AmrCore/AMReX_Interpolater.H @@ -17,7 +17,6 @@ class IArrayBox; * * Specifies interpolater interface for coarse-to-fine interpolation in space. */ - class Interpolater : public InterpBase { @@ -160,7 +159,6 @@ public: * * Bilinear interpolation on node centered data. */ - class NodeBilinear : public Interpolater @@ -219,7 +217,6 @@ public: * * Bilinear interpolation on cell centered data. */ - class CellBilinear : public Interpolater @@ -286,7 +283,6 @@ public: * sum_ivar a(ic,jc,ivar)*fab(if,jf,ivar) = 0 is satisfied * in all fine cells if,jf covering coarse cell ic,jc. */ - class CellConservativeLinear : public Interpolater @@ -344,7 +340,6 @@ protected: * Linear conservative interpolation on cell centered data * but with protection against undershoots or overshoots. */ - class CellConservativeProtected : public CellConservativeLinear @@ -393,7 +388,6 @@ public: * * Quadratic interpolation on cell centered data. */ - class CellQuadratic : public Interpolater @@ -451,7 +445,6 @@ public: /** * \brief Piecewise Constant interpolation on cell centered data. */ - class PCInterp : public Interpolater @@ -512,7 +505,6 @@ public: * in constructing the polynomial, the average of the polynomial inside that * cell is equal to the cell averaged value of the original data. */ - class CellConservativeQuartic : public Interpolater @@ -574,7 +566,6 @@ public: * a given coarse cell will have the same divergence, even when the coarse * grid divergence is spatially varying. */ - class FaceDivFree : public Interpolater @@ -667,7 +658,6 @@ public: * * Bilinear interpolation on data. */ - class FaceLinear : public Interpolater @@ -789,7 +779,6 @@ public: * * Quartic interpolation on cell centered data. */ - class CellQuartic : public Interpolater diff --git a/Src/AmrCore/AMReX_TagBox.H b/Src/AmrCore/AMReX_TagBox.H index 929e181e0e8..3d26f76e9cb 100644 --- a/Src/AmrCore/AMReX_TagBox.H +++ b/Src/AmrCore/AMReX_TagBox.H @@ -20,7 +20,6 @@ namespace amrex { * * This class is used to tag cells in a Box that need addition refinement. */ - class TagBox final : public BaseFab @@ -145,7 +144,6 @@ public: * * A container class for TagBoxes. */ - class TagBoxArray : public FabArray diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index d328f693a96..b93c476f86b 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -82,7 +82,6 @@ struct ArenaInfo * A virtual base class for objects that manage their own dynamic * memory allocation. */ - class Arena { public: diff --git a/Src/Base/AMReX_BArena.H b/Src/Base/AMReX_BArena.H index 9a3b4aa0f1b..d587d100859 100644 --- a/Src/Base/AMReX_BArena.H +++ b/Src/Base/AMReX_BArena.H @@ -11,7 +11,6 @@ namespace amrex { * This is the simplest dynamic memory management class derived from Arena. * Makes calls to std::malloc and std::free. */ - class BArena : public Arena diff --git a/Src/Base/AMReX_BCRec.H b/Src/Base/AMReX_BCRec.H index 268147a3a04..d23da777eda 100644 --- a/Src/Base/AMReX_BCRec.H +++ b/Src/Base/AMReX_BCRec.H @@ -10,10 +10,9 @@ namespace amrex { /** * \brief Boundary Condition Records. * Necessary information and functions for computing boundary conditions. +* +* This class has standard layout. And we should keep it so! */ - -// This class has standard layout. And we should keep it so! - class BCRec { public: diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H index e0331e7a0ae..eb8e5c59615 100644 --- a/Src/Base/AMReX_BaseFab.H +++ b/Src/Base/AMReX_BaseFab.H @@ -3528,7 +3528,6 @@ BaseFab::protected_divide (const BaseFab& src, const Box& srcbox, const Bo * and stored in component comp of this FAB. * This fab is returned as a reference for chaining. */ - template template BaseFab& diff --git a/Src/Base/AMReX_BoxDomain.H b/Src/Base/AMReX_BoxDomain.H index a82e5ddc72e..af92d631607 100644 --- a/Src/Base/AMReX_BoxDomain.H +++ b/Src/Base/AMReX_BoxDomain.H @@ -55,14 +55,12 @@ std::ostream& operator<< (std::ostream& os, const BoxDomain& bd); /** * \brief A List of Disjoint Boxes. +* * A BoxDomain is a BoxList with the restriction that Boxes in the list * are disjoint. +* Note that a BoxDomain is NOT a BoxList due to the protected inheritance. +* This is a concrete class, not a polymorphic one. */ - -//Note that a BoxDomain is NOT a BoxList due to the protected inheritance. -//This is a concrete class, not a polymorphic one. - - class BoxDomain : protected BoxList diff --git a/Src/Base/AMReX_BoxList.H b/Src/Base/AMReX_BoxList.H index c0ff30025ff..cab414d36c8 100644 --- a/Src/Base/AMReX_BoxList.H +++ b/Src/Base/AMReX_BoxList.H @@ -48,7 +48,6 @@ namespace amrex * IndexType. This class implements operations for sets of Boxes. * This is a concrete class, not a polymorphic one. */ - class BoxList { public: diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H index 163039df2ef..9547bc92f21 100644 --- a/Src/Base/AMReX_CArena.H +++ b/Src/Base/AMReX_CArena.H @@ -24,7 +24,6 @@ struct MemStat; * chunks of heap space and apportions it out as requested. It merges * together neighboring chunks on each free(). */ - class CArena : public Arena diff --git a/Src/Base/AMReX_CoordSys.H b/Src/Base/AMReX_CoordSys.H index ab946ffa3d0..24096c6f428 100644 --- a/Src/Base/AMReX_CoordSys.H +++ b/Src/Base/AMReX_CoordSys.H @@ -20,7 +20,6 @@ class FArrayBox; * * Routines for mapping between physical coordinate system and index space. */ - class CoordSys { public: diff --git a/Src/Base/AMReX_DistributionMapping.H b/Src/Base/AMReX_DistributionMapping.H index 0707532a0fc..e9aa82f16a2 100644 --- a/Src/Base/AMReX_DistributionMapping.H +++ b/Src/Base/AMReX_DistributionMapping.H @@ -37,7 +37,6 @@ class FabArrayBase; * BoxArray are as equal across CPUs as is possible. The SFC distribution is * based on a space filling curve. */ - class DistributionMapping { public: diff --git a/Src/Base/AMReX_FACopyDescriptor.H b/Src/Base/AMReX_FACopyDescriptor.H index 7e1e383d237..ca7d3f47029 100644 --- a/Src/Base/AMReX_FACopyDescriptor.H +++ b/Src/Base/AMReX_FACopyDescriptor.H @@ -103,7 +103,6 @@ FabCopyDescriptor::~FabCopyDescriptor () * \brief This class orchestrates filling a destination fab of size destFabBox * from fabarray on the local processor (myProc). */ - template class FabArrayCopyDescriptor { diff --git a/Src/Base/AMReX_FPC.H b/Src/Base/AMReX_FPC.H index 77c4dfa923b..8975ed8e9b9 100644 --- a/Src/Base/AMReX_FPC.H +++ b/Src/Base/AMReX_FPC.H @@ -15,7 +15,6 @@ namespace amrex { * namespaces, and we don't like global constants, we make them static * constant data members of this class. */ - class FPC { public: diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H index d8bc4411874..e2cf0ed9641 100644 --- a/Src/Base/AMReX_FabArrayBase.H +++ b/Src/Base/AMReX_FabArrayBase.H @@ -721,6 +721,11 @@ public: }; +[[nodiscard]] int nComp (FabArrayBase const& fa); +[[nodiscard]] IntVect nGrowVect (FabArrayBase const& fa); +[[nodiscard]] BoxArray const& boxArray (FabArrayBase const& fa); +[[nodiscard]] DistributionMapping const& DistributionMap (FabArrayBase const& fa); + #ifdef BL_USE_MPI bool CheckRcvStats (Vector& recv_stats, const Vector& recv_size, int tag); #endif diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp index 8dd8275f66a..6997f3489dd 100644 --- a/Src/Base/AMReX_FabArrayBase.cpp +++ b/Src/Base/AMReX_FabArrayBase.cpp @@ -2699,4 +2699,24 @@ FabArrayBase::flushParForCache () #endif +int nComp (FabArrayBase const& fa) +{ + return fa.nComp(); +} + +IntVect nGrowVect (FabArrayBase const& fa) +{ + return fa.nGrowVect(); +} + +BoxArray const& boxArray (FabArrayBase const& fa) +{ + return fa.boxArray(); +} + +DistributionMapping const& DistributionMap (FabArrayBase const& fa) +{ + return fa.DistributionMap(); +} + } diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index ca80a070f45..0897c57ed4f 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -1602,6 +1602,193 @@ Dot (FabArray const& x, int xcomp, FabArray const& y, int ycomp, int n return sm; } +//! dst = val +template ,int> = 0> +void setVal (MF& dst, typename MF::value_type val) +{ + dst.setVal(val); +} + +//! dst = val in ghost cells. +template ,int> = 0> +void setBndry (MF& dst, typename MF::value_type val, int scomp, int ncomp) +{ + dst.setBndry(val, scomp, ncomp); +} + +//! dst = src +template && + IsMultiFabLike_v, int> = 0> +void LocalCopy (DMF& dst, SMF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + amrex::Copy(dst, src, scomp, dcomp, ncomp, nghost); +} + +//! dst += src +template ,int> = 0> +void LocalAdd (MF& dst, MF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + amrex::Add(dst, src, scomp, dcomp, ncomp, nghost); +} + +//! dst += a * src +template ,int> = 0> +void Saxpy (MF& dst, typename MF::value_type a, MF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + MF::Saxpy(dst, a, src, scomp, dcomp, ncomp, nghost); +} + +//! dst = src + a * dst +template ,int> = 0> +void Xpay (MF& dst, typename MF::value_type a, MF const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + MF::Xpay(dst, a, src, scomp, dcomp, ncomp, nghost); +} + +//! dst = src w/ MPI communication +template , int> = 0> +void ParallelCopy (MF& dst, MF const& src, int scomp, int dcomp, int ncomp, + IntVect const& ng_src = IntVect(0), + IntVect const& ng_dst = IntVect(0), + Periodicity const& period = Periodicity::NonPeriodic()) +{ + dst.ParallelCopy(src, scomp, dcomp, ncomp, ng_src, ng_dst, period); +} + +template , int> = 0> +[[nodiscard]] typename MF::value_type +norminf (MF const& mf, int scomp, int ncomp, IntVect const& nghost, + bool local = false) +{ + return mf.norminf(scomp, ncomp, nghost, local); +} + +//! dst = val +template ,int> = 0> +void setVal (Array& dst, typename MF::value_type val) +{ + for (auto& mf: dst) { + mf.setVal(val); + } +} + +//! dst = val in ghost cells. +template ,int> = 0> +void setBndry (Array& dst, typename MF::value_type val, int scomp, int ncomp) +{ + for (auto& mf : dst) { + mf.setBndry(val, scomp, ncomp); + } +} + +//! dst = src +template && + IsMultiFabLike_v, int> = 0> +void LocalCopy (Array& dst, Array const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + amrex::Copy(dst[i], src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst += src +template ,int> = 0> +void LocalAdd (Array& dst, Array const& src, int scomp, int dcomp, + int ncomp, IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + amrex::Add(dst[i], src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst += a * src +template ,int> = 0> +void Saxpy (Array& dst, typename MF::value_type a, + Array const& src, int scomp, int dcomp, int ncomp, + IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + MF::Saxpy(dst[i], a, src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst = src + a * dst +template ,int> = 0> +void Xpay (Array& dst, typename MF::value_type a, + Array const& src, int scomp, int dcomp, int ncomp, + IntVect const& nghost) +{ + for (std::size_t i = 0; i < N; ++i) { + MF::Xpay(dst[i], a, src[i], scomp, dcomp, ncomp, nghost); + } +} + +//! dst = src w/ MPI communication +template , int> = 0> +void ParallelCopy (Array& dst, Array const& src, + int scomp, int dcomp, int ncomp, + IntVect const& ng_src = IntVect(0), + IntVect const& ng_dst = IntVect(0), + Periodicity const& period = Periodicity::NonPeriodic()) +{ + for (std::size_t i = 0; i < N; ++i) { + dst[i].ParallelCopy(src[i], scomp, dcomp, ncomp, ng_src, ng_dst, period); + } +} + +template , int> = 0> +[[nodiscard]] typename MF::value_type +norminf (Array const& mf, int scomp, int ncomp, IntVect const& nghost, + bool local = false) +{ + auto r = typename MF::value_type(0); + for (std::size_t i = 0; i < N; ++i) { + auto tmp = mf[i].norminf(scomp, ncomp, nghost, true); + r = std::max(r,tmp); + } + if (!local) { + ParallelAllReduce::Max(r, ParallelContext::CommunicatorSub()); + } + return r; +} + +template && (N > 0), + int> = 0> +[[nodiscard]] int nComp (Array const& mf) +{ + return mf[0].nComp(); +} + +template && (N > 0), + int> = 0> +[[nodiscard]] IntVect nGrowVect (Array const& mf) +{ + return mf[0].nGrowVect(); +} + +template && (N > 0), + int> = 0> +[[nodiscard]] BoxArray const& +boxArray (Array const& mf) +{ + return mf[0].boxArray(); +} + +template && (N > 0), + int> = 0> +[[nodiscard]] DistributionMapping const& +DistributionMap (Array const& mf) +{ + return mf[0].DistributionMap(); +} + } #endif diff --git a/Src/Base/AMReX_Geometry.H b/Src/Base/AMReX_Geometry.H index 4017273151a..550b42f2f6a 100644 --- a/Src/Base/AMReX_Geometry.H +++ b/Src/Base/AMReX_Geometry.H @@ -16,14 +16,6 @@ #include namespace amrex { -/** -* \class Geometry -* \brief Rectangular problem domain geometry. -* -* This class describes problem domain and coordinate system for -* RECTANGULAR problem domains. Since the problem domain is RECTANGULAR, -* periodicity is meaningful. -*/ class MultiFab; class DistributionMapping; @@ -67,6 +59,14 @@ public: int coord; }; +/** + * \class Geometry + * \brief Rectangular problem domain geometry. + * + * This class describes problem domain and coordinate system for + * RECTANGULAR problem domains. Since the problem domain is RECTANGULAR, + * periodicity is meaningful. + */ class Geometry : public CoordSys diff --git a/Src/Base/AMReX_IArrayBox.H b/Src/Base/AMReX_IArrayBox.H index b5240395f02..db0f26d5080 100644 --- a/Src/Base/AMReX_IArrayBox.H +++ b/Src/Base/AMReX_IArrayBox.H @@ -41,7 +41,6 @@ public: * This class does NOT provide a copy constructor or assignment operator. */ - class IArrayBox : public BaseFab diff --git a/Src/Base/AMReX_IndexType.H b/Src/Base/AMReX_IndexType.H index 02a56aae2a2..0fd613d2a99 100644 --- a/Src/Base/AMReX_IndexType.H +++ b/Src/Base/AMReX_IndexType.H @@ -19,7 +19,6 @@ namespace amrex { * enumerated type CellIndex to be either CELL or NODE; i.e. each of the * AMREX_SPACEDIM dimensions must be either CELL or NODE. */ - class IndexType { friend MPI_Datatype ParallelDescriptor::Mpi_typemap::type(); diff --git a/Src/Base/AMReX_IntVect.H b/Src/Base/AMReX_IntVect.H index fd71c93ae87..b2658a5ec94 100644 --- a/Src/Base/AMReX_IntVect.H +++ b/Src/Base/AMReX_IntVect.H @@ -42,7 +42,6 @@ int coarsen (int i, int ratio) noexcept * C++ array. In addition, the basic arithmetic operators have been overloaded * to implement scaling and translation operations. */ - class IntVect { friend MPI_Datatype ParallelDescriptor::Mpi_typemap::type(); diff --git a/Src/Base/AMReX_Loop.H b/Src/Base/AMReX_Loop.H index 84b39107e45..19e1c3e5191 100644 --- a/Src/Base/AMReX_Loop.H +++ b/Src/Base/AMReX_Loop.H @@ -211,6 +211,30 @@ void LoopConcurrentOnCpu (Box const& bx, int ncomp, F&& f) noexcept }}}} } +// Implementation of "constexpr for" based on +// https://artificial-mind.net/blog/2020/10/31/constexpr-for +// +// Approximates what one would get from a compile-time +// unrolling of the loop +// for (int i = 0; i < N; ++i) { +// f(i); +// } +// +// The mechanism is recursive: we evaluate f(i) at the current +// i and then call the for loop at i+1. f() is a lambda function +// that provides the body of the loop and takes only an integer +// i as its argument. + +template +AMREX_GPU_HOST_DEVICE AMREX_INLINE +constexpr void constexpr_for (F&& f) +{ + if constexpr (I < N) { + f(std::integral_constant()); + constexpr_for(f); + } +} + #include } diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H index 29af89ba88e..ca9b1ab7fff 100644 --- a/Src/Base/AMReX_MultiFabUtil.H +++ b/Src/Base/AMReX_MultiFabUtil.H @@ -637,13 +637,13 @@ void average_down (const FabArray& S_fine, FabArray& S_crse, - /** - * \brief Returns part of a norm based on two MultiFabs - * The MultiFabs MUST have the same underlying BoxArray. - * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) - * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n) - */ - +/** + * \brief Returns part of a norm based on two MultiFabs. + * + * The MultiFabs MUST have the same underlying BoxArray. + * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) + * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n) + */ template Real NormHelper (const MultiFab& x, int xcomp, @@ -696,14 +696,14 @@ NormHelper (const MultiFab& x, int xcomp, return sm; } - /** - * \brief Returns part of a norm based on three MultiFabs - * The MultiFabs MUST have the same underlying BoxArray. - * The Predicate pf is used to test the mask - * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) - * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n) - */ - +/** + * \brief Returns part of a norm based on three MultiFabs + * + * The MultiFabs MUST have the same underlying BoxArray. + * The Predicate pf is used to test the mask + * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n)) + * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n) + */ template Real NormHelper (const MMF& mask, diff --git a/Src/Base/AMReX_NFiles.H b/Src/Base/AMReX_NFiles.H index 824351b50dc..bd1518dd44c 100644 --- a/Src/Base/AMReX_NFiles.H +++ b/Src/Base/AMReX_NFiles.H @@ -23,7 +23,6 @@ namespace amrex { * nfi.Stream().write((const char *) data.dataPtr(), nChars); * } */ - class NFilesIter { public: diff --git a/Src/Base/AMReX_Orientation.H b/Src/Base/AMReX_Orientation.H index 263bb84a95e..61e3622b336 100644 --- a/Src/Base/AMReX_Orientation.H +++ b/Src/Base/AMReX_Orientation.H @@ -25,7 +25,6 @@ class OrientationIter; * AMREX_SPACEDIM-1 and then the AMREX_SPACEDIM high sides from direction 0 .. * AMREX_SPACEDIM-1. */ - class Orientation { public: diff --git a/Src/Base/AMReX_PArena.H b/Src/Base/AMReX_PArena.H index d2e8c8ebec3..75db747fd9f 100644 --- a/Src/Base/AMReX_PArena.H +++ b/Src/Base/AMReX_PArena.H @@ -15,7 +15,6 @@ namespace amrex { * \brief This arena uses CUDA stream-ordered memory allocator if available. * If not, use The_Arena(). */ - class PArena : public Arena diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H index 01a0098333e..b6f4799f2e8 100644 --- a/Src/Base/AMReX_ParmParse.H +++ b/Src/Base/AMReX_ParmParse.H @@ -267,7 +267,6 @@ class IntVect; * #endif * */ - class ParmParse { public: diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp index c2ecfc7b37a..6fe442bfc5b 100644 --- a/Src/Base/AMReX_ParmParse.cpp +++ b/Src/Base/AMReX_ParmParse.cpp @@ -450,7 +450,6 @@ ppfound (const std::string& keyword, // except if n==-1, return the index of the last occurrence. // Return 0 if the specified occurrence does not exist. // - const ParmParse::PP_entry* ppindex (const ParmParse::Table& table, int n, diff --git a/Src/Base/AMReX_RealVect.H b/Src/Base/AMReX_RealVect.H index 635d21927f6..9e1d72700f7 100644 --- a/Src/Base/AMReX_RealVect.H +++ b/Src/Base/AMReX_RealVect.H @@ -28,7 +28,6 @@ namespace amrex C++ array. In addition, the basic arithmetic operators have been overloaded to implement scaling and translation operations. */ - class RealVect { public: diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H index cfac0851cab..d68bf00bfb4 100644 --- a/Src/Base/AMReX_RungeKutta.H +++ b/Src/Base/AMReX_RungeKutta.H @@ -4,8 +4,6 @@ #include -namespace amrex::RungeKutta { - /** * \brief Functions for Runge-Kutta methods * @@ -48,6 +46,7 @@ namespace amrex::RungeKutta { * FillPatcher class can be useful for implementing such a callable. See * AmrLevel::RK for an example. */ +namespace amrex::RungeKutta { struct PostStageNoOp { template diff --git a/Src/Base/AMReX_TypeTraits.H b/Src/Base/AMReX_TypeTraits.H index 222576f05f5..fbcb7a2c0e3 100644 --- a/Src/Base/AMReX_TypeTraits.H +++ b/Src/Base/AMReX_TypeTraits.H @@ -37,6 +37,18 @@ namespace amrex template inline constexpr bool IsFabArray_v = IsFabArray::value; + template + struct IsMultiFabLike : std::false_type {}; + // + template + struct IsMultiFabLike && + IsBaseFab_v > > + : std::true_type {}; + // + template + inline constexpr bool IsMultiFabLike_v = IsMultiFabLike::value; + + template using EnableIf_t = typename std::enable_if::type; diff --git a/Src/Base/AMReX_Vector.H b/Src/Base/AMReX_Vector.H index c377076fe1b..18e14d5c3c0 100644 --- a/Src/Base/AMReX_Vector.H +++ b/Src/Base/AMReX_Vector.H @@ -20,7 +20,6 @@ namespace amrex { * Vector::operator[] provides bound checking when compiled with * DEBUG=TRUE. */ - template > class Vector : diff --git a/Src/Base/AMReX_VisMF.H b/Src/Base/AMReX_VisMF.H index f0b146f6a9d..468523e0039 100644 --- a/Src/Base/AMReX_VisMF.H +++ b/Src/Base/AMReX_VisMF.H @@ -29,7 +29,6 @@ class IArrayBox; * \brief File I/O for FabArray. * Wrapper class for reading/writing FabArray objects to disk in various "smart" ways. */ - class VisMF : public VisMFBuffer { diff --git a/Src/Boundary/AMReX_BoundCond.H b/Src/Boundary/AMReX_BoundCond.H index 834f790f6b5..963a2fa7ecc 100644 --- a/Src/Boundary/AMReX_BoundCond.H +++ b/Src/Boundary/AMReX_BoundCond.H @@ -16,7 +16,6 @@ namespace amrex { boundary conditions are specified via an integer identifier. This class maintains that integer. */ - class BoundCond { public: diff --git a/Src/Boundary/AMReX_FabSet.H b/Src/Boundary/AMReX_FabSet.H index f4ae8b7d247..9841555b336 100644 --- a/Src/Boundary/AMReX_FabSet.H +++ b/Src/Boundary/AMReX_FabSet.H @@ -40,7 +40,6 @@ namespace amrex { FabSets are used primarily as a data storage mechanism, and are manipulated by more sophisticated control classes. */ - template class FabSetT { diff --git a/Src/Boundary/AMReX_Mask.H b/Src/Boundary/AMReX_Mask.H index 3a41ea81913..02000250f49 100644 --- a/Src/Boundary/AMReX_Mask.H +++ b/Src/Boundary/AMReX_Mask.H @@ -22,7 +22,6 @@ namespace amrex { This class does NOT provide a copy constructor or assignment operator. */ - class Mask final : public BaseFab diff --git a/Src/Boundary/AMReX_YAFluxRegister.H b/Src/Boundary/AMReX_YAFluxRegister.H index 075a630a2f7..e26426ce15b 100644 --- a/Src/Boundary/AMReX_YAFluxRegister.H +++ b/Src/Boundary/AMReX_YAFluxRegister.H @@ -23,7 +23,6 @@ namespace amrex { `Reflux` is called to update the coarse cells next to the coarse/fine boundary. */ - template class YAFluxRegisterT { diff --git a/Src/EB/AMReX_EBFluxRegister.H b/Src/EB/AMReX_EBFluxRegister.H index 33ec811dcf6..72fec3b6a7c 100644 --- a/Src/EB/AMReX_EBFluxRegister.H +++ b/Src/EB/AMReX_EBFluxRegister.H @@ -53,7 +53,6 @@ namespace amrex { to add the part in ghost cells (excluding ghost cells covered by valid cells of other grids) to EBFluxRegister's internal data. */ - class EBFluxRegister : public YAFluxRegister { diff --git a/Src/Extern/Bittree/AMReX_Bittree.H b/Src/Extern/Bittree/AMReX_Bittree.H index 54a046be720..feb05e9f189 100644 --- a/Src/Extern/Bittree/AMReX_Bittree.H +++ b/Src/Extern/Bittree/AMReX_Bittree.H @@ -18,7 +18,6 @@ LIBRARIES += -lbittree Include in inputs: amr.use_bittree = true */ - class btUnit { // Functions used in AmrMesh public: diff --git a/Src/Extern/SUNDIALS/AMReX_NVector_MultiFab.cpp b/Src/Extern/SUNDIALS/AMReX_NVector_MultiFab.cpp index 8408f75c41d..34671fac1a8 100644 --- a/Src/Extern/SUNDIALS/AMReX_NVector_MultiFab.cpp +++ b/Src/Extern/SUNDIALS/AMReX_NVector_MultiFab.cpp @@ -24,7 +24,6 @@ namespace amrex::sundials { /* ---------------------------------------------------------------------------- * Function to create a new empty multifab vector */ - N_Vector N_VNewEmpty_MultiFab(sunindextype length, ::sundials::Context* sunctx) { /* Create vector */ @@ -76,7 +75,6 @@ N_Vector N_VNewEmpty_MultiFab(sunindextype length, ::sundials::Context* sunctx) /* ---------------------------------------------------------------------------- * Function to create a new MultiFab vector */ - N_Vector N_VNew_MultiFab(sunindextype length, const amrex::BoxArray &ba, const amrex::DistributionMapping &dm, @@ -102,7 +100,6 @@ N_Vector N_VNew_MultiFab(sunindextype length, /* ---------------------------------------------------------------------------- * Function to create a MultiFab N_Vector with user-specific MultiFab */ - N_Vector N_VMake_MultiFab(sunindextype length, amrex::MultiFab *v_mf, ::sundials::Context* sunctx) { diff --git a/Src/Extern/SUNDIALS/AMReX_SUNMemory.H b/Src/Extern/SUNDIALS/AMReX_SUNMemory.H index f7700ce4210..5fc01c3b6b9 100644 --- a/Src/Extern/SUNDIALS/AMReX_SUNMemory.H +++ b/Src/Extern/SUNDIALS/AMReX_SUNMemory.H @@ -13,7 +13,6 @@ namespace amrex::sundials { * * This class allows SUNDIALS to allocate memory using the amrex::Arena. */ - class MemoryHelper { public: MemoryHelper(::sundials::Context* sunctx); diff --git a/Src/Extern/SUNDIALS/AMReX_Sundials_Core.H b/Src/Extern/SUNDIALS/AMReX_Sundials_Core.H index bb3695d19ac..090a5f43534 100644 --- a/Src/Extern/SUNDIALS/AMReX_Sundials_Core.H +++ b/Src/Extern/SUNDIALS/AMReX_Sundials_Core.H @@ -15,7 +15,6 @@ namespace amrex::sundials { * This will create the nthreads SUNDIALS context objects that are needed by * the SUNDIALS solver and vector objects. Called by amrex::Initialize. */ - void Initialize(int nthreads); /** @@ -23,7 +22,6 @@ void Initialize(int nthreads); * * Called by amrex::Finalize. */ - void Finalize(); /** @@ -33,7 +31,6 @@ void Finalize(); * * A SUNDIALS context should not be used concurrently from different threads. */ - ::sundials::Context* The_Sundials_Context(int i = amrex::OpenMP::get_thread_num()); } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H index 3afa56ee245..3bfab3c9f61 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H @@ -12,8 +12,8 @@ class MLCGSolverT { public: - using FAB = typename MF::fab_type; - using RT = typename MF::value_type; + using FAB = typename MLLinOpT::FAB; + using RT = typename MLLinOpT::RT; enum struct Type { BiCGStab, CG }; @@ -42,6 +42,16 @@ public: void setMaxIter (int _maxiter) { maxiter = _maxiter; } [[nodiscard]] int getMaxIter () const { return maxiter; } + + /** + * Is the initial guess provided to the solver zero ? + * If so, set this to true. + * The solver will avoid a few operations if this is true. + * Default is false. + */ + void setInitSolnZeroed (bool _sol_zeroed) { initial_vec_zeroed = _sol_zeroed; } + [[nodiscard]] bool getInitSolnZeroed () const { return initial_vec_zeroed; } + void setNGhost(int _nghost) {nghost = IntVect(_nghost);} [[nodiscard]] int getNGhost() {return nghost[0];} @@ -62,6 +72,7 @@ private: int maxiter = 100; IntVect nghost = IntVect(0); int iter = -1; + bool initial_vec_zeroed = false; }; template @@ -88,27 +99,34 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) { BL_PROFILE("MLCGSolver::bicgstab"); - const int ncomp = sol.nComp(); + const int ncomp = nComp(sol); - MF p = Lp.make(amrlev, mglev, sol.nGrowVect()); - MF r = Lp.make(amrlev, mglev, sol.nGrowVect()); - p.setVal(RT(0.0)); // Make sure all entries are initialized to avoid errors - r.setVal(RT(0.0)); + MF p = Lp.make(amrlev, mglev, nGrowVect(sol)); + MF r = Lp.make(amrlev, mglev, nGrowVect(sol)); + setVal(p, RT(0.0)); // Make sure all entries are initialized to avoid errors + setVal(r, RT(0.0)); - MF sorig = Lp.make(amrlev, mglev, nghost); MF rh = Lp.make(amrlev, mglev, nghost); MF v = Lp.make(amrlev, mglev, nghost); MF t = Lp.make(amrlev, mglev, nghost); - Lp.correctionResidual(amrlev, mglev, r, sol, rhs, MLLinOpT::BCMode::Homogeneous); - // Then normalize - Lp.normalize(amrlev, mglev, r); + MF sorig; - sorig.LocalCopy(sol,0,0,ncomp,nghost); - rh.LocalCopy (r ,0,0,ncomp,nghost); + if ( initial_vec_zeroed ) { + LocalCopy(r,rhs,0,0,ncomp,nghost); + } else { + sorig = Lp.make(amrlev, mglev, nghost); - sol.setVal(RT(0.0)); + Lp.correctionResidual(amrlev, mglev, r, sol, rhs, MLLinOpT::BCMode::Homogeneous); + + LocalCopy(sorig,sol,0,0,ncomp,nghost); + setVal(sol, RT(0.0)); + } + + // Then normalize + Lp.normalize(amrlev, mglev, r); + LocalCopy(rh, r, 0,0,ncomp,nghost); RT rnorm = norm_inf(r); const RT rnorm0 = rnorm; @@ -141,13 +159,13 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) } if ( iter == 1 ) { - p.LocalCopy(r,0,0,ncomp,nghost); + LocalCopy(p,r,0,0,ncomp,nghost); } else { const RT beta = (rho/rho_1)*(alpha/omega); - MF::Saxpy(p, -omega, v, 0, 0, ncomp, nghost); // p += -omega*v - MF::Xpay(p, beta, r, 0, 0, ncomp, nghost); // p = r + beta*p + Saxpy(p, -omega, v, 0, 0, ncomp, nghost); // p += -omega*v + Xpay(p, beta, r, 0, 0, ncomp, nghost); // p = r + beta*p } Lp.apply(amrlev, mglev, v, p, MLLinOpT::BCMode::Homogeneous, MLLinOpT::StateMode::Correction); Lp.normalize(amrlev, mglev, v); @@ -161,8 +179,8 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) { ret = 2; break; } - MF::Saxpy(sol, alpha, p, 0, 0, ncomp, nghost); // sol += alpha * p - MF::Saxpy(r, -alpha, v, 0, 0, ncomp, nghost); // r += -alpha * v + Saxpy(sol, alpha, p, 0, 0, ncomp, nghost); // sol += alpha * p + Saxpy(r, -alpha, v, 0, 0, ncomp, nghost); // r += -alpha * v rnorm = norm_inf(r); rnorm = norm_inf(r); @@ -198,8 +216,8 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) { ret = 3; break; } - MF::Saxpy(sol, omega, r, 0, 0, ncomp, nghost); // sol += omega * r - MF::Saxpy(r, -omega, t, 0, 0, ncomp, nghost); // r += -omega * t + Saxpy(sol, omega, r, 0, 0, ncomp, nghost); // sol += omega * r + Saxpy(r, -omega, t, 0, 0, ncomp, nghost); // r += -omega * t rnorm = norm_inf(r); @@ -238,12 +256,16 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( ( ret == 0 || ret == 8 ) && (rnorm < rnorm0) ) { - sol.LocalAdd(sorig, 0, 0, ncomp, nghost); + if ( !initial_vec_zeroed ) { + LocalAdd(sol, sorig, 0, 0, ncomp, nghost); + } } else { - sol.setVal(RT(0.0)); - sol.LocalAdd(sorig, 0, 0, ncomp, nghost); + setVal(sol, RT(0.0)); + if ( !initial_vec_zeroed ) { + LocalAdd(sol, sorig, 0, 0, ncomp, nghost); + } } return ret; @@ -255,20 +277,26 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) { BL_PROFILE("MLCGSolver::cg"); - const int ncomp = sol.nComp(); + const int ncomp = nComp(sol); - MF p = Lp.make(amrlev, mglev, sol.nGrowVect()); - p.setVal(RT(0.0)); + MF p = Lp.make(amrlev, mglev, nGrowVect(sol)); + setVal(p, RT(0.0)); - MF sorig = Lp.make(amrlev, mglev, nghost); MF r = Lp.make(amrlev, mglev, nghost); MF q = Lp.make(amrlev, mglev, nghost); - sorig.LocalCopy(sol,0,0,ncomp,nghost); + MF sorig; - Lp.correctionResidual(amrlev, mglev, r, sol, rhs, MLLinOpT::BCMode::Homogeneous); + if ( initial_vec_zeroed ) { + LocalCopy(r,rhs,0,0,ncomp,nghost); + } else { + sorig = Lp.make(amrlev, mglev, nghost); - sol.setVal(RT(0.0)); + Lp.correctionResidual(amrlev, mglev, r, sol, rhs, MLLinOpT::BCMode::Homogeneous); + + LocalCopy(sorig,sol,0,0,ncomp,nghost); + setVal(sol, RT(0.0)); + } RT rnorm = norm_inf(r); const RT rnorm0 = rnorm; @@ -302,12 +330,12 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) } if (iter == 1) { - p.LocalCopy(r,0,0,ncomp,nghost); + LocalCopy(p,r,0,0,ncomp,nghost); } else { RT beta = rho/rho_1; - MF::Xpay(p, beta, r, 0, 0, ncomp, nghost); // p = r + beta * p + Xpay(p, beta, r, 0, 0, ncomp, nghost); // p = r + beta * p } Lp.apply(amrlev, mglev, q, p, MLLinOpT::BCMode::Homogeneous, MLLinOpT::StateMode::Correction); @@ -329,8 +357,8 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) << " rho " << rho << " alpha " << alpha << '\n'; } - MF::Saxpy(sol, alpha, p, 0, 0, ncomp, nghost); // sol += alpha * p - MF::Saxpy(r, -alpha, q, 0, 0, ncomp, nghost); // r += -alpha * q + Saxpy(sol, alpha, p, 0, 0, ncomp, nghost); // sol += alpha * p + Saxpy(r, -alpha, q, 0, 0, ncomp, nghost); // r += -alpha * q rnorm = norm_inf(r); if ( verbose > 2 ) @@ -364,12 +392,16 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( ( ret == 0 || ret == 8 ) && (rnorm < rnorm0) ) { - sol.LocalAdd(sorig, 0, 0, ncomp, nghost); + if ( !initial_vec_zeroed ) { + LocalAdd(sol, sorig, 0, 0, ncomp, nghost); + } } else { - sol.setVal(RT(0.0)); - sol.LocalAdd(sorig, 0, 0, ncomp, nghost); + setVal(sol, RT(0.0)); + if ( !initial_vec_zeroed ) { + LocalAdd(sol, sorig, 0, 0, ncomp, nghost); + } } return ret; @@ -390,8 +422,8 @@ template auto MLCGSolverT::norm_inf (const MF& res, bool local) -> RT { - int ncomp = res.nComp(); - RT result = res.norminf(0,ncomp,IntVect(0),true); + int ncomp = nComp(res); + RT result = norminf(res,0,ncomp,IntVect(0),true); if (!local) { BL_PROFILE("MLCGSolver::ParallelAllReduce"); ParallelAllReduce::Max(result, Lp.BottomCommunicator()); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H index b8aa71eebd0..f0dca07f3ab 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H @@ -85,6 +85,15 @@ struct LinOpEnumType enum struct Location { FaceCenter, FaceCentroid, CellCenter, CellCentroid }; }; +template struct LinOpData {}; +// +template +struct LinOpData > > +{ + using fab_type = typename T::fab_type; + using value_type = typename T::value_type; +}; + template class MLMGT; template class MLCGSolverT; template class MLPoissonT; @@ -100,8 +109,8 @@ public: template friend class MLPoissonT; template friend class MLABecLaplacianT; - using FAB = typename MF::fab_type; - using RT = typename MF::value_type; + using FAB = typename LinOpData::fab_type; + using RT = typename LinOpData::value_type; using BCType = LinOpBCType; using BCMode = LinOpEnumType::BCMode; @@ -1375,13 +1384,18 @@ template void MLLinOpT::make (Vector >& mf, IntVect const& ng) const { - mf.clear(); - mf.resize(m_num_amr_levels); - for (int alev = 0; alev < m_num_amr_levels; ++alev) { - mf[alev].resize(m_num_mg_levels[alev]); - for (int mlev = 0; mlev < m_num_mg_levels[alev]; ++mlev) { - mf[alev][mlev] = make(alev, mlev, ng); + if constexpr (IsMultiFabLike_v) { + mf.clear(); + mf.resize(m_num_amr_levels); + for (int alev = 0; alev < m_num_amr_levels; ++alev) { + mf[alev].resize(m_num_mg_levels[alev]); + for (int mlev = 0; mlev < m_num_mg_levels[alev]; ++mlev) { + mf[alev][mlev] = make(alev, mlev, ng); + } } + } else { + amrex::ignore_unused(mf, ng); + amrex::Abort("MLLinOpT::make: how did we get here?"); } } @@ -1389,39 +1403,62 @@ template MF MLLinOpT::make (int amrlev, int mglev, IntVect const& ng) const { - return MF(amrex::convert(m_grids[amrlev][mglev], m_ixtype), - m_dmap[amrlev][mglev], getNComp(), ng, MFInfo(), - *m_factory[amrlev][mglev]); + if constexpr (IsMultiFabLike_v) { + return MF(amrex::convert(m_grids[amrlev][mglev], m_ixtype), + m_dmap[amrlev][mglev], getNComp(), ng, MFInfo(), + *m_factory[amrlev][mglev]); + } else { + amrex::ignore_unused(amrlev, mglev, ng); + amrex::Abort("MLLinOpT::make: how did we get here?"); + return {}; + } } template MF MLLinOpT::makeAlias (MF const& mf) const { - return MF(mf, amrex::make_alias, 0, mf.nComp()); + if constexpr (IsMultiFabLike_v) { + return MF(mf, amrex::make_alias, 0, mf.nComp()); + } else { + amrex::ignore_unused(mf); + amrex::Abort("MLLinOpT::makeAlias: how did we get here?"); + return {}; + } } template MF MLLinOpT::makeCoarseMG (int amrlev, int mglev, IntVect const& ng) const { - BoxArray cba = m_grids[amrlev][mglev]; - IntVect ratio = (amrlev > 0) ? IntVect(2) : mg_coarsen_ratio_vec[mglev]; - cba.coarsen(ratio); - cba.convert(m_ixtype); - return MF(cba, m_dmap[amrlev][mglev], getNComp(), ng); - + if constexpr (IsMultiFabLike_v) { + BoxArray cba = m_grids[amrlev][mglev]; + IntVect ratio = (amrlev > 0) ? IntVect(2) : mg_coarsen_ratio_vec[mglev]; + cba.coarsen(ratio); + cba.convert(m_ixtype); + return MF(cba, m_dmap[amrlev][mglev], getNComp(), ng); + } else { + amrex::ignore_unused(amrlev, mglev, ng); + amrex::Abort("MLLinOpT::makeCoarseMG: how did we get here?"); + return {}; + } } template MF MLLinOpT::makeCoarseAmr (int famrlev, IntVect const& ng) const { - BoxArray cba = m_grids[famrlev][0]; - IntVect ratio(AMRRefRatio(famrlev-1)); - cba.coarsen(ratio); - cba.convert(m_ixtype); - return MF(cba, m_dmap[famrlev][0], getNComp(), ng); + if constexpr (IsMultiFabLike_v) { + BoxArray cba = m_grids[famrlev][0]; + IntVect ratio(AMRRefRatio(famrlev-1)); + cba.coarsen(ratio); + cba.convert(m_ixtype); + return MF(cba, m_dmap[famrlev][0], getNComp(), ng); + } else { + amrex::ignore_unused(famrlev, ng); + amrex::Abort("MLLinOpT::makeCoarseAmr: how did we get here?"); + return {}; + } } template diff --git a/Src/LinearSolvers/MLMG/AMReX_MLMG.H b/Src/LinearSolvers/MLMG/AMReX_MLMG.H index 70e7e121486..9bfc2f0007d 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLMG.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLMG.H @@ -21,8 +21,8 @@ public: template friend class MLCGSolverT; - using FAB = typename MF::fab_type; - using RT = typename MF::value_type; + using FAB = typename MLLinOpT::FAB; + using RT = typename MLLinOpT::RT; using BCMode = typename MLLinOpT::BCMode; using Location = typename MLLinOpT::Location; @@ -507,7 +507,7 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, for (int alev = 0; alev < namrlevs; ++alev) { if (!sol_is_alias[alev]) { - a_sol[alev]->LocalCopy(sol[alev], 0, 0, ncomp, ng_back); + LocalCopy(*a_sol[alev], sol[alev], 0, 0, ncomp, ng_back); } } @@ -541,11 +541,11 @@ MLMGT::getGradSolution (const Vector >& a_grad_so Array grad_sol; for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { auto const& amf = *(a_grad_sol[alev][idim]); - grad_sol[idim].define(amf.boxArray(), amf.DistributionMap(), ncomp, 0); + grad_sol[idim].define(boxArray(amf), DistributionMap(amf), ncomp, 0); } linop.compGrad(alev, GetArrOfPtrs(grad_sol), sol[alev], a_loc); for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - a_grad_sol[alev][idim]->LocalCopy(grad_sol[idim], 0, 0, ncomp, IntVect(0)); + LocalCopy(*a_grad_sol[alev][idim], grad_sol[idim], 0, 0, ncomp, IntVect(0)); } } } @@ -578,13 +578,13 @@ MLMGT::getFluxes (const Vector >& a_flux, for (int ilev = 0; ilev < namrlevs; ++ilev) { for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { auto const& amf = *(a_flux[ilev][idim]); - fluxes[ilev][idim].define(amf.boxArray(), amf.DistributionMap(), ncomp, 0); + fluxes[ilev][idim].define(boxArray(amf), DistributionMap(amf), ncomp, 0); } } getFluxes(GetVecOfArrOfPtrs(fluxes), GetVecOfPtrs(sol), a_loc); for (int ilev = 0; ilev < namrlevs; ++ilev) { for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - a_flux[ilev][idim]->LocalCopy(fluxes[ilev][idim], 0, 0, ncomp, IntVect(0)); + LocalCopy(*a_flux[ilev][idim], fluxes[ilev][idim], 0, 0, ncomp, IntVect(0)); } } } @@ -618,14 +618,14 @@ MLMGT::getFluxes (const Vector >& a_flux, for (int ilev = 0; ilev < namrlevs; ++ilev) { for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { auto const& amf = *(a_flux[ilev][idim]); - fluxes[ilev][idim].define(amf.boxArray(), amf.DistributionMap(), ncomp, 0); + fluxes[ilev][idim].define(boxArray(amf), DistributionMap(amf), ncomp, 0); } - sol[ilev].LocalCopy(*a_sol[ilev], 0, 0, ncomp, sol[ilev].nGrowVect()); + LocalCopy(sol[ilev], *a_sol[ilev], 0, 0, ncomp, nGrowVect(sol[ilev])); } linop.getFluxes(GetVecOfArrOfPtrs(fluxes), GetVecOfPtrs(sol), a_loc); for (int ilev = 0; ilev < namrlevs; ++ilev) { for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - a_flux[ilev][idim]->LocalCopy(fluxes[ilev][idim], 0, 0, ncomp, IntVect(0)); + LocalCopy(*a_flux[ilev][idim], fluxes[ilev][idim], 0, 0, ncomp, IntVect(0)); } } } @@ -653,11 +653,11 @@ MLMGT::getFluxes (const Vector & a_flux, Location a_loc) Vector fluxes(namrlevs); for (int ilev = 0; ilev < namrlevs; ++ilev) { auto const& amf = *a_flux[ilev]; - fluxes[ilev].define(amf.boxArray(), amf.DistributionMap(), ncomp, 0); + fluxes[ilev].define(boxArray(amf), DistributionMap(amf), ncomp, 0); } getFluxes(GetVecOfPtrs(fluxes), GetVecOfPtrs(sol), a_loc); for (int ilev = 0; ilev < namrlevs; ++ilev) { - a_flux[ilev]->LocalCopy(fluxes[ilev], 0, 0, ncomp, IntVect(0)); + LocalCopy(*a_flux[ilev], fluxes[ilev], 0, 0, ncomp, IntVect(0)); } } } @@ -676,11 +676,11 @@ void MLMGT::getFluxes (const Vector & a_flux, const Vector& a_sol, Location /*a_loc*/) { - AMREX_ASSERT(a_flux[0]->nComp() >= AMREX_SPACEDIM); + AMREX_ASSERT(nComp(*a_flux[0]) >= AMREX_SPACEDIM); if constexpr (! std::is_same()) { for (int alev = 0; alev < namrlevs; ++alev) { - sol[alev].LocalCopy(*a_sol[alev], 0, 0, ncomp, sol[alev].nGrowVect()); + LocalCopy(sol[alev], *a_sol[alev], 0, 0, ncomp, nGrowVect(sol[alev])); } } @@ -718,11 +718,11 @@ MLMGT::getFluxes (const Vector & a_flux, Vector fluxes(namrlevs); for (int ilev = 0; ilev < namrlevs; ++ilev) { auto const& amf = *a_flux[ilev]; - fluxes[ilev].define(amf.boxArray(), amf.DistributionMap(), ncomp, 0); + fluxes[ilev].define(boxArray(amf), DistributionMap(amf), ncomp, 0); } linop.getFluxes(GetVecOfPtrs(fluxes), GetVecOfPtrs(sol)); for (int ilev = 0; ilev < namrlevs; ++ilev) { - a_flux[ilev]->LocalCopy(fluxes[ilev], 0, 0, ncomp, IntVect(0)); + LocalCopy(*a_flux[ilev], fluxes[ilev], 0, 0, ncomp, IntVect(0)); } } } @@ -779,7 +779,7 @@ MLMGT::compResidual (const Vector& a_res, const Vector& a_sol, sol_is_alias.resize(namrlevs,true); for (int alev = 0; alev < namrlevs; ++alev) { - if (cf_strategy == CFStrategy::ghostnodes || a_sol[alev]->nGrowVect() == ng_sol) + if (cf_strategy == CFStrategy::ghostnodes || nGrowVect(*a_sol[alev]) == ng_sol) { sol[alev] = linop.makeAlias(*a_sol[alev]); sol_is_alias[alev] = true; @@ -790,7 +790,7 @@ MLMGT::compResidual (const Vector& a_res, const Vector& a_sol, { sol[alev] = linop.make(alev, 0, ng_sol); } - sol[alev].LocalCopy(*a_sol[alev], 0, 0, ncomp, IntVect(0)); + LocalCopy(sol[alev], *a_sol[alev], 0, 0, ncomp, IntVect(0)); } } @@ -808,9 +808,9 @@ MLMGT::compResidual (const Vector& a_res, const Vector& a_sol, const MF* prhs = a_rhs[alev]; #if (AMREX_SPACEDIM != 3) int nghost = (cf_strategy == CFStrategy::ghostnodes) ? linop.getNGrow(alev) : 0; - MF rhstmp(prhs->boxArray(), prhs->DistributionMap(), ncomp, nghost, + MF rhstmp(boxArray(*prhs), DistributionMap(*prhs), ncomp, nghost, MFInfo(), *linop.Factory(alev)); - rhstmp.LocalCopy(*prhs, 0, 0, ncomp, IntVect(nghost)); + LocalCopy(rhstmp, *prhs, 0, 0, ncomp, IntVect(nghost)); linop.applyMetricTerm(alev, 0, rhstmp); linop.unimposeNeumannBC(alev, rhstmp); linop.applyInhomogNeumannTerm(alev, rhstmp); @@ -822,9 +822,9 @@ MLMGT::compResidual (const Vector& a_res, const Vector& a_sol, *a_res[alev+1], sol[alev+1], *a_rhs[alev+1]); if (linop.isCellCentered()) { #ifdef AMREX_USE_EB - amrex::EB_average_down(*a_res[alev+1], *a_res[alev], 0, ncomp, amrrr[alev]); + EB_average_down(*a_res[alev+1], *a_res[alev], 0, ncomp, amrrr[alev]); #else - amrex::average_down(*a_res[alev+1], *a_res[alev], 0, ncomp, amrrr[alev]); + average_down(*a_res[alev+1], *a_res[alev], 0, ncomp, amrrr[alev]); #endif } } @@ -858,7 +858,7 @@ MLMGT::apply (const Vector& out, const Vector& a_in) nghost = linop.getNGrow(alev); in[alev] = a_in[alev]; } - else if (a_in[alev]->nGrowVect() == ng_sol) + else if (nGrowVect(*a_in[alev]) == ng_sol) { in[alev] = a_in[alev]; } @@ -866,18 +866,18 @@ MLMGT::apply (const Vector& out, const Vector& a_in) { IntVect ng = ng_sol; if (cf_strategy == CFStrategy::ghostnodes) { ng = IntVect(nghost); } - in_raii[alev].define(a_in[alev]->boxArray(), - a_in[alev]->DistributionMap(), - a_in[alev]->nComp(), ng, + in_raii[alev].define(boxArray (*a_in[alev]), + DistributionMap(*a_in[alev]), + nComp (*a_in[alev]), ng, MFInfo(), *linop.Factory(alev)); - in_raii[alev].LocalCopy(*a_in[alev], 0, 0, ncomp, IntVect(nghost)); + LocalCopy(in_raii[alev], *a_in[alev], 0, 0, ncomp, IntVect(nghost)); in[alev] = &(in_raii[alev]); } - rh[alev].define(a_in[alev]->boxArray(), - a_in[alev]->DistributionMap(), - a_in[alev]->nComp(), nghost, MFInfo(), + rh[alev].define(boxArray (*a_in[alev]), + DistributionMap(*a_in[alev]), + nComp (*a_in[alev]), nghost, MFInfo(), *linop.Factory(alev)); - rh[alev].setVal(RT(0.0)); + setVal(rh[alev], RT(0.0)); } if (!linop_prepared) { @@ -901,9 +901,9 @@ MLMGT::apply (const Vector& out, const Vector& a_in) *out[alev+1], *in[alev+1], rh[alev+1]); if (linop.isCellCentered()) { #ifdef AMREX_USE_EB - amrex::EB_average_down(*out[alev+1], *out[alev], 0, out[alev]->nComp(), amrrr[alev]); + EB_average_down(*out[alev+1], *out[alev], 0, nComp(*out[alev]), amrrr[alev]); #else - amrex::average_down(*out[alev+1], *out[alev], 0, out[alev]->nComp(), amrrr[alev]); + average_down(*out[alev+1], *out[alev], 0, nComp(*out[alev]), amrrr[alev]); #endif } } @@ -970,10 +970,10 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& } else { - if (a_sol[alev]->nGrowVect() == ng_sol) { + if (nGrowVect(*a_sol[alev]) == ng_sol) { if constexpr (std::is_same()) { sol[alev] = linop.makeAlias(*a_sol[alev]); - sol[alev].setBndry(RT(0.0), 0, ncomp); + setBndry(sol[alev], RT(0.0), 0, ncomp); sol_is_alias[alev] = true; } } @@ -981,8 +981,8 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& if (!solve_called) { sol[alev] = linop.make(alev, 0, ng_sol); } - sol[alev].LocalCopy(*a_sol[alev], 0, 0, ncomp, IntVect(0)); - sol[alev].setBndry(RT(0.0), 0, ncomp); + LocalCopy(sol[alev], *a_sol[alev], 0, 0, ncomp, IntVect(0)); + setBndry(sol[alev], RT(0.0), 0, ncomp); } } } @@ -994,7 +994,7 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& if (!solve_called) { rhs[alev] = linop.make(alev, 0, ng_rhs); } - rhs[alev].LocalCopy(*a_rhs[alev], 0, 0, ncomp, ng_rhs); + LocalCopy(rhs[alev], *a_rhs[alev], 0, 0, ncomp, ng_rhs); linop.applyMetricTerm(alev, 0, rhs[alev]); linop.unimposeNeumannBC(alev, rhs[alev]); linop.applyInhomogNeumannTerm(alev, rhs[alev]); @@ -1036,8 +1036,8 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& const int nmglevs = linop.NMGLevels(alev); for (int mglev = 0; mglev < nmglevs; ++mglev) { - res [alev][mglev].setVal(RT(0.0)); - rescor[alev][mglev].setVal(RT(0.0)); + setVal(res [alev][mglev], RT(0.0)); + setVal(rescor[alev][mglev], RT(0.0)); } } @@ -1054,7 +1054,7 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& if (cf_strategy == CFStrategy::ghostnodes) { _ng=IntVect(linop.getNGrow(alev,mglev)); } cor[alev][mglev] = linop.make(alev, mglev, _ng); } - cor[alev][mglev].setVal(RT(0.0)); + setVal(cor[alev][mglev], RT(0.0)); } } @@ -1070,7 +1070,7 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& if (cf_strategy == CFStrategy::ghostnodes) { _ng=IntVect(linop.getNGrow(alev,mglev)); } cor_hold[alev][mglev] = linop.make(alev, mglev, _ng); } - cor_hold[alev][mglev].setVal(RT(0.0)); + setVal(cor_hold[alev][mglev], RT(0.0)); } } for (int alev = 1; alev < finest_amr_lev; ++alev) @@ -1081,7 +1081,7 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& if (cf_strategy == CFStrategy::ghostnodes) { _ng=IntVect(linop.getNGrow(alev)); } cor_hold[alev][0] = linop.make(alev, 0, _ng); } - cor_hold[alev][0].setVal(RT(0.0)); + setVal(cor_hold[alev][0], RT(0.0)); } if (linop.m_parent // no embedded N-Solve @@ -1110,30 +1110,32 @@ template void MLMGT::prepareForNSolve () { - ns_linop = linop.makeNLinOp(nsolve_grid_size); + if constexpr (IsMultiFabLike_v) { + ns_linop = linop.makeNLinOp(nsolve_grid_size); - int nghost = 0; - if (cf_strategy == CFStrategy::ghostnodes) { nghost = linop.getNGrow(); } - - const BoxArray& ba = (*ns_linop).m_grids[0][0]; - const DistributionMapping& dm =(*ns_linop).m_dmap[0][0]; - - int ng = 1; - if (cf_strategy == CFStrategy::ghostnodes) { ng = nghost; } - ns_sol = std::make_unique(ba, dm, ncomp, ng, MFInfo(), *(ns_linop->Factory(0,0))); - ng = 0; - if (cf_strategy == CFStrategy::ghostnodes) { ng = nghost; } - ns_rhs = std::make_unique(ba, dm, ncomp, ng, MFInfo(), *(ns_linop->Factory(0,0))); - ns_sol->setVal(RT(0.0)); - ns_rhs->setVal(RT(0.0)); - - ns_linop->setLevelBC(0, ns_sol.get()); - - ns_mlmg = std::make_unique>(*ns_linop); - ns_mlmg->setVerbose(0); - ns_mlmg->setFixedIter(1); - ns_mlmg->setMaxFmgIter(20); - ns_mlmg->setBottomSolver(BottomSolver::smoother); + int nghost = 0; + if (cf_strategy == CFStrategy::ghostnodes) { nghost = linop.getNGrow(); } + + const BoxArray& ba = (*ns_linop).m_grids[0][0]; + const DistributionMapping& dm =(*ns_linop).m_dmap[0][0]; + + int ng = 1; + if (cf_strategy == CFStrategy::ghostnodes) { ng = nghost; } + ns_sol = std::make_unique(ba, dm, ncomp, ng, MFInfo(), *(ns_linop->Factory(0,0))); + ng = 0; + if (cf_strategy == CFStrategy::ghostnodes) { ng = nghost; } + ns_rhs = std::make_unique(ba, dm, ncomp, ng, MFInfo(), *(ns_linop->Factory(0,0))); + setVal(*ns_sol, RT(0.0)); + setVal(*ns_rhs, RT(0.0)); + + ns_linop->setLevelBC(0, ns_sol.get()); + + ns_mlmg = std::make_unique>(*ns_linop); + ns_mlmg->setVerbose(0); + ns_mlmg->setFixedIter(1); + ns_mlmg->setMaxFmgIter(20); + ns_mlmg->setBottomSolver(BottomSolver::smoother); + } } // in : Residual (res) on the finest AMR level @@ -1149,7 +1151,7 @@ void MLMGT::oneIter (int iter) IntVect nghost(0); if (cf_strategy == CFStrategy::ghostnodes) { nghost = IntVect(linop.getNGrow(alev)); } - sol[alev].LocalAdd(cor[alev][0], 0, 0, ncomp, nghost); + LocalAdd(sol[alev], cor[alev][0], 0, 0, ncomp, nghost); // compute residual for the coarse AMR level computeResWithCrseSolFineCor(alev-1,alev); @@ -1175,7 +1177,7 @@ void MLMGT::oneIter (int iter) IntVect nghost(0); if (cf_strategy == CFStrategy::ghostnodes) { nghost = IntVect(linop.getNGrow(0)); } - sol[0].LocalAdd(cor[0][0], 0, 0, ncomp, nghost); + LocalAdd(sol[0], cor[0][0], 0, 0, ncomp, nghost); } for (int alev = 1; alev <= finest_amr_lev; ++alev) @@ -1185,10 +1187,10 @@ void MLMGT::oneIter (int iter) IntVect nghost(0); if (cf_strategy == CFStrategy::ghostnodes) { nghost = IntVect(linop.getNGrow(alev)); } - sol[alev].LocalAdd(cor[alev][0], 0, 0, ncomp, nghost); + LocalAdd(sol[alev], cor[alev][0], 0, 0, ncomp, nghost); if (alev != finest_amr_lev) { - cor_hold[alev][0].LocalAdd(cor[alev][0], 0, 0, ncomp, nghost); + LocalAdd(cor_hold[alev][0], cor[alev][0], 0, 0, ncomp, nghost); } // Update fine AMR level correction @@ -1196,10 +1198,10 @@ void MLMGT::oneIter (int iter) miniCycle(alev); - sol[alev].LocalAdd(cor[alev][0], 0, 0, ncomp, nghost); + LocalAdd(sol[alev], cor[alev][0], 0, 0, ncomp, nghost); if (alev != finest_amr_lev) { - cor[alev][0].LocalAdd(cor_hold[alev][0], 0, 0, ncomp, nghost); + LocalAdd(cor[alev][0], cor_hold[alev][0], 0, 0, ncomp, nghost); } } @@ -1231,12 +1233,12 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { - RT norm = res[amrlev][mglev].norminf(0,ncomp,IntVect(0)); + RT norm = norminf(res[amrlev][mglev],0,ncomp,IntVect(0)); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev << " DN: Norm before smooth " << norm << "\n"; } - cor[amrlev][mglev].setVal(RT(0.0)); + setVal(cor[amrlev][mglev], RT(0.0)); bool skip_fillboundary = true; for (int i = 0; i < nu1; ++i) { linop.smooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev], skip_fillboundary); @@ -1248,7 +1250,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { - RT norm = rescor[amrlev][mglev].norminf(0,ncomp,IntVect(0)); + RT norm = norminf(rescor[amrlev][mglev],0,ncomp,IntVect(0)); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev << " DN: Norm after smooth " << norm << "\n"; } @@ -1262,7 +1264,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) { if (verbose >= 4) { - RT norm = res[amrlev][mglev_bottom].norminf(0,ncomp,IntVect(0)); + RT norm = norminf(res[amrlev][mglev_bottom],0,ncomp,IntVect(0)); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom << " DN: Norm before bottom " << norm << "\n"; } @@ -1270,7 +1272,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { computeResOfCorrection(amrlev, mglev_bottom); - RT norm = rescor[amrlev][mglev_bottom].norminf(0,ncomp,IntVect(0)); + RT norm = norminf(rescor[amrlev][mglev_bottom],0,ncomp,IntVect(0)); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom << " UP: Norm after bottom " << norm << "\n"; } @@ -1279,11 +1281,11 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) { if (verbose >= 4) { - RT norm = res[amrlev][mglev_bottom].norminf(0,ncomp,IntVect(0)); + RT norm = norminf(res[amrlev][mglev_bottom],0,ncomp,IntVect(0)); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom << " Norm before smooth " << norm << "\n"; } - cor[amrlev][mglev_bottom].setVal(RT(0.0)); + setVal(cor[amrlev][mglev_bottom], RT(0.0)); bool skip_fillboundary = true; for (int i = 0; i < nu1; ++i) { linop.smooth(amrlev, mglev_bottom, cor[amrlev][mglev_bottom], @@ -1293,7 +1295,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { computeResOfCorrection(amrlev, mglev_bottom); - RT norm = rescor[amrlev][mglev_bottom].norminf(0,ncomp,IntVect(0)); + RT norm = norminf(rescor[amrlev][mglev_bottom],0,ncomp,IntVect(0)); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom << " Norm after smooth " << norm << "\n"; } @@ -1308,7 +1310,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { computeResOfCorrection(amrlev, mglev); - RT norm = rescor[amrlev][mglev].norminf(0,ncomp,IntVect(0)); + RT norm = norminf(rescor[amrlev][mglev],0,ncomp,IntVect(0)); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev << " UP: Norm before smooth " << norm << "\n"; } @@ -1321,7 +1323,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { computeResOfCorrection(amrlev, mglev); - RT norm = rescor[amrlev][mglev].norminf(0,ncomp,IntVect(0)); + RT norm = norminf(rescor[amrlev][mglev],0,ncomp,IntVect(0)); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev << " UP: Norm after smooth " << norm << "\n"; } @@ -1361,12 +1363,12 @@ MLMGT::mgFcycle () // rescor = res - L(cor) computeResOfCorrection(amrlev, mglev); // res = rescor; this provides b to the vcycle below - res[amrlev][mglev].LocalCopy(rescor[amrlev][mglev], 0, 0, ncomp, nghost); + LocalCopy(res[amrlev][mglev], rescor[amrlev][mglev], 0, 0, ncomp, nghost); // save cor; do v-cycle; add the saved to cor std::swap(cor[amrlev][mglev], cor_hold[amrlev][mglev]); mgVcycle(amrlev, mglev); - cor[amrlev][mglev].LocalAdd(cor_hold[amrlev][mglev], 0, 0, ncomp, nghost); + LocalAdd(cor[amrlev][mglev], cor_hold[amrlev][mglev], 0, 0, ncomp, nghost); } } @@ -1393,16 +1395,16 @@ MLMGT::NSolve (MLMGT& a_solver, MF& a_sol, MF& a_rhs) { BL_PROFILE("MLMG::NSolve()"); - a_sol.setVal(RT(0.0)); + setVal(a_sol, RT(0.0)); MF const& res_bottom = res[0].back(); - if (BoxArray::SameRefs(a_rhs.boxArray(),res_bottom.boxArray()) && - DistributionMapping::SameRefs(a_rhs.DistributionMap(),res_bottom.DistributionMap())) + if (BoxArray::SameRefs(boxArray(a_rhs),boxArray(res_bottom)) && + DistributionMapping::SameRefs(DistributionMap(a_rhs),DistributionMap(res_bottom))) { - a_rhs.LocalCopy(res_bottom, 0, 0, ncomp, IntVect(0)); + LocalCopy(a_rhs, res_bottom, 0, 0, ncomp, IntVect(0)); } else { - a_rhs.setVal(RT(0.0)); - a_rhs.ParallelCopy(res_bottom); + setVal(a_rhs, RT(0.0)); + ParallelCopy(a_rhs, res_bottom, 0, 0, ncomp); } a_solver.solve(Vector{&a_sol}, Vector{&a_rhs}, @@ -1428,7 +1430,7 @@ MLMGT::actualBottomSolve () auto& x = cor[amrlev][mglev]; auto& b = res[amrlev][mglev]; - x.setVal(RT(0.0)); + setVal(x, RT(0.0)); if (bottom_solver == BottomSolver::smoother) { @@ -1444,9 +1446,9 @@ MLMGT::actualBottomSolve () MF raii_b; if (linop.isBottomSingular() && linop.getEnforceSingularSolvable()) { - const IntVect ng = b.nGrowVect(); + const IntVect ng = nGrowVect(b); raii_b = linop.make(amrlev, mglev, ng); - raii_b.LocalCopy(b, 0, 0, ncomp, ng); + LocalCopy(raii_b, b, 0, 0, ncomp, ng); bottom_b = &raii_b; makeSolvable(amrlev,mglev,*bottom_b); @@ -1486,7 +1488,7 @@ MLMGT::actualBottomSolve () int ret = bottomSolveWithCG(x, *bottom_b, cg_type); // If the MLMG solve failed then set the correction to zero if (ret != 0) { - cor[amrlev][mglev].setVal(RT(0.0)); + setVal(cor[amrlev][mglev], RT(0.0)); if (bottom_solver == BottomSolver::cgbicg || bottom_solver == BottomSolver::bicgcg) { if (bottom_solver == BottomSolver::cgbicg) { @@ -1496,7 +1498,7 @@ MLMGT::actualBottomSolve () } ret = bottomSolveWithCG(x, *bottom_b, cg_type); if (ret != 0) { - cor[amrlev][mglev].setVal(RT(0.0)); + setVal(cor[amrlev][mglev], RT(0.0)); } else { // switch permanently if (cg_type == MLCGSolverT::Type::CG) { bottom_solver = BottomSolver::cg; @@ -1526,6 +1528,7 @@ MLMGT::bottomSolveWithCG (MF& x, const MF& b, typename MLCGSolverT::Type cg_solver.setSolver(type); cg_solver.setVerbose(bottom_verbose); cg_solver.setMaxIter(bottom_maxiter); + cg_solver.setInitSolnZeroed(true); if (cf_strategy == CFStrategy::ghostnodes) { cg_solver.setNGhost(linop.getNGrow()); } int ret = cg_solver.solve(x, b, bottom_reltol, bottom_abstol); @@ -1590,7 +1593,7 @@ MLMGT::computeResWithCrseSolFineCor (int calev, int falev) linop.solutionResidual(calev, crse_res, crse_sol, crse_rhs, crse_bcdata); linop.correctionResidual(falev, 0, fine_rescor, fine_cor, fine_res, BCMode::Homogeneous); - fine_res.LocalCopy(fine_rescor, 0, 0, ncomp, nghost); + LocalCopy(fine_res, fine_rescor, 0, 0, ncomp, nghost); linop.reflux(calev, crse_res, crse_sol, crse_rhs, fine_res, fine_sol, fine_rhs); @@ -1618,7 +1621,7 @@ MLMGT::computeResWithCrseCorFineCor (int falev) // fine_rescor = fine_res - L(fine_cor) linop.correctionResidual(falev, 0, fine_rescor, fine_cor, fine_res, BCMode::Inhomogeneous, &crse_cor); - fine_res.LocalCopy(fine_rescor, 0, 0, ncomp, nghost); + LocalCopy(fine_res, fine_rescor, 0, 0, ncomp, nghost); } // Interpolate correction from coarse to fine AMR level. @@ -1647,9 +1650,9 @@ MLMGT::interpCorrection (int alev) } MF cfine = linop.makeCoarseAmr(alev, IntVect(ng_dst)); - cfine.setVal(RT(0.0)); - cfine.ParallelCopy(crse_cor, 0, 0, ncomp, IntVect(ng_src), IntVect(ng_dst), - crse_geom.periodicity()); + setVal(cfine, RT(0.0)); + ParallelCopy(cfine, crse_cor, 0, 0, ncomp, IntVect(ng_src), IntVect(ng_dst), + crse_geom.periodicity()); linop.interpolationAmr(alev, fine_cor, cfine, nghost); // NOLINT(readability-suspicious-call-argument) } @@ -1688,7 +1691,7 @@ MLMGT::addInterpCorrection (int alev, int mglev) else { cfine = linop.makeCoarseMG(alev, mglev, IntVect(0)); - cfine.ParallelCopy(crse_cor,0,0,ncomp,IntVect(0),IntVect(0)); + ParallelCopy(cfine, crse_cor, 0, 0, ncomp); cmf = &cfine; } diff --git a/Src/Particle/AMReX_Particle.H b/Src/Particle/AMReX_Particle.H index 16004d1231e..4ae8b7c4365 100644 --- a/Src/Particle/AMReX_Particle.H +++ b/Src/Particle/AMReX_Particle.H @@ -24,15 +24,6 @@ namespace constexpr Long NoSplitParticleID = GhostParticleID - 4; } - /** Used for 32bit int particle Ids, as in pure SoA layout */ - namespace IntParticleIds { - constexpr int GhostParticleID = 2147483647; // 2**31-1 - constexpr int VirtualParticleID = GhostParticleID - 1; - constexpr int LastParticleID = GhostParticleID - 2; - constexpr int DoSplitParticleID = GhostParticleID - 3; - constexpr int NoSplitParticleID = GhostParticleID - 4; - } - using namespace LongParticleIds; } diff --git a/Src/Particle/AMReX_ParticleTile.H b/Src/Particle/AMReX_ParticleTile.H index a1bdbdd56e2..1048df8724d 100644 --- a/Src/Particle/AMReX_ParticleTile.H +++ b/Src/Particle/AMReX_ParticleTile.H @@ -458,7 +458,7 @@ SoAParticle::NextID () #endif next = the_next_id++; - if (next > IntParticleIds::LastParticleID) { + if (next > LongParticleIds::LastParticleID) { amrex::Abort("SoAParticle::NextID() -- too many particles"); } @@ -470,7 +470,7 @@ int SoAParticle::UnprotectedNextID () { int next = the_next_id++; - if (next > IntParticleIds::LastParticleID) { + if (next > LongParticleIds::LastParticleID) { amrex::Abort("SoAParticle::NextID() -- too many particles"); } return next; @@ -1039,7 +1039,9 @@ struct ParticleTile void shrink_to_fit () { - if constexpr (!ParticleType::is_soa_particle) { + if constexpr (ParticleType::is_soa_particle) { + GetStructOfArrays().GetIdCPUData().shrink_to_fit(); + } else { m_aos_tile().shrink_to_fit(); } for (int j = 0; j < NumRealComps(); ++j) @@ -1058,7 +1060,9 @@ struct ParticleTile Long capacity () const { Long nbytes = 0; - if constexpr (!ParticleType::is_soa_particle) { + if constexpr (ParticleType::is_soa_particle) { + nbytes += GetStructOfArrays().GetIdCPUData().capacity() * sizeof(uint64_t); + } else { nbytes += m_aos_tile().capacity() * sizeof(ParticleType); } for (int j = 0; j < NumRealComps(); ++j) @@ -1077,7 +1081,9 @@ struct ParticleTile void swap (ParticleTile& other) { - if constexpr (!ParticleType::is_soa_particle) { + if constexpr (ParticleType::is_soa_particle) { + GetStructOfArrays().GetIdCPUData().swap(other.GetStructOfArrays().GetIdCPUData()); + } else { m_aos_tile().swap(other.GetArrayOfStructs()()); } for (int j = 0; j < NumRealComps(); ++j) diff --git a/Src/Particle/AMReX_ParticleTransformation.H b/Src/Particle/AMReX_ParticleTransformation.H index aa737455ce6..7ca26cef064 100644 --- a/Src/Particle/AMReX_ParticleTransformation.H +++ b/Src/Particle/AMReX_ParticleTransformation.H @@ -608,10 +608,8 @@ int filterAndTransformParticles (DstTile1& dst1, DstTile2& dst2, const SrcTile& * \param p predicate function - particles will be copied if p returns true * \param src_start the offset at which to start reading particles from src * \param dst_start the offset at which to start writing particles to dst - * \param n the number of particles to apply the operation to * */ - template >,Index> nvccfoo = 0> Index filterAndTransformParticles (DstTile& dst, const SrcTile& src, Pred&& p, F&& f, diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H index 5430cd34037..682a82450f0 100644 --- a/Src/Particle/AMReX_ParticleUtil.H +++ b/Src/Particle/AMReX_ParticleUtil.H @@ -47,7 +47,6 @@ numParticlesOutOfRange (Iterator const& pti, int nGrow) * \param nGrow the number of grow cells allowed. * */ - template ::value && !Iterator::ContainerType::ParticleType::is_soa_particle, int> foo = 0> int numParticlesOutOfRange (Iterator const& pti, IntVect nGrow) @@ -372,6 +371,26 @@ IntVect getParticleCell (P const& p, return iv; } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +IntVect getParticleCell (PTD const& ptd, int i, + amrex::GpuArray const& plo, + amrex::GpuArray const& dxi, + const Box& domain) noexcept +{ + if constexpr (PTD::ParticleType::is_soa_particle) + { + IntVect iv( + AMREX_D_DECL(int(amrex::Math::floor((ptd.m_rdata[0][i]-plo[0])*dxi[0])), + int(amrex::Math::floor((ptd.m_rdata[1][i]-plo[1])*dxi[1])), + int(amrex::Math::floor((ptd.m_rdata[2][i]-plo[2])*dxi[2])))); + iv += domain.smallEnd(); + return iv; + } else { + return getParticleCell(ptd.m_aos[i], plo, dxi, domain);; + } +} + struct DefaultAssignor { @@ -675,6 +694,7 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n } }); #else + amrex::ignore_unused(pperm, pglobal_idx); Abort("Not implemented"); #endif diff --git a/Src/Particle/AMReX_StructOfArrays.H b/Src/Particle/AMReX_StructOfArrays.H index 6cd498e20a2..4de35e085ca 100644 --- a/Src/Particle/AMReX_StructOfArrays.H +++ b/Src/Particle/AMReX_StructOfArrays.H @@ -195,13 +195,12 @@ struct StructOfArrays { for (int i = 0; i < int(m_runtime_idata.size()); ++i) { m_runtime_idata[i].resize(count); } } - [[nodiscard]] IdCPU* idcpuarray () { + [[nodiscard]] uint64_t* idcpuarray () { if constexpr (use64BitIdCpu == true) { return m_idcpu.dataPtr(); } else { return nullptr; } - } [[nodiscard]] GpuArray realarray () diff --git a/Tools/CMake/AMReXFlagsTargets.cmake b/Tools/CMake/AMReXFlagsTargets.cmake index 9e3073cd53f..a2e86b2fbd3 100644 --- a/Tools/CMake/AMReXFlagsTargets.cmake +++ b/Tools/CMake/AMReXFlagsTargets.cmake @@ -89,7 +89,7 @@ target_compile_options( Flags_CXX $<${_cxx_appleclang_rwdbg}:> $<${_cxx_appleclang_rel}:> $<${_cxx_intelllvm_dbg}:-O0 -Wall -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable> - $<${_cxx_intelllvm_rwdbg}:-g1> + $<${_cxx_intelllvm_rwdbg}:-gline-tables-only -fdebug-info-for-profiling> # recommended by Intel VTune $<${_cxx_intelllvm_rel}:> ) diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake index 7f24cb46db4..319a6939acc 100644 --- a/Tools/CMake/AMReXOptions.cmake +++ b/Tools/CMake/AMReXOptions.cmake @@ -213,6 +213,16 @@ if (AMReX_SYCL) endif() endif() + set(AMReX_PARALLEL_LINK_JOBS_DEFAULT 1) + if (DEFINED ENV{AMREX_PARALLEL_LINK_JOBS}) + set(AMReX_PARALLEL_LINK_JOBS_DEFAULT "$ENV{AMREX_PARALLEL_LINK_JOBS}") + endif() + set(AMReX_PARALLEL_LINK_JOBS ${AMReX_PARALLEL_LINK_JOBS_DEFAULT} + CACHE STRING "SYCL max parallel link jobs") + if (NOT AMReX_PARALLEL_LINK_JOBS GREATER_EQUAL 1 OR + NOT AMReX_PARALLEL_LINK_JOBS MATCHES "^[1-9][0-9]*$") + message(FATAL_ERROR "AMReX_PARALLEL_LINK_JOBS (${AMReX_PARALLEL_LINK_JOBS}) must be a positive integer") + endif() endif () # --- HIP ---- diff --git a/Tools/CMake/AMReXSYCL.cmake b/Tools/CMake/AMReXSYCL.cmake index a67571dc412..2b48f1c53fe 100644 --- a/Tools/CMake/AMReXSYCL.cmake +++ b/Tools/CMake/AMReXSYCL.cmake @@ -88,4 +88,10 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND "${CMAKE_BUILD_TYPE}" MATCHES "Debug" "$<${_cxx_sycl}:-fsycl-link-huge-device-code>" ) endif () +if (AMReX_PARALLEL_LINK_JOBS GREATER 1) + target_link_options( SYCL + INTERFACE + $<${_cxx_sycl}:-fsycl-max-parallel-link-jobs=${AMReX_PARALLEL_LINK_JOBS}>) +endif() + unset(_cxx_sycl)