From 5c46d041011304160c6211c77a61ab5111c96a87 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:05:22 +0200 Subject: [PATCH 01/53] Fix mesh definitions in benchmarks --- benchmark/benchmarks.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 8255aca..119506b 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -4,6 +4,8 @@ push!(LOAD_PATH, path_to_package) using BenchmarkTools using GridTools +include("../test/mesh_definitions.jl") + SUITE = BenchmarkGroup() SUITE["arith_broadcast"] = BenchmarkGroup() From 1eea839026adea94dae4f590052dd8ea7bf8a3a8 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:05:46 +0200 Subject: [PATCH 02/53] Attempt to fix the benchmark PR --- .github/workflows/benchmark_pr.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 71b10e2..ed7f950 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -18,6 +18,14 @@ jobs: with: version: "1.8" - uses: julia-actions/cache@v1 + - name: Set up Python environment + run: | + sudo apt-get update + sudo apt-get install python3-pip + python3 -m pip install gt4py.next + - name: Configure PyCall + run: | + julia -e 'using Pkg; ENV["PYTHON"]="/usr/bin/python3"; Pkg.build("PyCall");' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -27,7 +35,6 @@ jobs: env: JULIA_NUM_THREADS: 2 run: | - # Lightweight build step, as sometimes the runner runs out of memory: julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(;url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.build("AirspeedVelocity")' - name: Add ~/.julia/bin to PATH From 02ee8b71922628bbbe6cdcbc628f60ca497520cb Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:11:56 +0200 Subject: [PATCH 03/53] Fix pip install gt4py --- .github/workflows/benchmark_pr.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index ed7f950..46e5b72 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -21,11 +21,18 @@ jobs: - name: Set up Python environment run: | sudo apt-get update - sudo apt-get install python3-pip - python3 -m pip install gt4py.next + sudo apt-get install python3-pip python3-venv + python3 -m venv ~/gt4py-venv + source ~/gt4py-venv/bin/activate + python3 -m pip install --upgrade pip + - name: Install GT4Py from specific branch + run: | + git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py + pip install -r ~/gt4py/requirements-dev.txt + pip install -e ~/gt4py - name: Configure PyCall run: | - julia -e 'using Pkg; ENV["PYTHON"]="/usr/bin/python3"; Pkg.build("PyCall");' + julia -e 'using Pkg; ENV["PYTHON"]="~/gt4py-venv/bin/python3"; Pkg.build("PyCall");' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -82,4 +89,4 @@ jobs: # comment-id: ${{ steps.fcbenchmark.outputs.comment-id }} issue-number: ${{ github.event.pull_request.number }} body-path: body.md - edit-mode: replace \ No newline at end of file + edit-mode: replace From 2ae9b772758b290186401327cc5befe7fc75983b Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:16:28 +0200 Subject: [PATCH 04/53] Fix 
PyCall installation in the benchmark_pr.yml --- .github/workflows/benchmark_pr.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 46e5b72..2c1911a 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -30,9 +30,9 @@ jobs: git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py pip install -r ~/gt4py/requirements-dev.txt pip install -e ~/gt4py - - name: Configure PyCall + - name: Install and Configure PyCall run: | - julia -e 'using Pkg; ENV["PYTHON"]="~/gt4py-venv/bin/python3"; Pkg.build("PyCall");' + julia --project -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="~/gt4py-venv/bin/python3"; Pkg.build("PyCall");' - name: Extract Package Name from Project.toml id: extract-package-name run: | From 86863261de448f3db32cadc55e0466690dbabc9e Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:29:28 +0200 Subject: [PATCH 05/53] Fix the PyCall invoke --- .github/workflows/benchmark_pr.yml | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 2c1911a..5dc9f78 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -28,11 +28,17 @@ jobs: - name: Install GT4Py from specific branch run: | git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py - pip install -r ~/gt4py/requirements-dev.txt - pip install -e ~/gt4py + cd ~/gt4py + pip install -r requirements-dev.txt + pip install -e . - name: Install and Configure PyCall run: | - julia --project -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="~/gt4py-venv/bin/python3"; Pkg.build("PyCall");' + source ~/gt4py-venv/bin/activate + julia --project -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="python"; Pkg.build("PyCall");' + - name: Check PyCall Configuration + run: | + source ~/gt4py-venv/bin/activate + julia -e 'using PyCall; @show PyCall.python' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -42,19 +48,21 @@ jobs: env: JULIA_NUM_THREADS: 2 run: | - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(;url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' + julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.build("AirspeedVelocity")' - name: Add ~/.julia/bin to PATH run: | echo "$HOME/.julia/bin" >> $GITHUB_PATH - name: Run benchmarks run: | + source ~/gt4py-venv/bin/activate echo $PATH ls -l ~/.julia/bin mkdir results benchpkg ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --url=${{ github.event.repository.clone_url }} --bench-on="${{github.event.repository.default_branch}}" --output-dir=results/ --tune - name: Create plots from benchmarks run: | + source ~/gt4py-venv/bin/activate mkdir -p plots benchpkgplot ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --npart=10 --format=png --input-dir=results/ --output-dir=plots/ - name: Upload plot as artifact @@ -64,6 +72,7 @@ jobs: path: plots - name: Create markdown table from benchmarks run: | + source ~/gt4py-venv/bin/activate 
benchpkgtable ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --input-dir=results/ --ratio > table.md echo '### Benchmark Results' > body.md echo '' >> body.md From e978b6ce0beb9075fa1d9ba7dd90bc7d8701f349 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:42:58 +0200 Subject: [PATCH 06/53] Add reference to julia env in benchmark_pr config --- .github/workflows/benchmark_pr.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 5dc9f78..064fb50 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -34,11 +34,11 @@ jobs: - name: Install and Configure PyCall run: | source ~/gt4py-venv/bin/activate - julia --project -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="python"; Pkg.build("PyCall");' + julia --project=. -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="python"; Pkg.build("PyCall");' - name: Check PyCall Configuration run: | source ~/gt4py-venv/bin/activate - julia -e 'using PyCall; @show PyCall.python' + julia --project=. -e 'using PyCall; @show PyCall.python' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -48,8 +48,7 @@ jobs: env: JULIA_NUM_THREADS: 2 run: | - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.build("AirspeedVelocity")' + julia --project=. -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; using Pkg; Pkg.add(url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git"); Pkg.build("AirspeedVelocity")' - name: Add ~/.julia/bin to PATH run: | echo "$HOME/.julia/bin" >> $GITHUB_PATH From ea7d32d9888b623d64abd49937bddb364e658619 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:51:02 +0200 Subject: [PATCH 07/53] Add cache for python, and fix the pycall (again :/) --- .github/workflows/benchmark_pr.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 064fb50..b0a3d51 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -17,7 +17,20 @@ jobs: - uses: julia-actions/setup-julia@v1 with: version: "1.8" - - uses: julia-actions/cache@v1 + - uses: actions/cache@v2 + name: Cache Julia packages + with: + path: ~/.julia + key: ${{ runner.os }}-julia-${{ hashFiles('**/Project.toml', '**/Manifest.toml') }} + restore-keys: | + ${{ runner.os }}-julia- + - uses: actions/cache@v2 + name: Cache Python packages + with: + path: ~/gt4py-venv + key: ${{ runner.os }}-python-${{ hashFiles('**/requirements-dev.txt') }} + restore-keys: | + ${{ runner.os }}-python- - name: Set up Python environment run: | sudo apt-get update From 3d12f85374acae4fc033ccdb232a823aadfee67f Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Fri, 19 Jul 2024 15:26:37 +0200 Subject: [PATCH 08/53] Activate the env in the benchmark CI --- .github/workflows/benchmark_pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index b0a3d51..a8bee98 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -40,6 +40,7 @@ jobs: python3 -m pip install --upgrade pip - name: Install GT4Py from specific branch run: | + source 
~/gt4py-venv/bin/activate git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py cd ~/gt4py pip install -r requirements-dev.txt From 2ae25f26317187135c2bda01d1534bf028f9ffef Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Fri, 19 Jul 2024 17:12:34 +0200 Subject: [PATCH 09/53] Include the Cell and K definitions in the benchmark --- benchmark/benchmarks.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 119506b..ab0de20 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -4,7 +4,11 @@ push!(LOAD_PATH, path_to_package) using BenchmarkTools using GridTools -include("../test/mesh_definitions.jl") +# Mesh definitions ------------------------------------------------------------------------------------------- +const global Cell_ = Dimension{:Cell_, HORIZONTAL} +const global K_ = Dimension{:K_, HORIZONTAL} +const global Cell = Cell_() +const global K = K_() SUITE = BenchmarkGroup() From 5412e44644315be8671c8016c7534bf158d97c23 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Mon, 22 Jul 2024 11:41:57 +0200 Subject: [PATCH 10/53] Add readme to run benchmark example --- .gitignore | 5 +++- benchmark/README.md | 56 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 benchmark/README.md diff --git a/.gitignore b/.gitignore index 0e02d77..3d9056e 100644 --- a/.gitignore +++ b/.gitignore @@ -29,5 +29,8 @@ env_setup.sh .python-version # Misc -.DS_Store +**/.DS_Store .vscode + +# Ignore benchmark (benchpkg) results +results_GridTools@* diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..9ae7e7a --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,56 @@ +# Benchmark Guide 🧭📈 + +## Installation + +To install the benchmark CLI, execute the following command: + +```bash +julia -e 'using Pkg; Pkg.add("AirspeedVelocity"); Pkg.build("AirspeedVelocity")' +``` + +This installation will create three executables in the `~/.julia/bin` folder: `benchpkg`, `benchpkgplot`, and `benchpkgtable`. It is necessary to add them to your `$PATH` to use them from any terminal session. + +### Add to PATH Temporarily + +To temporarily add the path to your session: + +```bash +export PATH="$PATH:~/.julia/bin" +``` + +### Add to PATH Permanently + +To permanently add the executables to your path, append the following line to your `.zshrc` or `.bashrc` file: + +```bash +echo 'export PATH="$PATH:~/.julia/bin"' >> ~/.zshrc # For zsh users +echo 'export PATH="$PATH:~/.julia/bin"' >> ~/.bashrc # For bash users +``` + +## Running Benchmarks + +To run benchmarks, simply execute the following command in the shell: + +```bash +benchpkg +``` + +and it will: + +1. Figure out the package name (from Project.toml) +2. Figure out the default branch name to compare the dirty state of your repo against +3. Evaluate all the benchmarks in benchmarks/benchmark.jl (BenchmarkTools.jl format – i.e., const SUITE = BenchmarkGroup()) +4. Print the result in a nicely formatted markdown table + +You can use the `--filter` option to quickly check if the load time has worsened compared to the master branch: + +```bash +benchpkg --filter=time_to_load +``` + +The `benchpkg` was updated in June 2024 to automate the benchmark without specifying the parameters. +To specify additional condition in `benchpkg` and to work with `benchpkgplot` consult the help command (`--h`). 
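+
+For reference, the benchmark file that `benchpkg` evaluates uses the standard BenchmarkTools.jl layout mentioned above (`const SUITE = BenchmarkGroup()`). The snippet below is only a minimal, illustrative sketch; the group and benchmark names are made up and do not reflect this repository's actual suite:
+
+```julia
+# Minimal illustrative suite in the format that benchpkg expects.
+using BenchmarkTools
+
+const SUITE = BenchmarkGroup()
+SUITE["example"] = BenchmarkGroup()
+
+x = rand(1000); y = rand(1000)
+# Interpolate with `$` so that creating `x` and `y` is not part of the measured time.
+SUITE["example"]["broadcast_addition"] = @benchmarkable $x .+ $y
+```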
+ +## Creating New Benchmarks + +TODO: Instructions for adding new benchmarks to the suite. From 978e6b59e3d341e90f8d737623cb72658ce5937e Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 24 Jul 2024 13:50:01 +0200 Subject: [PATCH 11/53] Add commented benchmark for field operations --- benchmark/benchmarks.jl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index ab0de20..9daa4b6 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -19,4 +19,20 @@ af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)) SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf +# Benchmark for field operator addition + +# function benchmark_fo_addition() +# a = Field(Cell, collect(1.0:15.0)) +# b = Field(Cell, collect(-1.0:-1:-15.0)) +# out = Field(Cell, zeros(Float64, 15)) + +# @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} +# return a .+ b +# end + +# @benchmarkable fo_addition(a, b, backend="embedded", out=out) +# end + +# SUITE["field_operator"]["addition"] = benchmark_fo_addition() + run(SUITE, verbose = true, seconds = 1) From 8046538e8b4fcd28aabfe7a7d980ffa56ddd018d Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 26 Jul 2024 14:43:21 +0200 Subject: [PATCH 12/53] fix benchmarks --- benchmark/benchmarks.jl | 95 ++++++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 15 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 9daa4b6..f4248d7 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -9,30 +9,95 @@ const global Cell_ = Dimension{:Cell_, HORIZONTAL} const global K_ = Dimension{:K_, HORIZONTAL} const global Cell = Cell_() const global K = K_() +const global Edge_ = Dimension{:Edge_, HORIZONTAL} +const global Edge = Edge_() +const global E2CDim_ = Dimension{:E2CDim_, LOCAL} +const global E2CDim = E2CDim_() + + +function setup_simple_connectivity()::Dict{String,Connectivity} + edge_to_cell_table = [ + [1 -1]; + [3 -1]; + [3 -1]; + [4 -1]; + [5 -1]; + [6 -1]; + [1 6]; + [1 2]; + [2 3]; + [2 4]; + [4 5]; + [5 6] + ] + + cell_to_edge_table = [ + [1 7 8]; + [8 9 10]; + [2 3 9]; + [4 10 11]; + [5 11 12]; + [6 7 12] + ] + + E2C_offset_provider = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E_offset_provider = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + offset_provider = Dict{String,Connectivity}( + "E2C" => E2C_offset_provider, + "C2E" => C2E_offset_provider, + "E2CDim" => E2C_offset_provider # TODO(lorenzovarese): this is required for the embedded backend (note: python already uses E2C) + ) + + return offset_provider +end SUITE = BenchmarkGroup() -SUITE["arith_broadcast"] = BenchmarkGroup() +# Legacy Suite with first tests +# SUITE["arith_broadcast"] = BenchmarkGroup() + +# a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) +# af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) +# SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c +# SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf -a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) -af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = 
Field((Cell, K), rand(1000, 1000)) -SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c -SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf +SUITE["field_operator"] = BenchmarkGroup() # Benchmark for field operator addition +function benchmark_fo_addition() + a = Field(Cell, collect(1.0:15.0)) + b = Field(Cell, collect(-1.0:-1:-15.0)) + out = Field(Cell, zeros(Float64, 15)) + + @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b + end + + @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) #setup=( + # a = Field(Cell, collect(1.0:15.0)); + # b = Field(Cell, collect(-1.0:-1:-15.0)); + # out_field = Field(Cell, zeros(Float64, 15)); + # @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} return a .+ b end; + # ) +end + +SUITE["field_operator"]["addition"] = benchmark_fo_addition() -# function benchmark_fo_addition() -# a = Field(Cell, collect(1.0:15.0)) -# b = Field(Cell, collect(-1.0:-1:-15.0)) -# out = Field(Cell, zeros(Float64, 15)) +# Benchmark for neighbor sum +function benchmark_fo_neighbor_sum() + offset_provider = setup_simple_connectivity(); + a = Field(Cell, collect(5.0:17.0) * 3); + E2C = FieldOffset("E2C", source=Cell, target=(Edge, E2CDim)) + out_field = Field(Edge, zeros(Float64, 12)) -# @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} -# return a .+ b -# end + @field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) + end -# @benchmarkable fo_addition(a, b, backend="embedded", out=out) -# end + @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, out=$out_field) +end -# SUITE["field_operator"]["addition"] = benchmark_fo_addition() +SUITE["field_operator"]["neighbor_sum"] = benchmark_fo_neighbor_sum() run(SUITE, verbose = true, seconds = 1) From 8fea0cae311f1f20de9443f6b9c31ea283c59bfd Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:14:17 +0200 Subject: [PATCH 13/53] Add benchmark comparison between Julia's broadcast addition and the field operator one --- benchmark/benchsuite_fo.jl | 124 +++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 benchmark/benchsuite_fo.jl diff --git a/benchmark/benchsuite_fo.jl b/benchmark/benchsuite_fo.jl new file mode 100644 index 0000000..a208bc1 --- /dev/null +++ b/benchmark/benchsuite_fo.jl @@ -0,0 +1,124 @@ +using BenchmarkTools +using Statistics +using GridTools + +# Data size +const global STREAM_SIZE = 10000000 # 10 million + +# Mesh definitions +const global Cell_ = Dimension{:Cell_, HORIZONTAL} +const global Cell = Cell_() + +""" + julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. 
+""" +function julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + a = rand(Int, ARRAY_SIZE) + b = rand(Int, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size +end + +""" + julia_broadcast_addition_operation(a, b) + +Core operation for the Julia broadcast addition benchmark. + +# Arguments +- `a, b`: Two arrays to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +function julia_broadcast_addition_operation(a, b) + return a .+ b +end + +""" + fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the field operator broadcast addition benchmark. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out +end + +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. +""" +function compute_memory_bandwidth_addition(results, a, b, out) + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +# Create the benchmark suite +suite = BenchmarkGroup() + +# Julia broadcast addition benchmark +a, b, data_size = julia_broadcast_addition_setup(STREAM_SIZE) +suite["julia_broadcast_addition"] = @benchmarkable $julia_broadcast_addition_operation($a, $b) + +# FO broadcast addition benchmark +a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +suite["fo_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) + +# Run the benchmark suite +results = run(suite) + +# Process the results +julia_results = results["julia_broadcast_addition"] +fo_results = results["fo_broadcast_addition"] + +# Process and print the results +julia_bandwidth = compute_memory_bandwidth_addition(julia_results, a, b, a) # TODO: improve out +fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) + +println("Julia broadcast addition bandwidth: $julia_bandwidth GB/s") +println("FO broadcast addition bandwidth: $fo_bandwidth GB/s") From 8d0296bab23cb9976e711c527a2bd1c10aa4823f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:20:17 +0200 Subject: [PATCH 14/53] Update the Benchmark suite. Provide comparison between broadcast on array, on fields data and with field operator. 
--- benchmark/benchmarks.jl | 211 ++++++++++++++++++++++-------------- benchmark/benchmarks_old.jl | 103 ++++++++++++++++++ benchmark/benchsuite_fo.jl | 124 --------------------- 3 files changed, 232 insertions(+), 206 deletions(-) create mode 100644 benchmark/benchmarks_old.jl delete mode 100644 benchmark/benchsuite_fo.jl diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index f4248d7..2fcae28 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -1,103 +1,150 @@ -using Pkg -path_to_package = joinpath(@__DIR__, "..") # Assuming the benchmarks.jl file is in the "benchmark" directory -push!(LOAD_PATH, path_to_package) using BenchmarkTools +using Statistics using GridTools -# Mesh definitions ------------------------------------------------------------------------------------------- +# Data size +const global STREAM_SIZE = 10000000 # 10 million + +# Mesh definitions const global Cell_ = Dimension{:Cell_, HORIZONTAL} -const global K_ = Dimension{:K_, HORIZONTAL} const global Cell = Cell_() -const global K = K_() -const global Edge_ = Dimension{:Edge_, HORIZONTAL} -const global Edge = Edge_() -const global E2CDim_ = Dimension{:E2CDim_, LOCAL} -const global E2CDim = E2CDim_() - - -function setup_simple_connectivity()::Dict{String,Connectivity} - edge_to_cell_table = [ - [1 -1]; - [3 -1]; - [3 -1]; - [4 -1]; - [5 -1]; - [6 -1]; - [1 6]; - [1 2]; - [2 3]; - [2 4]; - [4 5]; - [5 6] - ] - - cell_to_edge_table = [ - [1 7 8]; - [8 9 10]; - [2 3 9]; - [4 10 11]; - [5 11 12]; - [6 7 12] - ] - - E2C_offset_provider = Connectivity(edge_to_cell_table, Cell, Edge, 2) - C2E_offset_provider = Connectivity(cell_to_edge_table, Edge, Cell, 3) - - offset_provider = Dict{String,Connectivity}( - "E2C" => E2C_offset_provider, - "C2E" => C2E_offset_provider, - "E2CDim" => E2C_offset_provider # TODO(lorenzovarese): this is required for the embedded backend (note: python already uses E2C) - ) - - return offset_provider + +""" + julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + a = rand(Float64, ARRAY_SIZE) + b = rand(Float64, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size end -SUITE = BenchmarkGroup() +""" + broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) -# Legacy Suite with first tests -# SUITE["arith_broadcast"] = BenchmarkGroup() +Core operation for the Julia broadcast addition benchmark. -# a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) -# af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) -# SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c -# SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf +# Arguments +- `a, b`: Two arrays to be added. -SUITE["field_operator"] = BenchmarkGroup() +# Returns +- The result of element-wise addition of `a` and `b`. 
+""" +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) + return a .+ b +end -# Benchmark for field operator addition -function benchmark_fo_addition() - a = Field(Cell, collect(1.0:15.0)) - b = Field(Cell, collect(-1.0:-1:-15.0)) - out = Field(Cell, zeros(Float64, 15)) +""" + broadcast_addition(a::Field, b::Field) - @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} - return a .+ b - end +Core operation for the broadcast addition of two Field benchmark. +Useful to asses and track possible overhead on fields. - @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) #setup=( - # a = Field(Cell, collect(1.0:15.0)); - # b = Field(Cell, collect(-1.0:-1:-15.0)); - # out_field = Field(Cell, zeros(Float64, 15)); - # @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} return a .+ b end; - # ) +# Arguments +- `a, b`: Two field to be added. + +# Returns +- The result of element-wise addition of the data of the fields `a` and `b`. +""" +function broadcast_addition_fields(a::Field, b::Field) + return a .+ b end -SUITE["field_operator"]["addition"] = benchmark_fo_addition() +""" + fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) -# Benchmark for neighbor sum -function benchmark_fo_neighbor_sum() - offset_provider = setup_simple_connectivity(); - a = Field(Cell, collect(5.0:17.0) * 3); - E2C = FieldOffset("E2C", source=Cell, target=(Edge, E2CDim)) - out_field = Field(Edge, zeros(Float64, 12)) +Setup function for the field operator broadcast addition benchmark. - @field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} - return neighbor_sum(a(E2C), axis=E2CDim) - end +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. - @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, out=$out_field) +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out end -SUITE["field_operator"]["neighbor_sum"] = benchmark_fo_neighbor_sum() +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. 
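+
+# Example
+With hypothetical sizes (`a`, `b`, and `out` of 80 MB, i.e. 8.0e7 bytes, each) and a median time of 10 ms, the computed value is (3 * 8.0e7) / 0.01 / 1e9 = 24 GB/s.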
+""" +function compute_memory_bandwidth_addition(results, a, b, out) + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +# Create the benchmark suite +suite = BenchmarkGroup() + +# Define the main groups +suite["addition"] = BenchmarkGroup() + +# Julia broadcast addition benchmark +a, b, data_size = julia_broadcast_addition_setup(STREAM_SIZE) +suite["addition"]["array_broadcast_addition"] = @benchmarkable $broadcast_addition_array($a, $b) + +# Field broadcast addition benchmark +a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +suite["addition"]["fields_broadcast_addition"] = @benchmarkable $broadcast_addition_fields($a, $b) + +# Field Operator broadcast addition benchmark +a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) + +# Run the benchmark suite +results = run(suite) + +# Process the results +array_results = results["addition"]["array_broadcast_addition"] +fields_results = results["addition"]["fields_broadcast_addition"] +fo_results = results["addition"]["field_op_broadcast_addition"] + +# Process and print the results +array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size a +fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) # Out is a temporary array with size a +fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) -run(SUITE, verbose = true, seconds = 1) +println("Array broadcast addition bandwidth: $array_bandwidth GB/s") +println("Fields data broadcast addition bandwidth: $fields_bandwidth GB/s") +println("Field Operator broadcast addition bandwidth: $fo_bandwidth GB/s") diff --git a/benchmark/benchmarks_old.jl b/benchmark/benchmarks_old.jl new file mode 100644 index 0000000..0bb429f --- /dev/null +++ b/benchmark/benchmarks_old.jl @@ -0,0 +1,103 @@ +using Pkg +path_to_package = joinpath(@__DIR__, "..") # Assuming the benchmarks.jl file is in the "benchmark" directory +push!(LOAD_PATH, path_to_package) +using BenchmarkTools +using GridTools + +# Mesh definitions ------------------------------------------------------------------------------------------- +# const global Cell_ = Dimension{:Cell_, HORIZONTAL} +# const global K_ = Dimension{:K_, HORIZONTAL} +# const global Cell = Cell_() +# const global K = K_() +# const global Edge_ = Dimension{:Edge_, HORIZONTAL} +# const global Edge = Edge_() +# const global E2CDim_ = Dimension{:E2CDim_, LOCAL} +# const global E2CDim = E2CDim_() + + +# function setup_simple_connectivity()::Dict{String,Connectivity} +# edge_to_cell_table = [ +# [1 -1]; +# [3 -1]; +# [3 -1]; +# [4 -1]; +# [5 -1]; +# [6 -1]; +# [1 6]; +# [1 2]; +# [2 3]; +# [2 4]; +# [4 5]; +# [5 6] +# ] + +# cell_to_edge_table = [ +# [1 7 8]; +# [8 9 10]; +# [2 3 9]; +# [4 10 11]; +# [5 11 12]; +# [6 7 12] +# ] + +# E2C_offset_provider = Connectivity(edge_to_cell_table, Cell, Edge, 2) +# C2E_offset_provider = Connectivity(cell_to_edge_table, Edge, Cell, 3) + +# offset_provider = Dict{String,Connectivity}( +# "E2C" => E2C_offset_provider, +# "C2E" => C2E_offset_provider, +# "E2CDim" => E2C_offset_provider # TODO(lorenzovarese): this is required for the embedded backend (note: python already uses E2C) +# ) + +# 
return offset_provider +# end + +SUITE = BenchmarkGroup() + +# Legacy Suite with first tests +SUITE["arith_broadcast"] = BenchmarkGroup() + +a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) +af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) +SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c +SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf + +# SUITE["field_operator"] = BenchmarkGroup() + +# # Benchmark for field operator addition +# function benchmark_fo_addition() +# a = Field(Cell, collect(1.0:15.0)) +# b = Field(Cell, collect(-1.0:-1:-15.0)) +# out = Field(Cell, zeros(Float64, 15)) + +# @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} +# return a .+ b +# end + +# @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) #setup=( +# # a = Field(Cell, collect(1.0:15.0)); +# # b = Field(Cell, collect(-1.0:-1:-15.0)); +# # out_field = Field(Cell, zeros(Float64, 15)); +# # @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} return a .+ b end; +# # ) +# end + +# SUITE["field_operator"]["addition"] = benchmark_fo_addition() + +# # Benchmark for neighbor sum +# function benchmark_fo_neighbor_sum() +# offset_provider = setup_simple_connectivity(); +# a = Field(Cell, collect(5.0:17.0) * 3); +# E2C = FieldOffset("E2C", source=Cell, target=(Edge, E2CDim)) +# out_field = Field(Edge, zeros(Float64, 12)) + +# @field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} +# return neighbor_sum(a(E2C), axis=E2CDim) +# end + +# @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, out=$out_field) +# end + +# SUITE["field_operator"]["neighbor_sum"] = benchmark_fo_neighbor_sum() + +run(SUITE, verbose = true, seconds = 1) diff --git a/benchmark/benchsuite_fo.jl b/benchmark/benchsuite_fo.jl deleted file mode 100644 index a208bc1..0000000 --- a/benchmark/benchsuite_fo.jl +++ /dev/null @@ -1,124 +0,0 @@ -using BenchmarkTools -using Statistics -using GridTools - -# Data size -const global STREAM_SIZE = 10000000 # 10 million - -# Mesh definitions -const global Cell_ = Dimension{:Cell_, HORIZONTAL} -const global Cell = Cell_() - -""" - julia_broadcast_addition_setup(ARRAY_SIZE::Int64) - -Setup function for the Julia broadcast addition benchmark. - -# Arguments -- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. - -# Returns -- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. -- `data_size`: The total size of the data processed. -""" -function julia_broadcast_addition_setup(ARRAY_SIZE::Int64) - a = rand(Int, ARRAY_SIZE) - b = rand(Int, ARRAY_SIZE) - data_size = sizeof(a) + sizeof(b) # Total bytes processed - return a, b, data_size -end - -""" - julia_broadcast_addition_operation(a, b) - -Core operation for the Julia broadcast addition benchmark. - -# Arguments -- `a, b`: Two arrays to be added. - -# Returns -- The result of element-wise addition of `a` and `b`. -""" -function julia_broadcast_addition_operation(a, b) - return a .+ b -end - -""" - fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) - -Setup function for the field operator broadcast addition benchmark. - -# Arguments -- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. - -# Returns -- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. 
-- `out`: An output field similar to `a`. -""" -function fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) - a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) - b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) - out = GridTools.similar_field(a) - return a, b, out -end - -""" - fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} - -Core operation for the field operator broadcast addition benchmark. - -# Arguments -- `a, b`: Two fields to be added. - -# Returns -- The result of element-wise addition of `a` and `b`. -""" -@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} - return a .+ b -end - -""" - compute_memory_bandwidth_addition(results, a, b, out) - -Function to compute the memory bandwidth for the addition benchmarks. - -# Arguments -- `results`: Benchmark results. -- `a, b`: The input arrays/fields used in the benchmark. -- `out`: The output array/field of the benchmark. - -# Returns -- The computed memory bandwidth in GB/s. -""" -function compute_memory_bandwidth_addition(results, a, b, out) - @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) - data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out - time_in_seconds = median(results.times) / 1e9 # Convert ns to s - bandwidth = data_size / time_in_seconds / 1e9 # GB/s - return bandwidth -end - -# Create the benchmark suite -suite = BenchmarkGroup() - -# Julia broadcast addition benchmark -a, b, data_size = julia_broadcast_addition_setup(STREAM_SIZE) -suite["julia_broadcast_addition"] = @benchmarkable $julia_broadcast_addition_operation($a, $b) - -# FO broadcast addition benchmark -a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) -suite["fo_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) - -# Run the benchmark suite -results = run(suite) - -# Process the results -julia_results = results["julia_broadcast_addition"] -fo_results = results["fo_broadcast_addition"] - -# Process and print the results -julia_bandwidth = compute_memory_bandwidth_addition(julia_results, a, b, a) # TODO: improve out -fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) - -println("Julia broadcast addition bandwidth: $julia_bandwidth GB/s") -println("FO broadcast addition bandwidth: $fo_bandwidth GB/s") From b9d4e2e52d0721e2114e6f3b3149b4a65b4620ea Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:36:11 +0200 Subject: [PATCH 15/53] Improve naming and type checking --- benchmark/benchmarks.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 2fcae28..f381dc3 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -10,7 +10,7 @@ const global Cell_ = Dimension{:Cell_, HORIZONTAL} const global Cell = Cell_() """ - julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + array_broadcast_addition_setup(ARRAY_SIZE::Int64) Setup function for the Julia broadcast addition benchmark. @@ -21,7 +21,7 @@ Setup function for the Julia broadcast addition benchmark. - `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. - `data_size`: The total size of the data processed. 
""" -function julia_broadcast_addition_setup(ARRAY_SIZE::Int64) +function array_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{Array{Float64,1}, Array{Float64,1}, Int64} a = rand(Float64, ARRAY_SIZE) b = rand(Float64, ARRAY_SIZE) data_size = sizeof(a) + sizeof(b) # Total bytes processed @@ -39,7 +39,7 @@ Core operation for the Julia broadcast addition benchmark. # Returns - The result of element-wise addition of `a` and `b`. """ -function broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64})::Array{Float64,1} return a .+ b end @@ -55,12 +55,12 @@ Useful to asses and track possible overhead on fields. # Returns - The result of element-wise addition of the data of the fields `a` and `b`. """ -function broadcast_addition_fields(a::Field, b::Field) +function broadcast_addition_fields(a::Field, b::Field)::Field return a .+ b end """ - fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) Setup function for the field operator broadcast addition benchmark. @@ -71,7 +71,7 @@ Setup function for the field operator broadcast addition benchmark. - `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. - `out`: An output field similar to `a`. """ -function fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) +function fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) out = GridTools.similar_field(a) @@ -106,7 +106,7 @@ Function to compute the memory bandwidth for the addition benchmarks. # Returns - The computed memory bandwidth in GB/s. """ -function compute_memory_bandwidth_addition(results, a, b, out) +function compute_memory_bandwidth_addition(results, a, b, out)::Float64 @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out time_in_seconds = median(results.times) / 1e9 # Convert ns to s @@ -121,15 +121,15 @@ suite = BenchmarkGroup() suite["addition"] = BenchmarkGroup() # Julia broadcast addition benchmark -a, b, data_size = julia_broadcast_addition_setup(STREAM_SIZE) +a, b, data_size = array_broadcast_addition_setup(STREAM_SIZE) suite["addition"]["array_broadcast_addition"] = @benchmarkable $broadcast_addition_array($a, $b) # Field broadcast addition benchmark -a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) suite["addition"]["fields_broadcast_addition"] = @benchmarkable $broadcast_addition_fields($a, $b) # Field Operator broadcast addition benchmark -a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) # Run the benchmark suite @@ -145,6 +145,6 @@ array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Ou fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) # Out is a temporary array with size a fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) -println("Array broadcast addition bandwidth: $array_bandwidth GB/s") -println("Fields data broadcast addition bandwidth: $fields_bandwidth GB/s") -println("Field Operator broadcast addition bandwidth: $fo_bandwidth GB/s") +println("Array broadcast addition 
bandwidth:\t\t$array_bandwidth GB/s") +println("Fields data broadcast addition bandwidth:\t$fields_bandwidth GB/s") +println("Field Operator broadcast addition bandwidth:\t$fo_bandwidth GB/s") From 2ab1421efe0c404c1f49f5dab3b4952bffd51bca Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 13 Aug 2024 11:14:07 +0200 Subject: [PATCH 16/53] Add draft of neighbour_sum benchmark --- benchmark/benchmarks_neighbour_sum.jl | 47 +++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 benchmark/benchmarks_neighbour_sum.jl diff --git a/benchmark/benchmarks_neighbour_sum.jl b/benchmark/benchmarks_neighbour_sum.jl new file mode 100644 index 0000000..3016d43 --- /dev/null +++ b/benchmark/benchmarks_neighbour_sum.jl @@ -0,0 +1,47 @@ + +using BenchmarkTools +using Statistics +using GridTools + +const N = 1_000_000 +const DIM_SIZE = sqrt(N) |> floor |> Int + +include("../test/mesh_definitions.jl") + +function create_large_connectivity(size::Int) + edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) + cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # Using the same for simplicity # TODO: to be removed + ) +end + +offset_provider = create_large_connectivity(DIM_SIZE) + +a = Field(Cell, collect(1.0:N)) +out_field = GridTools.similar_field(a) + +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + +# Benchmark the field operation +fo_benchmark = @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out_field) + +# Run the benchmark +results = run(fo_benchmark) + +# Memory bandwidth calculation +time_in_seconds = median(results.times) / 1e9 # convert ns to s +data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written +bandwidth = data_size / time_in_seconds / 1e9 # GB/s + +# Output results +println("Time taken: ", median(results.times) / 1e6, " ms") +println("Memory bandwidth: ", bandwidth, " GB/s") From 3062bdf7a23db8543c57aa1fc360f79862dec010 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:24:47 +0200 Subject: [PATCH 17/53] Add the benchmarks for sine and cosine field operators --- benchmark/benchmarks.jl | 141 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 3 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index f381dc3..65799c6 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -3,12 +3,30 @@ using Statistics using GridTools # Data size -const global STREAM_SIZE = 10000000 # 10 million +const global STREAM_SIZE = 10_000_000 # Mesh definitions const global Cell_ = Dimension{:Cell_, HORIZONTAL} const global Cell = Cell_() +""" + single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + +Setup function to create a field and a similar output field for benchmarking operations that require a single input field. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the field to be generated. + +# Returns +- `a`: A randomly generated field of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. 
+""" +function single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, out +end + """ array_broadcast_addition_setup(ARRAY_SIZE::Int64) @@ -93,6 +111,93 @@ Core operation for the field operator broadcast addition benchmark. return a .+ b end +""" + sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the sine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +function sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the cosine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +function cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the sine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +@field_operator function fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the cosine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +@field_operator function fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. + +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. 
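+
+For instance, with hypothetical numbers (one 80 MB input read and one 80 MB output written in a median time of 20 ms): 1.6e8 B / 0.02 s / 1e9 = 8 GB/s.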
+""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + """ compute_memory_bandwidth_addition(results, a, b, out) @@ -132,6 +237,22 @@ suite["addition"]["fields_broadcast_addition"] = @benchmarkable $broadcast_addit a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) +# Sine without field operator benchmark +a, out = single_field_setup(STREAM_SIZE) +suite["trigonometry"]["sin"] = @benchmarkable $sin_without_fo($a) + +# Field operator sine benchmark +a, out = single_field_setup(STREAM_SIZE) +suite["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embedded", out=$out) + +# Cosine without field operator benchmark +a, out = single_field_setup(STREAM_SIZE) +suite["trigonometry"]["cos"] = @benchmarkable $cos_without_fo($a) + +# Field operator cosine benchmark +a, out = single_field_setup(STREAM_SIZE) +suite["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) + # Run the benchmark suite results = run(suite) @@ -139,12 +260,26 @@ results = run(suite) array_results = results["addition"]["array_broadcast_addition"] fields_results = results["addition"]["fields_broadcast_addition"] fo_results = results["addition"]["field_op_broadcast_addition"] +sin_results = results["trigonometry"]["sin"] +fo_sin_results = results["trigonometry"]["field_op_sin"] +cos_results = results["trigonometry"]["cos"] +fo_cos_results = results["trigonometry"]["field_op_cos"] # Process and print the results -array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size a -fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) # Out is a temporary array with size a +array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a +fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) +sin_bandwidth = compute_memory_bandwidth_single(sin_results, a) +fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) +cos_bandwidth = compute_memory_bandwidth_single(cos_results, a) +fo_cos_bandwidth = compute_memory_bandwidth_single(fo_cos_results, a) + +# Print the results println("Array broadcast addition bandwidth:\t\t$array_bandwidth GB/s") println("Fields data broadcast addition bandwidth:\t$fields_bandwidth GB/s") println("Field Operator broadcast addition bandwidth:\t$fo_bandwidth GB/s") +println("Sine operation bandwidth (no field operator):\t$sin_bandwidth GB/s") +println("Field Operator sine bandwidth:\t$fo_sin_bandwidth GB/s") +println("Cosine operation bandwidth (no field operator):\t$cos_bandwidth GB/s") +println("Field Operator cosine bandwidth:\t$fo_cos_bandwidth GB/s") From 323c269839cec847734a44c1f4190605193c32ea Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:04:15 +0200 Subject: [PATCH 18/53] Add benchmarks for remapping --- benchmark/benchmarks_remapping.jl | 71 +++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 benchmark/benchmarks_remapping.jl diff 
--git a/benchmark/benchmarks_remapping.jl b/benchmark/benchmarks_remapping.jl new file mode 100644 index 0000000..66470a6 --- /dev/null +++ b/benchmark/benchmarks_remapping.jl @@ -0,0 +1,71 @@ +using BenchmarkTools +using Statistics +using GridTools + +const N = 10_000_000 |> floor |> Int # Adjust as needed (10 millions is the SLURM test size) + +include("../test/mesh_definitions.jl") # Ensure all necessary mesh and dimension definitions are loaded + +# Unstructured Mesh ------------------------------------------------------------------------------------------ + +function create_large_connectivity(size::Int) + edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) + cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +offset_provider = create_large_connectivity(N) + +a = Field(Cell, collect(1.0:N)) +out_field = GridTools.similar_field(a) + +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) +end + +# Benchmark the field remapping operation +remapping_benchmark = @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out_field) + +# Run the benchmark +results = run(remapping_benchmark) + +# Memory bandwidth calculation +unstr_time_in_seconds = median(results.times) / 1e9 # convert ns to s +unstr_data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written +unstr_bandwidth = unstr_data_size / unstr_time_in_seconds / 1e9 # GB/s + +# Output results +println("Time taken: ", median(results.times) / 1e6, " ms") +println("Memory bandwidth for Unstructured Mesh Remapping: ", unstr_bandwidth, " GB/s") + +# Cartesian Mesh --------------------------------------------------------------------------------------------- + +# Cartesian Offset Field Operator +@field_operator function fo_cartesian_offset(inp::Field{Tuple{K_},Float64})::Field{Tuple{K_},Float64} + return inp(Koff[1]) +end + +# Create and benchmark the Cartesian offset operation +a = Field(K, collect(1.0:N)) +out_field = Field(K, zeros(Float64, N-1)) +cartesian_offset_provider = Dict("Koff" => K) + +cartesian_benchmark = @benchmarkable $fo_cartesian_offset($a, backend="embedded", out=$out_field, offset_provider=$cartesian_offset_provider) +cartesian_results = run(cartesian_benchmark) + +# Memory bandwidth calculation +cartesian_time_in_seconds = median(cartesian_results.times) / 1e9 # convert ns to s +cartesian_data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written +cartesian_bandwidth = cartesian_data_size / cartesian_time_in_seconds / 1e9 # GB/s + +# Output results +println("Time taken for Cartesian Mesh Offset: ", median(cartesian_results.times) / 1e6, " ms") +println("Memory bandwidth for Cartesian Mesh Offset: ", cartesian_bandwidth, " GB/s") From 9c138c2216c481c3e73d8a631fac559d4cb1d82e Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:30:13 +0200 Subject: [PATCH 19/53] Add draft mpdata --- benchmark/benchmark_mpdata.jl | 94 +++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 benchmark/benchmark_mpdata.jl diff --git a/benchmark/benchmark_mpdata.jl b/benchmark/benchmark_mpdata.jl new file mode 100644 index 0000000..34a1e49 --- /dev/null +++ 
b/benchmark/benchmark_mpdata.jl @@ -0,0 +1,94 @@ +# benchmark_mpdata.jl - Benchmarking for atlas advection code + +using BenchmarkTools +using GridTools # Assuming all necessary functionality like Field, Dimension are defined here +using Statistics +using Printf + +Cell_ = Dimension{:Cell_, HORIZONTAL} +Edge_ = Dimension{:Edge_, HORIZONTAL} +Vertex_ = Dimension{:Vertex_, HORIZONTAL} +K_ = Dimension{:K_, VERTICAL} +V2VDim_ = Dimension{:V2V_, LOCAL} +V2EDim_ = Dimension{:V2E_, LOCAL} +E2VDim_ = Dimension{:E2V_, LOCAL} +Cell = Cell_() +K = K_() +Edge = Edge_() +Vertex = Vertex_() +V2VDim = V2VDim_() +V2EDim = V2EDim_() +E2VDim = E2VDim_() + +V2V = FieldOffset("V2V", source = Vertex, target = (Vertex, V2VDim)) +E2V = FieldOffset("E2V", source = Vertex, target = (Edge, E2VDim)) +V2E = FieldOffset("V2E", source = Edge, target = (Vertex, V2EDim)) +Koff = FieldOffset("Koff", source = K, target = K) + +include("../src/atlas/atlas_mesh.jl") +include("../src/atlas/state_container.jl") +include("../src/atlas/metric.jl") +include("../src/atlas/advection.jl") + +# Function to set up and run the benchmark +function benchmark_mpdata() + # Set up the environment or load data + grid = atlas.StructuredGrid("O50") + mesh = AtlasMesh(grid, num_level = 30) + + # Define dimensions based on the mesh properties + vertex_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[Vertex]) + k_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[K]) + edge_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[Edge]) + + # Set parameters + δt = 1800.0 # time step in s + eps = 1.0e-8 + niter = 50 # Adjust based on how long you want the benchmark to run + + # Initialize fields and metrics + state = sc_from_mesh(mesh) + state_next = sc_from_mesh(mesh) + tmp_fields = Dict{String, Field}() + for i = 1:6 + tmp_fields[@sprintf("tmp_vertex_%d", i)] = Field((Vertex, K), zeros(vertex_dim, k_dim)) + end + for j = 1:3 + tmp_fields[@sprintf("tmp_edge_%d", j)] = Field((Edge, K), zeros(edge_dim, k_dim)) + end + + # Benchmark the mpdata_program + println("Starting the benchmark for mpdata_program...") + bench_result = @benchmark begin + mpdata_program( + state.rho, + δt, + eps, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + tmp_fields["tmp_vertex_1"], + tmp_fields["tmp_vertex_2"], + tmp_fields["tmp_vertex_3"], + tmp_fields["tmp_vertex_4"], + tmp_fields["tmp_vertex_5"], + tmp_fields["tmp_vertex_6"], + tmp_fields["tmp_edge_1"], + tmp_fields["tmp_edge_2"], + tmp_fields["tmp_edge_3"] + ) + end + + # Output benchmark results + println("Benchmark completed.") + display(bench_result) +end + +# Run the benchmark function +benchmark_mpdata() From 700a545f2bc481bc5d14cab254fec35c25e0a8a5 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 09:47:35 +0200 Subject: [PATCH 20/53] Clear benchmarks.jl and add remapping --- benchmark/benchmarks.jl | 169 ++++++++++++++++++++++++++++------------ 1 file changed, 121 insertions(+), 48 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 65799c6..18b7743 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -1,13 +1,78 @@ using BenchmarkTools using Statistics using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian # Data size const global STREAM_SIZE = 10_000_000 -# Mesh definitions -const global Cell_ = Dimension{:Cell_, 
HORIZONTAL} -const global Cell = Cell_() +# Utils ------------------------------------------------------------------------------------------------------ + +# Useful for the benchmark of the field remapping operation +function create_large_connectivity(size::Int) + edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) + cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. + +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. +""" +function compute_memory_bandwidth_addition(results, a, b, out)::Float64 + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +# Operations ------------------------------------------------------------------------------------------------- """ single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} @@ -172,52 +237,23 @@ Field operator that applies the cosine function element-wise to the data of a fi end """ - compute_memory_bandwidth_single(results, a, out)::Float64 + fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} -Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. +Field operator that performs remapping from cell-based data to edge-based data. -This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. 
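# A worked instance of the bandwidth formula described above, with assumed (not measured)
# numbers: one field of 10_000_000 Float64 values occupies 8.0e7 bytes, so reading it once
# and writing the result once moves 1.6e8 bytes; at a median time of 20 ms (2.0e-2 s) the
# estimate is 1.6e8 / 2.0e-2 / 1e9 = 8.0 GB/s.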
+This operator utilizes a connectivity table (`E2C`) to map the values from cells to edges, implying a transformation from the cell-centered field to an edge-centered field based on predefined relationships in the connectivity table. # Arguments -- `results`: The benchmark results object containing timing and other performance data. -- `a`: The input field used in the benchmark. -- `out`: The output field produced by the benchmark. +- `a`: Input field containing Float64 data structured around cells. # Returns -- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. - -# Calculation Details -- `data_size`: Sum of the sizes of the input and output data in bytes. -- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. -- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +- A new field where each element represents data remapped from cells to edges, structured as specified by the edge-to-cell connectivity. """ -function compute_memory_bandwidth_single(results, a, out=a)::Float64 - data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out - time_in_seconds = median(results.times) / 1e9 # Convert ns to s - bandwidth = data_size / time_in_seconds / 1e9 # GB/s - return bandwidth +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) end -""" - compute_memory_bandwidth_addition(results, a, b, out) - -Function to compute the memory bandwidth for the addition benchmarks. - -# Arguments -- `results`: Benchmark results. -- `a, b`: The input arrays/fields used in the benchmark. -- `out`: The output array/field of the benchmark. - -# Returns -- The computed memory bandwidth in GB/s. 
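# A minimal sketch of how one entry of such a suite is tuned and queried with the generic
# BenchmarkTools API (`demo` and `"axpy"` are placeholder names, not part of this package):
demo = BenchmarkGroup()
demo["axpy"] = @benchmarkable x .+ y setup=(x = rand(10^6); y = rand(10^6))
tune!(demo)                          # choose evals/samples per benchmark
trials = run(demo, verbose = false)
median(trials["axpy"].times)         # median time in ns, the statistic fed to the bandwidth helpers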
-""" -function compute_memory_bandwidth_addition(results, a, b, out)::Float64 - @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) - data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out - time_in_seconds = median(results.times) / 1e9 # Convert ns to s - bandwidth = data_size / time_in_seconds / 1e9 # GB/s - return bandwidth -end +# Benchmark -------------------------------------------------------------------------------------------------- # Create the benchmark suite suite = BenchmarkGroup() @@ -253,6 +289,12 @@ suite["trigonometry"]["cos"] = @benchmarkable $cos_without_fo($a) a, out = single_field_setup(STREAM_SIZE) suite["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) +# Benchmark the field remapping operation +offset_provider = create_large_connectivity(STREAM_SIZE) +a, out = single_field_setup(STREAM_SIZE) +suite["remapping"]["field_operator"] = + @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out) + # Run the benchmark suite results = run(suite) @@ -264,6 +306,7 @@ sin_results = results["trigonometry"]["sin"] fo_sin_results = results["trigonometry"]["field_op_sin"] cos_results = results["trigonometry"]["cos"] fo_cos_results = results["trigonometry"]["field_op_cos"] +remapping_results = results["remapping"]["field_operator"] # Process and print the results array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a @@ -275,11 +318,41 @@ fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) cos_bandwidth = compute_memory_bandwidth_single(cos_results, a) fo_cos_bandwidth = compute_memory_bandwidth_single(fo_cos_results, a) -# Print the results -println("Array broadcast addition bandwidth:\t\t$array_bandwidth GB/s") -println("Fields data broadcast addition bandwidth:\t$fields_bandwidth GB/s") -println("Field Operator broadcast addition bandwidth:\t$fo_bandwidth GB/s") -println("Sine operation bandwidth (no field operator):\t$sin_bandwidth GB/s") -println("Field Operator sine bandwidth:\t$fo_sin_bandwidth GB/s") -println("Cosine operation bandwidth (no field operator):\t$cos_bandwidth GB/s") -println("Field Operator cosine bandwidth:\t$fo_cos_bandwidth GB/s") +remapping_bandwidth = compute_memory_bandwidth_single(remapping_results, a) + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +# Process and print the results along with the time taken for each +println("Array broadcast addition:") +println("\tBandwidth: $array_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(array_results.times))) ms\n") + +println("Fields data broadcast addition:") +println("\tBandwidth: $fields_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fields_results.times))) ms\n") + +println("Field Operator broadcast addition:") +println("\tBandwidth: $fo_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_results.times))) ms\n") + +println("Sine operation (no field operator):") +println("\tBandwidth: $sin_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(sin_results.times))) ms\n") + +println("Field Operator sine operation:") +println("\tBandwidth: $fo_sin_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_sin_results.times))) ms\n") + +println("Cosine operation (no field operator):") +println("\tBandwidth: $cos_bandwidth GB/s") +println("\tTime taken: 
$(ns_to_ms(median(cos_results.times))) ms\n") + +println("Field Operator cosine operation:") +println("\tBandwidth: $fo_cos_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_cos_results.times))) ms\n") + +println("Field Operator Remapping:") +println("\tBandwidth: $remapping_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") + From 35902807cb788be4de1e691ce7a38dbc0ac83a8a Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:00:30 +0200 Subject: [PATCH 21/53] Add neighbor sum benchmark to the suite --- benchmark/benchmarks.jl | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 18b7743..f5ff83e 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -253,6 +253,23 @@ This operator utilizes a connectivity table (`E2C`) to map the values from cells return a(E2C[1]) end +""" + fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that computes the sum of neighboring cell values for each edge. This function leverages the connectivity table (`E2C`), which defines the relationship between edges and cells, to sum the values of cells that are connected to each edge. + +The summation is performed across the dimension specified by `E2CDim`, ensuring that each edge aggregates values from its associated cells correctly. + +# Arguments +- `a`: Input field containing Float64 data, where each cell contains a numerical value. + +# Returns +- A new field where each edge holds the summed value of its neighboring cells, based on the edge-to-cell connectivity defined in `E2C`. +""" +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + # Benchmark -------------------------------------------------------------------------------------------------- # Create the benchmark suite @@ -295,6 +312,12 @@ a, out = single_field_setup(STREAM_SIZE) suite["remapping"]["field_operator"] = @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out) +# Benchmark the field neighbor sum operation +offset_provider = create_large_connectivity(STREAM_SIZE) +a, out = single_field_setup(STREAM_SIZE) +suite["neighbor_sum"]["field_operator"] = + @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out) + # Run the benchmark suite results = run(suite) @@ -307,6 +330,7 @@ fo_sin_results = results["trigonometry"]["field_op_sin"] cos_results = results["trigonometry"]["cos"] fo_cos_results = results["trigonometry"]["field_op_cos"] remapping_results = results["remapping"]["field_operator"] +neighbor_sum_results = results["neighbor_sum"]["field_operator"] # Process and print the results array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a @@ -318,8 +342,6 @@ fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) cos_bandwidth = compute_memory_bandwidth_single(cos_results, a) fo_cos_bandwidth = compute_memory_bandwidth_single(fo_cos_results, a) -remapping_bandwidth = compute_memory_bandwidth_single(remapping_results, a) - # Function to convert nanoseconds to milliseconds for clearer output ns_to_ms(time_ns) = time_ns / 1e6 @@ -353,6 +375,7 @@ println("\tBandwidth: $fo_cos_bandwidth GB/s") 
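# A small worked example of the two reductions benchmarked above, under the assumed
# semantics spelled out in the docstrings (one E2C row per edge, listing its two cells):
#
#   a (values per cell)  = [10.0, 20.0, 30.0]
#   E2C row for edge 1   = (1, 3)
#   a(E2C[1])                          at edge 1  ->  a[1]        = 10.0  (first neighbour only)
#   neighbor_sum(a(E2C), axis=E2CDim)  at edge 1  ->  a[1] + a[3] = 40.0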
println("\tTime taken: $(ns_to_ms(median(fo_cos_results.times))) ms\n") println("Field Operator Remapping:") -println("\tBandwidth: $remapping_bandwidth GB/s") println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") +println("Field Operator Neighbor Sum:") +println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") From 0be7ec1218719fe814dbe1d9290d8db46237fb2a Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:00:53 +0200 Subject: [PATCH 22/53] Fix dependencies in benchmarks --- benchmark/benchmarks_neighbour_sum.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/benchmarks_neighbour_sum.jl b/benchmark/benchmarks_neighbour_sum.jl index 3016d43..a374f40 100644 --- a/benchmark/benchmarks_neighbour_sum.jl +++ b/benchmark/benchmarks_neighbour_sum.jl @@ -2,12 +2,11 @@ using BenchmarkTools using Statistics using GridTools +using GridTools.ExampleMeshes.Unstructured const N = 1_000_000 const DIM_SIZE = sqrt(N) |> floor |> Int -include("../test/mesh_definitions.jl") - function create_large_connectivity(size::Int) edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) From 62276b057b0f0ff38897b88c3825a5e226bb5f29 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:20:13 +0200 Subject: [PATCH 23/53] Add verbose flag to avoid printing --- advection/advection_miniapp.jl | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/advection/advection_miniapp.jl b/advection/advection_miniapp.jl index b4230a9..9baf6a5 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_miniapp.jl @@ -8,6 +8,7 @@ using Profile using GridTools const global VISUALIZATION_FLAG::Bool=false +const global VERBOSE_FLAG::Bool=true # Mesh Definitions -------------------------------------------------------------------------------------------- # Define dimensions for the mesh @@ -215,7 +216,9 @@ for i = 1:niter ) # Print the current timestep - println("Timestep $i") + if VERBOSE_FLAG + println("Timestep $i") + end if VISUALIZATION_FLAG # Print the current state as ASCII art every 5 timesteps @@ -232,9 +235,11 @@ for i = 1:niter update_periodic_layers(mesh, state.rho) end -# Output the final statistics for the scalar field (rho) and velocity fields -println( - "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" -) -println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") -println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") +if VERBOSE_FLAG + # Output the final statistics for the scalar field (rho) and velocity fields + println( + "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" + ) + println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") + println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") +end From 1bbb1e64d8d88aa374d91dc1418c35bc9a8449fb Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:56:53 +0200 Subject: [PATCH 24/53] Quick fix to the unary/binary operation support --- advection/advection.jl | 12 ++---------- src/gt2py/jast_to_foast.jl | 1 + 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/advection/advection.jl 
b/advection/advection.jl index 19f7741..35cbece 100644 --- a/advection/advection.jl +++ b/advection/advection.jl @@ -149,7 +149,8 @@ end )::Field{Tuple{Vertex_, K_}, Float64} zrhin = (1.0 ./ vol) .* neighbor_sum( - -min.(0.0, flux(V2E)) .* max.(0.0, dual_face_orientation) - + # TODO: fix the 0-min workaround due to the binary/unary operation issue + (broadcast(0., (Vertex, V2EDim, K)) .- min.(0.0, flux(V2E))) .* max.(0.0, dual_face_orientation) - max.(0.0, flux(V2E)) .* min.(0.0, dual_face_orientation), axis = V2EDim, ) @@ -227,15 +228,6 @@ end dual_face_orientation::Field{Tuple{Vertex_, V2EDim_}, Float64}, dual_face_normal_weighted_x::Field{Tuple{Edge_}, Float64}, dual_face_normal_weighted_y::Field{Tuple{Edge_}, Float64}, - tmp_vertex_1::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_2::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_3::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_4::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_5::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_6::Field{Tuple{Vertex_, K_}, Float64}, - tmp_edge_1::Field{Tuple{Edge_, K_}, Float64}, - tmp_edge_2::Field{Tuple{Edge_, K_}, Float64}, - tmp_edge_3::Field{Tuple{Edge_, K_}, Float64}, ) tmp_edge_1 = advector_normal( diff --git a/src/gt2py/jast_to_foast.jl b/src/gt2py/jast_to_foast.jl index c843059..f0663c7 100644 --- a/src/gt2py/jast_to_foast.jl +++ b/src/gt2py/jast_to_foast.jl @@ -266,6 +266,7 @@ end function visit_(sym::Val{:call}, args::Array, outer_loc) if args[1] in bin_op + # TODO: check the case where a unary expression, that is at the same time binary operation is encountered: i.e. -x @assert length(args)==3 "Expected a binary operation. AST must be canonicalized using `canonicalize_arithmetic_ops` first." return foast.BinOp( op = visit(args[1]), From 17a55f8af2b655a526ba0127142df8a6035860f2 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:57:42 +0200 Subject: [PATCH 25/53] Use the ExampleMeshes in Atlas miniapp (with workaround on the offset_provider) --- advection/advection_miniapp.jl | 28 ++-------------------------- src/atlas/atlas_mesh.jl | 6 +++++- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/advection/advection_miniapp.jl b/advection/advection_miniapp.jl index 9baf6a5..0073430 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_miniapp.jl @@ -6,35 +6,11 @@ using Debugger using Statistics using Profile using GridTools +using GridTools.ExampleMeshes.Unstructured const global VISUALIZATION_FLAG::Bool=false const global VERBOSE_FLAG::Bool=true -# Mesh Definitions -------------------------------------------------------------------------------------------- -# Define dimensions for the mesh -Cell_ = Dimension{:Cell_, HORIZONTAL} -Edge_ = Dimension{:Edge_, HORIZONTAL} -Vertex_ = Dimension{:Vertex_, HORIZONTAL} -K_ = Dimension{:K_, VERTICAL} -V2VDim_ = Dimension{:V2V_, LOCAL} -V2EDim_ = Dimension{:V2E_, LOCAL} -E2VDim_ = Dimension{:E2V_, LOCAL} - -# Instantiate dimension objects -Cell = Cell_() -K = K_() -Edge = Edge_() -Vertex = Vertex_() -V2VDim = V2VDim_() -V2EDim = V2EDim_() -E2VDim = E2VDim_() - -# Define field offsets to describe the relationships between different dimensions -V2V = FieldOffset("V2V", source = Vertex, target = (Vertex, V2VDim)) -E2V = FieldOffset("E2V", source = Vertex, target = (Edge, E2VDim)) -V2E = FieldOffset("V2E", source = Edge, target = (Vertex, V2EDim)) -Koff = FieldOffset("Koff", source = K, target = K) - # Include additional necessary files for mesh, state 
container, metric calculations, and advection operations include("../src/atlas/atlas_mesh.jl") include("state_container.jl") @@ -50,7 +26,7 @@ mesh = AtlasMesh(grid, num_level = 30) # Simulation Parameters --------------------------------------------------------------------------------------- δt = 1800.0 # time step in s niter = 50 -ε = 1.0e-8 +ϵ = 1.0e-8 # Calculate metric properties from the mesh metric = m_from_mesh(mesh) diff --git a/src/atlas/atlas_mesh.jl b/src/atlas/atlas_mesh.jl index be45be3..d8f947b 100644 --- a/src/atlas/atlas_mesh.jl +++ b/src/atlas/atlas_mesh.jl @@ -260,7 +260,11 @@ struct AtlasMesh "V2V" => v2v, "V2E" => v2e, "E2V" => e2v, - "Koff" => K + "Koff" => K, + # TODO: cleanup + "V2VDim" => v2v, + "V2EDim" => v2e, + "E2VDim" => e2v, ) remote_indices = Dict{Dimension, Array}( From fcb44cd09e06d02b219db3dda9e480ab6cf9652f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:58:11 +0200 Subject: [PATCH 26/53] Add benchmark for mp_data --- benchmark/benchmarks.jl | 59 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index f5ff83e..2ebf5f0 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -270,7 +270,7 @@ The summation is performed across the dimension specified by `E2CDim`, ensuring return neighbor_sum(a(E2C), axis=E2CDim) end -# Benchmark -------------------------------------------------------------------------------------------------- +# Benchmarks ------------------------------------------------------------------------------------------------- # Create the benchmark suite suite = BenchmarkGroup() @@ -379,3 +379,60 @@ println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") println("Field Operator Neighbor Sum:") println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") + +# Advection Benchmarks + +include("../advection/advection_miniapp.jl") + +println("Starting julia embedded benchmark") + +suite["advection"]["mpdata_program_julia_embedded"] = @benchmark mpdata_program( + state.rho, + δt, + ϵ, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + +println("Finished Julia embedded benchmark") + +# TODO: disabled because the backend is not currently supporting it (the backend is too slow) +# println("Starting julia python benchmark") + +# suite["advection"]["mpdata_program_julia_pyback"] = @benchmark mpdata_program( +# state.rho, +# δt, +# ϵ, +# mesh.vol, +# metric.gac, +# state.vel[1], +# state.vel[2], +# state.vel[3], +# mesh.pole_edge_mask, +# mesh.dual_face_orientation, +# mesh.dual_face_normal_weighted_x, +# mesh.dual_face_normal_weighted_y, +# out = state_next.rho, +# offset_provider = mesh.offset_provider, +# backend = "py" +# ) + +# println("Finished Julia python backend benchmark") + +mpdata_emb_results = results["advection"]["mpdata_program_julia_embedded"] +# mpdata_pyback_results = results["advection"]["mpdata_program_julia_pyback"] + +println("mpdata_program julia embedded version:") +println("\tTime taken: $(ns_to_ms(median(mpdata_emb_results.times))) ms\n") + +# println("mpdata_program julia with python backend:") +# println("\tTime taken: $(ns_to_ms(median(mpdata_pyback_results.times))) ms\n") From 
22d1257051617c2376a95652b7a1d78f9e42341b Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:07:36 +0200 Subject: [PATCH 27/53] Fix slicing operation in the advection_miniapp --- advection/advection.jl | 5 ++-- advection/advection_miniapp.jl | 4 +-- benchmark/benchmarks.jl | 54 ++++++++++++++++++++-------------- src/GridTools.jl | 5 ++-- src/embedded/cust_broadcast.jl | 2 +- test/gt2py_fo_exec.jl | 19 ++++++++++++ 6 files changed, 59 insertions(+), 30 deletions(-) diff --git a/advection/advection.jl b/advection/advection.jl index 35cbece..159a2ef 100644 --- a/advection/advection.jl +++ b/advection/advection.jl @@ -6,11 +6,10 @@ level_indices::Field{Tuple{K_}, Int64}, num_level::Int64 )::Field{Tuple{Vertex_, K_}, Float64} - return where( - level_indices .== num_level - 1, + level_indices .== 0, lower, - where(slice(level_indices .== 0, 1:29), upper, interior) + where(slice(level_indices .== 29, 2:30), upper, interior) ) end diff --git a/advection/advection_miniapp.jl b/advection/advection_miniapp.jl index 0073430..cdc72e0 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_miniapp.jl @@ -9,7 +9,7 @@ using GridTools using GridTools.ExampleMeshes.Unstructured const global VISUALIZATION_FLAG::Bool=false -const global VERBOSE_FLAG::Bool=true +const global VERBOSE_FLAG::Bool=false # Include additional necessary files for mesh, state container, metric calculations, and advection operations include("../src/atlas/atlas_mesh.jl") @@ -20,7 +20,7 @@ include("visualization_utils.jl") # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation -grid = atlas.StructuredGrid("O50") +grid = atlas.StructuredGrid("O10") mesh = AtlasMesh(grid, num_level = 30) # Simulation Parameters --------------------------------------------------------------------------------------- diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 2ebf5f0..5e61886 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -5,14 +5,14 @@ using GridTools.ExampleMeshes.Unstructured using GridTools.ExampleMeshes.Cartesian # Data size -const global STREAM_SIZE = 10_000_000 +const global STREAM_SIZE = 100 # Utils ------------------------------------------------------------------------------------------------------ # Useful for the benchmark of the field remapping operation function create_large_connectivity(size::Int) - edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) - cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) + edge_to_cell_table = vcat([rand(1:size, (1, 2)) for _ in 1:size]...) + cell_to_edge_table = vcat([rand(1:size, (1, 3)) for _ in 1:size]...) E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) @@ -64,12 +64,12 @@ Function to compute the memory bandwidth for the addition benchmarks. # Returns - The computed memory bandwidth in GB/s. 
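# On the connectivity fix above: hcat of N length-2 vectors produces a 2×N matrix, while
# vcat of N 1×2 rows produces the intended N×2 table with one row per edge. A quick shape
# check (plain Base, illustrative sizes only):
#
#   julia> size(hcat([rand(1:5, 2) for _ in 1:4]...))
#   (2, 4)
#
#   julia> size(vcat([rand(1:5, (1, 2)) for _ in 1:4]...))
#   (4, 2)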
""" -function compute_memory_bandwidth_addition(results, a, b, out)::Float64 +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out time_in_seconds = median(results.times) / 1e9 # Convert ns to s bandwidth = data_size / time_in_seconds / 1e9 # GB/s - return bandwidth + return bandwidth, data_size end # Operations ------------------------------------------------------------------------------------------------- @@ -280,11 +280,11 @@ suite["addition"] = BenchmarkGroup() # Julia broadcast addition benchmark a, b, data_size = array_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["array_broadcast_addition"] = @benchmarkable $broadcast_addition_array($a, $b) +suite["addition"]["array_broadcast_addition"] = @benchmarkable broadcast_addition_array(a, b) setup=((a, b, data_size) = $array_broadcast_addition_setup($STREAM_SIZE); ) #a=$a; b=$b) # Field broadcast addition benchmark a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["fields_broadcast_addition"] = @benchmarkable $broadcast_addition_fields($a, $b) +suite["addition"]["fields_broadcast_addition"] = @benchmarkable broadcast_addition_fields($a, $b) # Field Operator broadcast addition benchmark a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) @@ -292,7 +292,7 @@ suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($ # Sine without field operator benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["sin"] = @benchmarkable $sin_without_fo($a) +suite["trigonometry"]["sin"] = @benchmarkable sin_without_fo($a) # Field operator sine benchmark a, out = single_field_setup(STREAM_SIZE) @@ -300,7 +300,7 @@ suite["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embe # Cosine without field operator benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["cos"] = @benchmarkable $cos_without_fo($a) +suite["trigonometry"]["cos"] = @benchmarkable cos_without_fo($a) # Field operator cosine benchmark a, out = single_field_setup(STREAM_SIZE) @@ -319,6 +319,7 @@ suite["neighbor_sum"]["field_operator"] = @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out) # Run the benchmark suite +println("Running the benchmark suite...") results = run(suite) # Process the results @@ -332,10 +333,10 @@ fo_cos_results = results["trigonometry"]["field_op_cos"] remapping_results = results["remapping"]["field_operator"] neighbor_sum_results = results["neighbor_sum"]["field_operator"] -# Process and print the results -array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a -fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) -fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) +# Compute memory bandwidth +array_bandwidth, data_size_arr = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a +fields_bandwidth, data_size_fields = compute_memory_bandwidth_addition(fields_results, a, b, a) +fo_bandwidth, data_size_fo = compute_memory_bandwidth_addition(fo_results, a, b, out) sin_bandwidth = compute_memory_bandwidth_single(sin_results, a) fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) @@ -347,14 +348,17 @@ ns_to_ms(time_ns) = 
time_ns / 1e6 # Process and print the results along with the time taken for each println("Array broadcast addition:") +println("\tData size: $data_size_arr") println("\tBandwidth: $array_bandwidth GB/s") println("\tTime taken: $(ns_to_ms(median(array_results.times))) ms\n") println("Fields data broadcast addition:") +println("\tData size: $data_size_fields") println("\tBandwidth: $fields_bandwidth GB/s") println("\tTime taken: $(ns_to_ms(median(fields_results.times))) ms\n") println("Field Operator broadcast addition:") +println("\tData size: $data_size_fo") println("\tBandwidth: $fo_bandwidth GB/s") println("\tTime taken: $(ns_to_ms(median(fo_results.times))) ms\n") @@ -384,9 +388,11 @@ println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") include("../advection/advection_miniapp.jl") -println("Starting julia embedded benchmark") +avection_suite = BenchmarkGroup() + +println("Starting Advection Benchmark (julia embedded)") -suite["advection"]["mpdata_program_julia_embedded"] = @benchmark mpdata_program( +avection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmark $mpdata_program( state.rho, δt, ϵ, @@ -403,12 +409,12 @@ suite["advection"]["mpdata_program_julia_embedded"] = @benchmark mpdata_program( offset_provider = mesh.offset_provider ) -println("Finished Julia embedded benchmark") +println("Finished Advection Benchmark (julia embedded)") # TODO: disabled because the backend is not currently supporting it (the backend is too slow) -# println("Starting julia python benchmark") +# println("Starting Advection Benchmark (julia-python)") -# suite["advection"]["mpdata_program_julia_pyback"] = @benchmark mpdata_program( +# advection_suite["advection"]["mpdata_program_julia_pyback"] = @benchmark mpdata_program( # state.rho, # δt, # ϵ, @@ -426,13 +432,17 @@ println("Finished Julia embedded benchmark") # backend = "py" # ) -# println("Finished Julia python backend benchmark") +# println("Finished Advection Benchmark (julia-python)") + +# Run the benchmark suite +println("Running the advection suite...") +# advection_results = run(avection_suite) -mpdata_emb_results = results["advection"]["mpdata_program_julia_embedded"] +# mpdata_emb_results = advection_results["advection"]["mpdata_program_julia_embedded"] # mpdata_pyback_results = results["advection"]["mpdata_program_julia_pyback"] -println("mpdata_program julia embedded version:") -println("\tTime taken: $(ns_to_ms(median(mpdata_emb_results.times))) ms\n") +# println("mpdata_program julia embedded version:") +# println("\tTime taken: $(ns_to_ms(median(mpdata_emb_results.times))) ms\n") # println("mpdata_program julia with python backend:") # println("\tTime taken: $(ns_to_ms(median(mpdata_pyback_results.times))) ms\n") diff --git a/src/GridTools.jl b/src/GridTools.jl index 580be0a..083cbd7 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -475,6 +475,7 @@ Base.convert(t::Type{T}, F::Field) where {T <: Number} = inds::Vararg{Int, N} ) where {BD, T, N} new_inds = inds .- F.origin + # @assert Tuple(1 for i in 1:length(new_inds)) <= new_inds <= size(F.data) "Error: $new_inds, $(size(F.data)), $(F.origin)" return F.data[new_inds...] 
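    # The subtraction above makes indexing origin-aware: for the field produced by
    # slice(a, 2:4) in test_slice (data == [2, 3, 4], origin == (1,)), F[2] resolves to
    # F.data[2 - 1] == 2, i.e. the element at global index 2 of the parent field.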
end @propagate_inbounds function Base.setindex!( @@ -488,8 +489,9 @@ end Base.showarg(io::IO, @nospecialize(F::Field), toplevel) = print(io, eltype(F), " Field with dimensions ", get_dim_name.(F.broadcast_dims)) function slice(F::Field, inds...)::Field + @assert all(typeof(x) <: UnitRange{Int64} for x in inds) # TODO: understand why the line below is filtering the UnitRange only dim_ind = findall(x -> typeof(x) <: UnitRange{Int64}, inds) - return Field(F.dims[dim_ind], view(F.data, inds...), F.broadcast_dims) + return Field(F.dims[dim_ind], view(F.data, inds...), F.broadcast_dims, origin=Dict(d=>ind[1]-1 for (d,ind) in zip(F.dims, inds))) end # Connectivity struct ------------------------------------------------------------ @@ -561,7 +563,6 @@ function (fo::FieldOp)( out = nothing, kwargs... ) - is_outermost_fo = isnothing(OFFSET_PROVIDER) if is_outermost_fo @assert !isnothing(out) "Must provide an out field." diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 0b0ad16..66cb372 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -66,7 +66,7 @@ function get_size_ifelse(mask::FieldShape, branch::FieldShape) out_size = [branch.axes...] ind_mask = findall(x -> x in branch.dims, mask.dims) ind_out = findall(x -> x in mask.dims, branch.dims) - + # TODO: this is not correct if the mask has an origin out_size[ind_out] .= mask.axes[ind_mask] return FieldShape(branch.dims, Tuple(out_size), branch.broadcast_dims) diff --git a/test/gt2py_fo_exec.jl b/test/gt2py_fo_exec.jl index ec0014f..187eb23 100644 --- a/test/gt2py_fo_exec.jl +++ b/test/gt2py_fo_exec.jl @@ -564,6 +564,23 @@ function test_lap_lap(offset_provider::Dict{String, Dimension}, backend::String, # TODO: add in the future the test for the border values end +""" + test_slice() + +This test checks the `slice` function, which should correctly extract a subset of data from a larger field and properly adjust the origin to reflect the new sliced field's starting point. + +# Expected Behavior +- The sliced data should match the expected subset from the original field. +- The origin of the sliced field should be adjusted correctly to match the new starting index of the sliced data. 
+""" +function test_slice() + a::Field = Field((IDim,), [1; 2; 3; 4; 5]) + sliced_a = slice(a, 2:4) + @test sliced_a.data == [2; 3; 4] + @test sliced_a.origin == (2-1,) + @test sliced_a.dims == (IDim,) +end + # Test Executions -------------------------------------------------------------------------------------------- function test_gt4py_fo_exec() @@ -638,6 +655,8 @@ function test_gt4py_fo_exec() # testwrapper(setup_cartesian_offset_provider, test_lap_lap, "embedded", simple_cartesian_field) testwrapper(setup_cartesian_offset_provider, test_lap_lap, "py", simple_cartesian_field) + + testwrapper(nothing, test_slice) end @testset "Testset GT2Py fo exec" test_gt4py_fo_exec() From 30d92c1f1621817f45a6e1feea3c2b0676d7c95f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:11:32 +0200 Subject: [PATCH 28/53] Fix benchmarking size --- benchmark/benchmarks.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 5e61886..faa4468 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -5,7 +5,7 @@ using GridTools.ExampleMeshes.Unstructured using GridTools.ExampleMeshes.Cartesian # Data size -const global STREAM_SIZE = 100 +const global STREAM_SIZE = 10_000_000 # Utils ------------------------------------------------------------------------------------------------------ From 9928e143f20a15c0be8f26db49838fd9c1e1766e Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:22:03 +0200 Subject: [PATCH 29/53] Remove deprecated benchmark files --- benchmark/benchmarks_neighbour_sum.jl | 46 ----------------- benchmark/benchmarks_remapping.jl | 71 --------------------------- 2 files changed, 117 deletions(-) delete mode 100644 benchmark/benchmarks_neighbour_sum.jl delete mode 100644 benchmark/benchmarks_remapping.jl diff --git a/benchmark/benchmarks_neighbour_sum.jl b/benchmark/benchmarks_neighbour_sum.jl deleted file mode 100644 index a374f40..0000000 --- a/benchmark/benchmarks_neighbour_sum.jl +++ /dev/null @@ -1,46 +0,0 @@ - -using BenchmarkTools -using Statistics -using GridTools -using GridTools.ExampleMeshes.Unstructured - -const N = 1_000_000 -const DIM_SIZE = sqrt(N) |> floor |> Int - -function create_large_connectivity(size::Int) - edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) - cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) 
- - E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) - C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) - - Dict( - "E2C" => E2C, - "C2E" => C2E, - "E2CDim" => E2C # Using the same for simplicity # TODO: to be removed - ) -end - -offset_provider = create_large_connectivity(DIM_SIZE) - -a = Field(Cell, collect(1.0:N)) -out_field = GridTools.similar_field(a) - -@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} - return neighbor_sum(a(E2C), axis=E2CDim) -end - -# Benchmark the field operation -fo_benchmark = @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out_field) - -# Run the benchmark -results = run(fo_benchmark) - -# Memory bandwidth calculation -time_in_seconds = median(results.times) / 1e9 # convert ns to s -data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written -bandwidth = data_size / time_in_seconds / 1e9 # GB/s - -# Output results -println("Time taken: ", median(results.times) / 1e6, " ms") -println("Memory bandwidth: ", bandwidth, " GB/s") diff --git a/benchmark/benchmarks_remapping.jl b/benchmark/benchmarks_remapping.jl deleted file mode 100644 index 66470a6..0000000 --- a/benchmark/benchmarks_remapping.jl +++ /dev/null @@ -1,71 +0,0 @@ -using BenchmarkTools -using Statistics -using GridTools - -const N = 10_000_000 |> floor |> Int # Adjust as needed (10 millions is the SLURM test size) - -include("../test/mesh_definitions.jl") # Ensure all necessary mesh and dimension definitions are loaded - -# Unstructured Mesh ------------------------------------------------------------------------------------------ - -function create_large_connectivity(size::Int) - edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) - cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) 
- - E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) - C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) - - Dict( - "E2C" => E2C, - "C2E" => C2E, - "E2CDim" => E2C # TODO: remove it - ) -end - -offset_provider = create_large_connectivity(N) - -a = Field(Cell, collect(1.0:N)) -out_field = GridTools.similar_field(a) - -@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} - return a(E2C[1]) -end - -# Benchmark the field remapping operation -remapping_benchmark = @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out_field) - -# Run the benchmark -results = run(remapping_benchmark) - -# Memory bandwidth calculation -unstr_time_in_seconds = median(results.times) / 1e9 # convert ns to s -unstr_data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written -unstr_bandwidth = unstr_data_size / unstr_time_in_seconds / 1e9 # GB/s - -# Output results -println("Time taken: ", median(results.times) / 1e6, " ms") -println("Memory bandwidth for Unstructured Mesh Remapping: ", unstr_bandwidth, " GB/s") - -# Cartesian Mesh --------------------------------------------------------------------------------------------- - -# Cartesian Offset Field Operator -@field_operator function fo_cartesian_offset(inp::Field{Tuple{K_},Float64})::Field{Tuple{K_},Float64} - return inp(Koff[1]) -end - -# Create and benchmark the Cartesian offset operation -a = Field(K, collect(1.0:N)) -out_field = Field(K, zeros(Float64, N-1)) -cartesian_offset_provider = Dict("Koff" => K) - -cartesian_benchmark = @benchmarkable $fo_cartesian_offset($a, backend="embedded", out=$out_field, offset_provider=$cartesian_offset_provider) -cartesian_results = run(cartesian_benchmark) - -# Memory bandwidth calculation -cartesian_time_in_seconds = median(cartesian_results.times) / 1e9 # convert ns to s -cartesian_data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written -cartesian_bandwidth = cartesian_data_size / cartesian_time_in_seconds / 1e9 # GB/s - -# Output results -println("Time taken for Cartesian Mesh Offset: ", median(cartesian_results.times) / 1e6, " ms") -println("Memory bandwidth for Cartesian Mesh Offset: ", cartesian_bandwidth, " GB/s") From 177f8babbcae606614d96ff11fecb2af685a41d4 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:50:26 +0200 Subject: [PATCH 30/53] Fix advection benchmarks and place them in a separate script --- benchmark/benchmark_advection.jl | 104 +++++++++++++++++++++++++++++++ benchmark/benchmark_mpdata.jl | 94 ---------------------------- benchmark/benchmarks.jl | 63 ------------------- 3 files changed, 104 insertions(+), 157 deletions(-) create mode 100644 benchmark/benchmark_advection.jl delete mode 100644 benchmark/benchmark_mpdata.jl diff --git a/benchmark/benchmark_advection.jl b/benchmark/benchmark_advection.jl new file mode 100644 index 0000000..8432aea --- /dev/null +++ b/benchmark/benchmark_advection.jl @@ -0,0 +1,104 @@ +using BenchmarkTools +using Statistics +using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian + +include("../advection/advection_miniapp.jl") + +# Advection Benchmarks + +advection_suite = BenchmarkGroup() +advection_suite["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + 
mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + # embedded backend + ) + +# advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( +# state.rho, +# δt, +# mesh.vol, +# metric.gac, +# state.vel[1], +# state.vel[2], +# state.vel[3], +# mesh.pole_edge_mask, +# mesh.dual_face_orientation, +# mesh.dual_face_normal_weighted_x, +# mesh.dual_face_normal_weighted_y, +# out = state_next.rho, +# offset_provider = mesh.offset_provider, +# backend = "py" +# ) + +advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( + state.rho, + δt, + ϵ, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + +# TODO: disabled because the backend is not currently supporting it (the backend is too slow) +# advection_suite["advection"]["mpdata_program_python_backend"] = @benchmarkable mpdata_program( +# state.rho, +# δt, +# ϵ, +# mesh.vol, +# metric.gac, +# state.vel[1], +# state.vel[2], +# state.vel[3], +# mesh.pole_edge_mask, +# mesh.dual_face_orientation, +# mesh.dual_face_normal_weighted_x, +# mesh.dual_face_normal_weighted_y, +# out = state_next.rho, +# offset_provider = mesh.offset_provider, +# backend = "py" +# ) + +# Run the benchmark suite +println("Running the advection suite...") +advection_results = run(advection_suite) + +upwind_embedded_results = advection_results["advection"]["upwind_julia_embedded"] +# upwind_python_backend_results = results["advection"]["upwind_python_backend"] +mpdata_embedded_results = advection_results["advection"]["mpdata_program_julia_embedded"] +# mpdata_python_backend_results = results["advection"]["mpdata_program_python_backend"] + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +println("Upwind scheme julia (embedded):") +println("\tTime taken: $(ns_to_ms(median(upwind_embedded_results.times))) ms\n") + +# println("Upwind scheme julia (python backend):") +# println("\tTime taken: $(ns_to_ms(median(upwind_python_backend_results.times))) ms\n") + +println("Mpdata program julia (embedded):") +println("\tTime taken: $(ns_to_ms(median(mpdata_embedded_results.times))) ms\n") + +# println("Mpdata program julia (python backend):") +# println("\tTime taken: $(ns_to_ms(median(mpdata_python_backend_results.times))) ms\n") diff --git a/benchmark/benchmark_mpdata.jl b/benchmark/benchmark_mpdata.jl deleted file mode 100644 index 34a1e49..0000000 --- a/benchmark/benchmark_mpdata.jl +++ /dev/null @@ -1,94 +0,0 @@ -# benchmark_mpdata.jl - Benchmarking for atlas advection code - -using BenchmarkTools -using GridTools # Assuming all necessary functionality like Field, Dimension are defined here -using Statistics -using Printf - -Cell_ = Dimension{:Cell_, HORIZONTAL} -Edge_ = Dimension{:Edge_, HORIZONTAL} -Vertex_ = Dimension{:Vertex_, HORIZONTAL} -K_ = Dimension{:K_, VERTICAL} -V2VDim_ = Dimension{:V2V_, LOCAL} -V2EDim_ = Dimension{:V2E_, LOCAL} -E2VDim_ = Dimension{:E2V_, LOCAL} -Cell = Cell_() -K = K_() -Edge = Edge_() -Vertex = Vertex_() -V2VDim = V2VDim_() -V2EDim = V2EDim_() -E2VDim = E2VDim_() - -V2V = FieldOffset("V2V", source = Vertex, target = (Vertex, V2VDim)) -E2V = FieldOffset("E2V", source = Vertex, target = (Edge, E2VDim)) -V2E 
= FieldOffset("V2E", source = Edge, target = (Vertex, V2EDim)) -Koff = FieldOffset("Koff", source = K, target = K) - -include("../src/atlas/atlas_mesh.jl") -include("../src/atlas/state_container.jl") -include("../src/atlas/metric.jl") -include("../src/atlas/advection.jl") - -# Function to set up and run the benchmark -function benchmark_mpdata() - # Set up the environment or load data - grid = atlas.StructuredGrid("O50") - mesh = AtlasMesh(grid, num_level = 30) - - # Define dimensions based on the mesh properties - vertex_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[Vertex]) - k_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[K]) - edge_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[Edge]) - - # Set parameters - δt = 1800.0 # time step in s - eps = 1.0e-8 - niter = 50 # Adjust based on how long you want the benchmark to run - - # Initialize fields and metrics - state = sc_from_mesh(mesh) - state_next = sc_from_mesh(mesh) - tmp_fields = Dict{String, Field}() - for i = 1:6 - tmp_fields[@sprintf("tmp_vertex_%d", i)] = Field((Vertex, K), zeros(vertex_dim, k_dim)) - end - for j = 1:3 - tmp_fields[@sprintf("tmp_edge_%d", j)] = Field((Edge, K), zeros(edge_dim, k_dim)) - end - - # Benchmark the mpdata_program - println("Starting the benchmark for mpdata_program...") - bench_result = @benchmark begin - mpdata_program( - state.rho, - δt, - eps, - mesh.vol, - metric.gac, - state.vel[1], - state.vel[2], - state.vel[3], - mesh.pole_edge_mask, - mesh.dual_face_orientation, - mesh.dual_face_normal_weighted_x, - mesh.dual_face_normal_weighted_y, - tmp_fields["tmp_vertex_1"], - tmp_fields["tmp_vertex_2"], - tmp_fields["tmp_vertex_3"], - tmp_fields["tmp_vertex_4"], - tmp_fields["tmp_vertex_5"], - tmp_fields["tmp_vertex_6"], - tmp_fields["tmp_edge_1"], - tmp_fields["tmp_edge_2"], - tmp_fields["tmp_edge_3"] - ) - end - - # Output benchmark results - println("Benchmark completed.") - display(bench_result) -end - -# Run the benchmark function -benchmark_mpdata() diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index faa4468..a1c8136 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -383,66 +383,3 @@ println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") println("Field Operator Neighbor Sum:") println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") - -# Advection Benchmarks - -include("../advection/advection_miniapp.jl") - -avection_suite = BenchmarkGroup() - -println("Starting Advection Benchmark (julia embedded)") - -avection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmark $mpdata_program( - state.rho, - δt, - ϵ, - mesh.vol, - metric.gac, - state.vel[1], - state.vel[2], - state.vel[3], - mesh.pole_edge_mask, - mesh.dual_face_orientation, - mesh.dual_face_normal_weighted_x, - mesh.dual_face_normal_weighted_y, - out = state_next.rho, - offset_provider = mesh.offset_provider - ) - -println("Finished Advection Benchmark (julia embedded)") - -# TODO: disabled because the backend is not currently supporting it (the backend is too slow) -# println("Starting Advection Benchmark (julia-python)") - -# advection_suite["advection"]["mpdata_program_julia_pyback"] = @benchmark mpdata_program( -# state.rho, -# δt, -# ϵ, -# mesh.vol, -# metric.gac, -# state.vel[1], -# state.vel[2], -# state.vel[3], -# mesh.pole_edge_mask, -# mesh.dual_face_orientation, -# mesh.dual_face_normal_weighted_x, -# mesh.dual_face_normal_weighted_y, -# out = state_next.rho, -# offset_provider = mesh.offset_provider, -# backend = "py" -# ) - -# 
println("Finished Advection Benchmark (julia-python)") - -# Run the benchmark suite -println("Running the advection suite...") -# advection_results = run(avection_suite) - -# mpdata_emb_results = advection_results["advection"]["mpdata_program_julia_embedded"] -# mpdata_pyback_results = results["advection"]["mpdata_program_julia_pyback"] - -# println("mpdata_program julia embedded version:") -# println("\tTime taken: $(ns_to_ms(median(mpdata_emb_results.times))) ms\n") - -# println("mpdata_program julia with python backend:") -# println("\tTime taken: $(ns_to_ms(median(mpdata_pyback_results.times))) ms\n") From 391dc0a269a5254d7e8fc47e61665bf7b44ae082 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:53:21 +0200 Subject: [PATCH 31/53] Fix K dimension in advection meshes --- ...k_advection.jl => benchmarks_advection.jl} | 40 +++++++++---------- src/ExampleMeshes.jl | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) rename benchmark/{benchmark_advection.jl => benchmarks_advection.jl} (76%) diff --git a/benchmark/benchmark_advection.jl b/benchmark/benchmarks_advection.jl similarity index 76% rename from benchmark/benchmark_advection.jl rename to benchmark/benchmarks_advection.jl index 8432aea..f6411b1 100644 --- a/benchmark/benchmark_advection.jl +++ b/benchmark/benchmarks_advection.jl @@ -26,22 +26,22 @@ advection_suite["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_sc # embedded backend ) -# advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( -# state.rho, -# δt, -# mesh.vol, -# metric.gac, -# state.vel[1], -# state.vel[2], -# state.vel[3], -# mesh.pole_edge_mask, -# mesh.dual_face_orientation, -# mesh.dual_face_normal_weighted_x, -# mesh.dual_face_normal_weighted_y, -# out = state_next.rho, -# offset_provider = mesh.offset_provider, -# backend = "py" -# ) +advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider, + backend = "py" + ) advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( state.rho, @@ -84,9 +84,9 @@ println("Running the advection suite...") advection_results = run(advection_suite) upwind_embedded_results = advection_results["advection"]["upwind_julia_embedded"] -# upwind_python_backend_results = results["advection"]["upwind_python_backend"] +upwind_python_backend_results = advection_results["advection"]["upwind_python_backend"] mpdata_embedded_results = advection_results["advection"]["mpdata_program_julia_embedded"] -# mpdata_python_backend_results = results["advection"]["mpdata_program_python_backend"] +# mpdata_python_backend_results = advection_results["advection"]["mpdata_program_python_backend"] # Function to convert nanoseconds to milliseconds for clearer output ns_to_ms(time_ns) = time_ns / 1e6 @@ -94,8 +94,8 @@ ns_to_ms(time_ns) = time_ns / 1e6 println("Upwind scheme julia (embedded):") println("\tTime taken: $(ns_to_ms(median(upwind_embedded_results.times))) ms\n") -# println("Upwind scheme julia (python backend):") -# println("\tTime taken: $(ns_to_ms(median(upwind_python_backend_results.times))) ms\n") +println("Upwind scheme julia (python backend):") +println("\tTime taken: 
$(ns_to_ms(median(upwind_python_backend_results.times))) ms\n") println("Mpdata program julia (embedded):") println("\tTime taken: $(ns_to_ms(median(mpdata_embedded_results.times))) ms\n") diff --git a/src/ExampleMeshes.jl b/src/ExampleMeshes.jl index 96612cf..6d5d237 100644 --- a/src/ExampleMeshes.jl +++ b/src/ExampleMeshes.jl @@ -11,7 +11,7 @@ export Cell, K, Edge, Vertex, V2VDim, V2EDim, E2VDim, E2CDim, C2EDim export V2V, E2V, V2E, E2C, C2E, Koff const global Cell_ = Dimension{:Cell_, HORIZONTAL} -const global K_ = Dimension{:K_, HORIZONTAL} +const global K_ = Dimension{:K_, VERTICAL} const global Edge_ = Dimension{:Edge_, HORIZONTAL} const global Vertex_ = Dimension{:Vertex_, HORIZONTAL} const global V2VDim_ = Dimension{:V2VDim_, LOCAL} From 904866621b486fb1e3c77fa25868f49e139f65f5 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:54:49 +0200 Subject: [PATCH 32/53] Add multi-threads optimization on broadcasting operation --- src/embedded/cust_broadcast.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 66cb372..ff0aa9f 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -1,3 +1,6 @@ + +using Base.Threads: @threads + Base.BroadcastStyle(::Type{<:Field}) = Broadcast.ArrayStyle{Field}() # TODO(tehrengruber): Implement a range with an attached dimension instead of this single object @@ -257,7 +260,7 @@ end # Performance may vary depending on whether `@inbounds` is placed outside the # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @simd for I in eachindex(dest) + @inbounds @threads for I in eachindex(dest) dest[I] = bc′[I] end return dest From e2ce6012037404cf02859e5985c575c71b8cf5f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:13:05 +0200 Subject: [PATCH 33/53] Change benchmark SUITE for compatibility with AirSpeedVelocity --- benchmark/benchmarks.jl | 30 +++++++++++++++--------------- benchmark/benchmarks_advection.jl | 12 ++++++------ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index a1c8136..87c404f 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -272,55 +272,55 @@ end # Benchmarks ------------------------------------------------------------------------------------------------- -# Create the benchmark suite -suite = BenchmarkGroup() +# Create the benchmark SUITE +SUITE = BenchmarkGroup() # Define the main groups -suite["addition"] = BenchmarkGroup() +SUITE["addition"] = BenchmarkGroup() # Julia broadcast addition benchmark a, b, data_size = array_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["array_broadcast_addition"] = @benchmarkable broadcast_addition_array(a, b) setup=((a, b, data_size) = $array_broadcast_addition_setup($STREAM_SIZE); ) #a=$a; b=$b) +SUITE["addition"]["array_broadcast_addition"] = @benchmarkable broadcast_addition_array(a, b) setup=((a, b, data_size) = $array_broadcast_addition_setup($STREAM_SIZE); ) #a=$a; b=$b) # Field broadcast addition benchmark a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["fields_broadcast_addition"] = @benchmarkable broadcast_addition_fields($a, $b) +SUITE["addition"]["fields_broadcast_addition"] = @benchmarkable broadcast_addition_fields($a, $b) # Field Operator broadcast addition benchmark a, b, out = 
fields_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) +SUITE["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) # Sine without field operator benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["sin"] = @benchmarkable sin_without_fo($a) +SUITE["trigonometry"]["sin"] = @benchmarkable sin_without_fo($a) # Field operator sine benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embedded", out=$out) +SUITE["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embedded", out=$out) # Cosine without field operator benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["cos"] = @benchmarkable cos_without_fo($a) +SUITE["trigonometry"]["cos"] = @benchmarkable cos_without_fo($a) # Field operator cosine benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) +SUITE["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) # Benchmark the field remapping operation offset_provider = create_large_connectivity(STREAM_SIZE) a, out = single_field_setup(STREAM_SIZE) -suite["remapping"]["field_operator"] = +SUITE["remapping"]["field_operator"] = @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out) # Benchmark the field neighbor sum operation offset_provider = create_large_connectivity(STREAM_SIZE) a, out = single_field_setup(STREAM_SIZE) -suite["neighbor_sum"]["field_operator"] = +SUITE["neighbor_sum"]["field_operator"] = @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out) -# Run the benchmark suite -println("Running the benchmark suite...") -results = run(suite) +# Run the benchmark SUITE +println("Running the benchmark SUITE...") +results = run(SUITE) # Process the results array_results = results["addition"]["array_broadcast_addition"] diff --git a/benchmark/benchmarks_advection.jl b/benchmark/benchmarks_advection.jl index f6411b1..c40d809 100644 --- a/benchmark/benchmarks_advection.jl +++ b/benchmark/benchmarks_advection.jl @@ -8,8 +8,8 @@ include("../advection/advection_miniapp.jl") # Advection Benchmarks -advection_suite = BenchmarkGroup() -advection_suite["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_scheme( +SUITE = BenchmarkGroup() +SUITE["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_scheme( state.rho, δt, mesh.vol, @@ -26,7 +26,7 @@ advection_suite["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_sc # embedded backend ) -advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( +SUITE["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( state.rho, δt, mesh.vol, @@ -43,7 +43,7 @@ advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_sc backend = "py" ) -advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( +SUITE["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( state.rho, δt, ϵ, @@ -61,7 +61,7 @@ advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable m ) # TODO: disabled because the backend is not currently supporting it (the backend is too slow) -# 
advection_suite["advection"]["mpdata_program_python_backend"] = @benchmarkable mpdata_program( +# SUITE["advection"]["mpdata_program_python_backend"] = @benchmarkable mpdata_program( # state.rho, # δt, # ϵ, @@ -81,7 +81,7 @@ advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable m # Run the benchmark suite println("Running the advection suite...") -advection_results = run(advection_suite) +advection_results = run(SUITE) upwind_embedded_results = advection_results["advection"]["upwind_julia_embedded"] upwind_python_backend_results = advection_results["advection"]["upwind_python_backend"] From 45cf97a27f7680032a7db4955c0724f856c9346d Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:13:28 +0200 Subject: [PATCH 34/53] Add multi-threads optimization --- src/embedded/cust_broadcast.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 66cb372..ff0aa9f 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -1,3 +1,6 @@ + +using Base.Threads: @threads + Base.BroadcastStyle(::Type{<:Field}) = Broadcast.ArrayStyle{Field}() # TODO(tehrengruber): Implement a range with an attached dimension instead of this single object @@ -257,7 +260,7 @@ end # Performance may vary depending on whether `@inbounds` is placed outside the # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @simd for I in eachindex(dest) + @inbounds @threads for I in eachindex(dest) dest[I] = bc′[I] end return dest From 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:24:17 +0200 Subject: [PATCH 35/53] Restoring SIMD loop in broadcast --- src/embedded/cust_broadcast.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index ff0aa9f..e76c3fa 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -260,7 +260,7 @@ end # Performance may vary depending on whether `@inbounds` is placed outside the # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @threads for I in eachindex(dest) + @inbounds @simd for I in eachindex(dest) dest[I] = bc′[I] end return dest From be385b7b32164868b81dda520dc62b17bcc9c341 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:04:33 +0200 Subject: [PATCH 36/53] Add benchmark readme on how to run benchmarks on separate revisions --- benchmark/README.md | 110 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 9ae7e7a..e898b2a 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -51,6 +51,112 @@ benchpkg --filter=time_to_load The `benchpkg` was updated in June 2024 to automate the benchmark without specifying the parameters. To specify additional condition in `benchpkg` and to work with `benchpkgplot` consult the help command (`--h`). -## Creating New Benchmarks +Here’s an improved and completed version of your README section, with the necessary definitions, examples, and explanations: -TODO: Instructions for adding new benchmarks to the suite. 
+--- + +## Comparing Two or More Different Revisions (States) + +To compare two or more different states of your codebase, you can use revisions. In this context, a **revision** refers to a specific state of the repository, which can be identified by a commit hash or a tag. + +### (Reminder) What is a Revision? + +A **revision** in Git is an identifier that refers to a specific state of the repository at a particular point in time. Revisions can be specified using: +- **Commit Hashes**: A unique SHA-1 identifier for each commit, e.g., `8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd`. +- **Tags**: Human-readable names assigned to specific commits, often used to mark release points (e.g., `v1.0.0`). + +### How to Add a Tag + +You can create a tag in Git by using the following command: + +```bash +git tag -a -m "Tag message" +``` + +For example, to tag the current commit with `v1.0.0`, you would run: + +```bash +git tag -a v1.0.0 -m "Release version 1.0.0" +``` + +To push the tag to the remote repository, use: + +```bash +git push origin +``` + +For example: + +```bash +git push origin v1.0.0 +``` + +To see information about all tags, such as the commit they point to and the tag messages, use: + +```bash +git show-ref --tags && git tag -n | while IFS= read -r line; do echo "$line"; done +``` + +### Example: Using Commit Hashes to Compare Revisions + +Here is an example of how to use commit hashes to compare different revisions: + +```bash +benchpkg --rev=8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd,6fb48706f988613860c6c98beef32c32e900737b \ + --bench-on=8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd --exeflags="--threads=8" +``` + +In this example, `benchpkg` compares the two specified revisions, with the first hash being the baseline for comparison. + +### Example: Using Tags to Compare Revisions + +Here’s how you can use tags instead of commit hashes: + +1. **Create Tags**: + Suppose you want to tag the two commits: + + ```bash + git tag -a v1.0.0 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd -m "Tagging v1.0.0" + git tag -a v1.1.0 6fb48706f988613860c6c98beef32c32e900737b -m "Tagging v1.1.0" + ``` + +2. **Use Tags in `benchpkg`**: + Once the tags are set, you can use them in the comparison: + + ```bash + benchpkg --rev=v1.0.0,v1.1.0 --bench-on=v1.0.0 --exeflags="--threads=8" + ``` + +### How to Remove a Tag + +If you need to remove a tag from your repository, you can do so with the following commands: + +1. **Delete the tag locally**: + + ```bash + git tag -d + ``` + + For example: + + ```bash + git tag -d v1.0.0 + ``` + +2. **Delete the tag from the remote repository**: + + ```bash + git push origin --delete + ``` + + For example: + + ```bash + git push origin --delete v1.0.0 + ``` + +## Developer Notes + +1. The `benchpkg` tool compares different revisions, allowing you to specify the commits or tags you wish to compare. It is crucial to ensure that both commits include all necessary dependencies; otherwise, the dependencies might not be resolved. + +2. **AirSpeedVelocity**: Note that AirSpeedVelocity requires the benchmarking suite to be named `SUITE`. Any other names will not be recognized, which could lead to errors in your benchmarking process. 
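
For reference, a minimal `benchmark/benchmarks.jl` skeleton that `benchpkg`/AirspeedVelocity can pick up looks like the sketch below. This is only an illustration of the `SUITE` naming requirement: the field size and the `addition`/`fields_broadcast_addition` labels mirror the real suite in this repository, but the body is a reduced sketch, not the actual benchmark file, and it assumes GridTools and its example meshes are available on the load path.

```julia
using BenchmarkTools
using GridTools
using GridTools.ExampleMeshes.Unstructured

# AirspeedVelocity only discovers a top-level BenchmarkGroup bound to the name SUITE.
SUITE = BenchmarkGroup()
SUITE["addition"] = BenchmarkGroup()

# Illustrative data size; the real suite uses STREAM_SIZE.
const N = 1_000_000
a = Field(Cell, rand(Float64, N))
b = Field(Cell, rand(Float64, N))

# Interpolate the fields with $ so setup cost is not measured as part of the benchmark.
SUITE["addition"]["fields_broadcast_addition"] = @benchmarkable $a .+ $b
```

`benchpkg` then loads this file for each revision passed via `--rev` and runs the `SUITE` it finds; as noted above, any other binding name will not be recognized.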
From b9ebf8e32f4ba159be68d0485156ca9fc4d93187 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:21:23 +0200 Subject: [PATCH 37/53] Create an AtlasMeshes module and resolve issues with atlas4py import --- advection/advection_miniapp.jl | 8 ++++---- benchmark/benchmarks_advection.jl | 2 -- src/GridTools.jl | 1 + src/atlas/{atlas_mesh.jl => AtlasMeshes.jl} | 14 +++++++++++++- 4 files changed, 18 insertions(+), 7 deletions(-) rename src/atlas/{atlas_mesh.jl => AtlasMeshes.jl} (97%) diff --git a/advection/advection_miniapp.jl b/advection/advection_miniapp.jl index cdc72e0..dae9794 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_miniapp.jl @@ -7,17 +7,17 @@ using Statistics using Profile using GridTools using GridTools.ExampleMeshes.Unstructured - -const global VISUALIZATION_FLAG::Bool=false -const global VERBOSE_FLAG::Bool=false +using GridTools.AtlasMeshes # Include additional necessary files for mesh, state container, metric calculations, and advection operations -include("../src/atlas/atlas_mesh.jl") include("state_container.jl") include("metric.jl") include("advection.jl") include("visualization_utils.jl") +const global VISUALIZATION_FLAG::Bool=false +const global VERBOSE_FLAG::Bool=true + # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation grid = atlas.StructuredGrid("O10") diff --git a/benchmark/benchmarks_advection.jl b/benchmark/benchmarks_advection.jl index c40d809..638d744 100644 --- a/benchmark/benchmarks_advection.jl +++ b/benchmark/benchmarks_advection.jl @@ -1,8 +1,6 @@ using BenchmarkTools using Statistics using GridTools -using GridTools.ExampleMeshes.Unstructured -using GridTools.ExampleMeshes.Cartesian include("../advection/advection_miniapp.jl") diff --git a/src/GridTools.jl b/src/GridTools.jl index 083cbd7..70873e3 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -758,5 +758,6 @@ end generate_unique_name(name::Symbol, value::Integer = 0) = Symbol("$(name)ᐞ$(value)") include("ExampleMeshes.jl") +include("atlas/AtlasMeshes.jl") end diff --git a/src/atlas/atlas_mesh.jl b/src/atlas/AtlasMeshes.jl similarity index 97% rename from src/atlas/atlas_mesh.jl rename to src/atlas/AtlasMeshes.jl index d8f947b..dbce49d 100644 --- a/src/atlas/atlas_mesh.jl +++ b/src/atlas/AtlasMeshes.jl @@ -1,9 +1,19 @@ # ENV["PYCALL_JL_RUNTIME_PYTHON"] = Sys.which("python3.10") # ENV["PYTHONBREAKPOINT"] = "pdb.set_trace" +module AtlasMeshes + +using GridTools +using GridTools.ExampleMeshes.Unstructured using PyCall -atlas = pyimport("atlas4py") +export AtlasMesh, atlas, update_periodic_layers, DIMENSION_TO_SIZE_ATTR + +const atlas = PyNULL() + +function __init__() + copy!(atlas, pyimport("atlas4py")) +end const rpi = 2.0 * asin(1.0) const _deg2rad = 2.0 * rpi / 360.0 @@ -361,3 +371,5 @@ function update_periodic_layers(mesh::AtlasMesh, field::Field) ) field[periodic_indices, :] .= field[remote_indices[periodic_indices], :] end + +end # AtlasMeshes module \ No newline at end of file From 085877d39e0e942ef6ff899a23c87cac896e4a5a Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:22:04 +0200 Subject: [PATCH 38/53] Fix embedded test with the new K dimension definition in example meshes --- test/embedded_test.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/embedded_test.jl b/test/embedded_test.jl 
index 770cfed..b61db94 100644 --- a/test/embedded_test.jl +++ b/test/embedded_test.jl @@ -135,8 +135,8 @@ end # Broadcast ------------------------- - @test typeof(broadcast(cell_values, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, HORIZONTAL}}, Float64, 1, Tuple{Dimension{:Cell_, HORIZONTAL}}, Vector{Float64}} - @test typeof(broadcast(5.0, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, HORIZONTAL}}, Float64, 0, Tuple{}, Array{Float64, 0}} + @test typeof(broadcast(cell_values, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, VERTICAL}}, Float64, 1, Tuple{Dimension{:Cell_, HORIZONTAL}}, Vector{Float64}} + @test typeof(broadcast(5.0, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, VERTICAL}}, Float64, 0, Tuple{}, Array{Float64, 0}} # Where ----------------------------------------- From 89572e10205090bd9c1f2c17ddbb343090983199 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:38:28 +0200 Subject: [PATCH 39/53] Separate the simulation loop from the Advection Setup of the miniapp --- advection/README.md | 20 +++--- ...dvection_miniapp.jl => advection_setup.jl} | 64 +------------------ advection/run_simulation_loop.jl | 62 ++++++++++++++++++ benchmark/benchmarks_advection.jl | 2 +- notes/Benchmarks.jl | 4 +- 5 files changed, 77 insertions(+), 75 deletions(-) rename advection/{advection_miniapp.jl => advection_setup.jl} (74%) create mode 100644 advection/run_simulation_loop.jl diff --git a/advection/README.md b/advection/README.md index b838658..cfae700 100644 --- a/advection/README.md +++ b/advection/README.md @@ -1,6 +1,6 @@ -### README for Running `advection_miniapp.jl` +### README for Running `advection_setup.jl` using `run_simulation_loop.jl` -This README provides instructions on how to run the `advection_miniapp.jl` script for simulating advection using the Atlas library. The script allows for terminal visualization, which can be enabled as described below. +This README provides instructions on how to run the `run_simulation_loop.jl` script for simulating advection using the Atlas library. The script allows for terminal visualization, which can be enabled as described below. #### Prerequisites @@ -15,23 +15,23 @@ This README provides instructions on how to run the `advection_miniapp.jl` scrip ``` 2. **Enabling Visualization** (optional): - - The script has a `VISUALIZATION_FLAG` that can be set to enable or disable visualization on the terminal. Ensure that this flag is set to `true` in the `advection_miniapp.jl` script if you wish to enable visualization. - - Note: Other parameters such as the number of iterations can be changed in the `# Simulation Parameters` section of the script. + - The script has a `VISUALIZATION_FLAG` that can be set to enable or disable visualization on the terminal. Ensure that this flag is set to `true` in the `run_simulation_loop.jl` script if you wish to enable visualization. + - Note: Other parameters such as the number of iterations can be changed in the `# Simulation Parameters` section of the `advection_setup.jl` script. #### Running the Simulation 1. 
**Running the Script**: - - Use the following command to run the `advection_miniapp.jl` script with Julia: + - Use the following command to run the `run_simulation_loop.jl` script with Julia: ```sh - julia --color=yes --project=$GRIDTOOLS_JL_PATH/GridTools.jl $GRIDTOOLS_JL_PATH/GridTools.jl/src/examples/advection/advection_miniapp.jl + julia --color=yes --project=$GRIDTOOLS_JL_PATH/GridTools.jl $GRIDTOOLS_JL_PATH/GridTools.jl/src/examples/advection/run_simulation_loop.jl ``` #### Example -Here is an example of how to set the `VISUALIZATION_FLAG` in the `advection_miniapp.jl` script and run the simulation: +Here is an example of how to set the `VISUALIZATION_FLAG` in the `run_simulation_loop.jl` script and run the simulation: 1. **Setting the Visualization Flag**: - - Open the `advection_miniapp.jl` script. + - Open the `run_simulation_loop.jl` script. - Set the `VISUALIZATION_FLAG` to `true`: ```julia const VISUALIZATION_FLAG = true @@ -42,7 +42,7 @@ Here is an example of how to set the `VISUALIZATION_FLAG` in the `advection_mini - Run the script with the following command: ```sh export GRIDTOOLS_JL_PATH=... - julia --color=yes --project=. $GRIDTOOLS_JL_PATH/src/examples/advection/advection_miniapp.jl + julia --color=yes --project=. $GRIDTOOLS_JL_PATH/src/examples/advection/run_simulation_loop.jl ``` -By following these steps, you should be able to run the `advection_miniapp.jl` script and visualize the advection simulation results on your terminal. +By following these steps, you should be able to run the `run_simulation_loop.jl` script and visualize the advection simulation results on your terminal. diff --git a/advection/advection_miniapp.jl b/advection/advection_setup.jl similarity index 74% rename from advection/advection_miniapp.jl rename to advection/advection_setup.jl index dae9794..89ffd38 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_setup.jl @@ -1,10 +1,8 @@ -# Advection Miniapp -# This script demonstrates an advection simulation using the Atlas library. +# Advection Setup +# This script demonstrates the setup of an advection simulation using the Atlas library. 
using Printf -using Debugger using Statistics -using Profile using GridTools using GridTools.ExampleMeshes.Unstructured using GridTools.AtlasMeshes @@ -13,10 +11,6 @@ using GridTools.AtlasMeshes include("state_container.jl") include("metric.jl") include("advection.jl") -include("visualization_utils.jl") - -const global VISUALIZATION_FLAG::Bool=false -const global VERBOSE_FLAG::Bool=true # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation @@ -165,57 +159,3 @@ nabla_z( out = tmp_fields["tmp_vertex_2"], offset_provider = mesh.offset_provider ) - -if VISUALIZATION_FLAG - # Precompute the mapping between the unstructured domain to the structured one for ASCII art visualization - grid_size = 50 - mapping = precompute_mapping(mesh, xlim, ylim, grid_size) -end - -# Main Simulation Loop ---------------------------------------------------------------------------------------- -for i = 1:niter - # Perform the upwind advection scheme to update the scalar field (rho) - upwind_scheme( - state.rho, - δt, - mesh.vol, - metric.gac, - state.vel[1], - state.vel[2], - state.vel[3], - mesh.pole_edge_mask, - mesh.dual_face_orientation, - mesh.dual_face_normal_weighted_x, - mesh.dual_face_normal_weighted_y, - out = state_next.rho, - offset_provider = mesh.offset_provider - ) - - # Print the current timestep - if VERBOSE_FLAG - println("Timestep $i") - end - - if VISUALIZATION_FLAG - # Print the current state as ASCII art every 5 timesteps - print_state_ascii(state, mesh, mapping, i, grid_size) - end - - # TODO: make a function out of this switch - # Swap the current and next state - temp = state - global state = state_next - global state_next = temp - - # Update the periodic boundary layers - update_periodic_layers(mesh, state.rho) -end - -if VERBOSE_FLAG - # Output the final statistics for the scalar field (rho) and velocity fields - println( - "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" - ) - println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") - println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") -end diff --git a/advection/run_simulation_loop.jl b/advection/run_simulation_loop.jl new file mode 100644 index 0000000..2c034a6 --- /dev/null +++ b/advection/run_simulation_loop.jl @@ -0,0 +1,62 @@ +# Run Advection Miniapp Simulation +# This script demonstrates an advection simulation using the Atlas library. 
+ +include("visualization_utils.jl") +include("advection_setup.jl") + +const global VISUALIZATION_FLAG::Bool=false +const global VERBOSE_FLAG::Bool=true + +if VISUALIZATION_FLAG + # Precompute the mapping between the unstructured domain to the structured one for ASCII art visualization + grid_size = 50 + mapping = precompute_mapping(mesh, xlim, ylim, grid_size) +end + +# Main Simulation Loop ---------------------------------------------------------------------------------------- +for i = 1:niter + # Perform the upwind advection scheme to update the scalar field (rho) + upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + + # Print the current timestep + if VERBOSE_FLAG + println("Timestep $i") + end + + if VISUALIZATION_FLAG + # Print the current state as ASCII art every 5 timesteps + print_state_ascii(state, mesh, mapping, i, grid_size) + end + + # TODO: make a function out of this switch + # Swap the current and next state + temp = state + global state = state_next + global state_next = temp + + # Update the periodic boundary layers + update_periodic_layers(mesh, state.rho) +end + +if VERBOSE_FLAG + # Output the final statistics for the scalar field (rho) and velocity fields + println( + "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" + ) + println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") + println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") +end diff --git a/benchmark/benchmarks_advection.jl b/benchmark/benchmarks_advection.jl index 638d744..d0e5da3 100644 --- a/benchmark/benchmarks_advection.jl +++ b/benchmark/benchmarks_advection.jl @@ -2,7 +2,7 @@ using BenchmarkTools using Statistics using GridTools -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") # Advection Benchmarks diff --git a/notes/Benchmarks.jl b/notes/Benchmarks.jl index 5d390ec..b271d89 100644 --- a/notes/Benchmarks.jl +++ b/notes/Benchmarks.jl @@ -59,7 +59,7 @@ using Profile # Benchmark for Julia and Python implementations of advection ############################################################################################################## -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") println("Starting julia embedded benchmark") @@ -81,7 +81,7 @@ bench_julia_embedded = @benchmark upwind_scheme( println("Finished Julia embedded benchmark") -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") println("Starting julia python benchmark") From 4d71e0b73103f4ad73cc8b4433ab6fddb90a0256 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:47:15 +0200 Subject: [PATCH 40/53] Small changes in benchmark documentation --- benchmark/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index e898b2a..35fcc3b 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -76,7 +76,7 @@ git tag -a -m "Tag message" For example, to tag the current commit with `v1.0.0`, you would run: ```bash -git tag -a v1.0.0 -m "Release version 1.0.0" +git tag -a v1.0.0 -m "Improvement using @threads instead of @simd in broadcasting" 
``` To push the tag to the remote repository, use: @@ -116,8 +116,8 @@ Here’s how you can use tags instead of commit hashes: Suppose you want to tag the two commits: ```bash - git tag -a v1.0.0 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd -m "Tagging v1.0.0" - git tag -a v1.1.0 6fb48706f988613860c6c98beef32c32e900737b -m "Tagging v1.1.0" + git tag -a v1.0.0 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd -m "Tag message for v1.0.0" + git tag -a v1.1.0 6fb48706f988613860c6c98beef32c32e900737b -m "Tag message for v1.1.0" ``` 2. **Use Tags in `benchpkg`**: From 26fe90031d4d675c2b8e21c57da385b7a68145a6 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:46:41 +0200 Subject: [PATCH 41/53] Fix the names retrieval of the modules automatically generated by AirSpeedVelocity when running the advection benchmark --- src/GridTools.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GridTools.jl b/src/GridTools.jl index 70873e3..8b7edb0 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -706,7 +706,7 @@ macro module_vars() name => Core.eval(Base, name) for name in [:Int64, :Int32, :Float32, :Float64] ) - all_names = names(@__MODULE__) + all_names = names(@__MODULE__, all=true) used_modules = ccall(:jl_module_usings, Any, (Any,), @__MODULE__) for m in used_modules append!(all_names, names(m)) From 7cac41c364e1f0de5d35e19bcc5260e68e591869 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:58:44 +0200 Subject: [PATCH 42/53] Ignore plot files by AirSpeedVelocity --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index ca7a8dc..dee06b9 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,5 @@ env_setup.sh # Ignore benchmark (benchpkg) results results_GridTools@* +plot_*.png +plot_*.pdf From 6cb5585827ad97ba967e7f66570f979d964e343d Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:30:45 +0200 Subject: [PATCH 43/53] Add Polyester to the dependencies --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index ff7f05d..2ab63f0 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" From 96f416f9547978bb8a4f642a604c0c9e42a2f81f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:40:51 +0200 Subject: [PATCH 44/53] Increase the size of the Atlas Mesh for benchmarking purposes --- advection/advection_setup.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/advection/advection_setup.jl b/advection/advection_setup.jl index 89ffd38..3153416 100644 --- a/advection/advection_setup.jl +++ b/advection/advection_setup.jl @@ -14,7 +14,7 @@ include("advection.jl") # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation -grid = atlas.StructuredGrid("O10") +grid = atlas.StructuredGrid("O90") mesh = AtlasMesh(grid, 
num_level = 30) # Simulation Parameters --------------------------------------------------------------------------------------- From 426f9369bc58be2a79eded80cf93a205d41b581f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:07:05 +0200 Subject: [PATCH 45/53] Add script to automate the benchmark comparison between the last two commits --- benchmark/autorun_benchmarks.sh | 79 +++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100755 benchmark/autorun_benchmarks.sh diff --git a/benchmark/autorun_benchmarks.sh b/benchmark/autorun_benchmarks.sh new file mode 100755 index 0000000..73ac906 --- /dev/null +++ b/benchmark/autorun_benchmarks.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# This script automates the process of benchmarking recent changes by tagging +# the last two commits and running benchmarks using the AirspeedVelocity package. +# It supports conditional execution based on user input to include specific benchmarks +# for advection and allows dynamic configuration of execution threads. +# +# Usage: +# ./autorun_benchmarks.sh [--advection] [--threads=NUM] +# --advection: Optional. If specified, runs advection-specific benchmarks. +# --threads=NUM: Optional. Specifies the number of threads to use. Default is 8. + +# Default number of threads +threads=8 + +# Function to display usage +usage() { + echo "Usage: $0 [--advection] [--threads=NUM]" + echo " --advection: Run the advection comparison with specific benchmark script." + echo " --threads=NUM: Specify the number of threads (default is 8)." + exit 1 +} + +# Parse command-line arguments +for arg in "$@" +do + case $arg in + --advection) + advection=true + shift # Remove --advection from processing + ;; + --threads=*) + threads="${arg#*=}" + shift # Remove --threads=NUM from processing + ;; + *) + # Unknown option + usage + ;; + esac +done + +# Check if the tags already exist and delete them if they do +if git rev-parse -q --verify "refs/tags/after_debug" >/dev/null; then + git tag -d after_debug +fi + +if git rev-parse -q --verify "refs/tags/before_debug" >/dev/null; then + git tag -d before_debug +fi + +# Tag the last commit as 'after_debug' +git tag after_debug HEAD +echo "Tagged the latest commit as 'after_debug'" + +# Tag the second last commit as 'before_debug' +git tag before_debug HEAD~1 +echo -e "Tagged the previous commit as 'before_debug'\n" + +# Print the before and after tags with their messages +git tag -n | grep -E 'before_debug|after_debug' | while IFS= read -r line; do echo "$line"; done ; echo "" + +# Conditional command based on the --advection flag +if [ "$advection" == true ]; then + # Set the benchmark script for advection + benchmark_script="benchmark/benchmarks_advection.jl" + command="benchpkg --rev=before_debug,after_debug \ + -s $benchmark_script \ + --bench-on=before_debug \ + --exeflags=\"--threads=$threads\"" +else + command="benchpkg --rev=before_debug,after_debug \ + --bench-on=before_debug \ + --exeflags=\"--threads=$threads\"" +fi + +# Print and execute the command +echo "Executing command: $command" +eval $command From c8a08bb9f137a617f8735afcb03efc24e4872f1f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:35:50 +0200 Subject: [PATCH 46/53] Add utilis for benchmark/profiling in the interactive REPL --- .../utils/setup_benchmark_interactive.jl | 299 ++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 
benchmark/utils/setup_benchmark_interactive.jl diff --git a/benchmark/utils/setup_benchmark_interactive.jl b/benchmark/utils/setup_benchmark_interactive.jl new file mode 100644 index 0000000..7e2aad1 --- /dev/null +++ b/benchmark/utils/setup_benchmark_interactive.jl @@ -0,0 +1,299 @@ +# setup_benchmark_interactive.jl + +# This script is intended for interactive usage during development and benchmarking sessions. +# It sets up a Julia environment with necessary packages and predefined functions for running various benchmarks. +# This allows developers to interactively profile and debug performance issues in real-time. +# +# Usage Example: +# Start Julia with the appropriate project settings and thread configuration: +# $ julia --project=. --threads 8 +# +# Inside the Julia REPL, load the benchmark setup: +# julia> include("setup_benchmark_interactive.jl") +# This will load all necessary modules and display the current thread usage. +# +# To run and profile a specific operation, use: +# julia> a, out = single_field_setup(STREAM_SIZE) +# julia> @profile fo_sin(a, backend="embedded", out=out) +# This will profile the `fo_sin` operation and print profiling results. + +include("../../advection/advection_setup.jl") + +using BenchmarkTools +using Statistics +using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian +using Profile +using Base.Threads + +# Data size +const global STREAM_SIZE = 10_000_000 + +# Utils ------------------------------------------------------------------------------------------------------ + +# Useful for the benchmark of the field remapping operation +function create_large_connectivity(size::Int) + edge_to_cell_table = vcat([rand(1:size, (1, 2)) for _ in 1:size]...) + cell_to_edge_table = vcat([rand(1:size, (1, 3)) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. + +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. 
+- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. +""" +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth, data_size +end + +# Operations ------------------------------------------------------------------------------------------------- + +""" + single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + +Setup function to create a field and a similar output field for benchmarking operations that require a single input field. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the field to be generated. + +# Returns +- `a`: A randomly generated field of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. +""" +function single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, out +end + +""" + array_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function array_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{Array{Float64,1}, Array{Float64,1}, Int64} + a = rand(Float64, ARRAY_SIZE) + b = rand(Float64, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size +end + +""" + broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) + +Core operation for the Julia broadcast addition benchmark. + +# Arguments +- `a, b`: Two arrays to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64})::Array{Float64,1} + return a .+ b +end + +""" + broadcast_addition(a::Field, b::Field) + +Core operation for the broadcast addition of two Field benchmark. +Useful to asses and track possible overhead on fields. + +# Arguments +- `a, b`: Two field to be added. + +# Returns +- The result of element-wise addition of the data of the fields `a` and `b`. +""" +function broadcast_addition_fields(a::Field, b::Field)::Field + return a .+ b +end + +""" + fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the field operator broadcast addition benchmark. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out +end + +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. 
+ +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the sine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +function sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the cosine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +function cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the sine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +@field_operator function fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the cosine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +@field_operator function fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that performs remapping from cell-based data to edge-based data. + +This operator utilizes a connectivity table (`E2C`) to map the values from cells to edges, implying a transformation from the cell-centered field to an edge-centered field based on predefined relationships in the connectivity table. + +# Arguments +- `a`: Input field containing Float64 data structured around cells. + +# Returns +- A new field where each element represents data remapped from cells to edges, structured as specified by the edge-to-cell connectivity. +""" +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) +end + +""" + fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that computes the sum of neighboring cell values for each edge. This function leverages the connectivity table (`E2C`), which defines the relationship between edges and cells, to sum the values of cells that are connected to each edge. + +The summation is performed across the dimension specified by `E2CDim`, ensuring that each edge aggregates values from its associated cells correctly. + +# Arguments +- `a`: Input field containing Float64 data, where each cell contains a numerical value. 
+ +# Returns +- A new field where each edge holds the summed value of its neighboring cells, based on the edge-to-cell connectivity defined in `E2C`. +""" +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + +# Start ------------------------------------------------------------------------------------------------------ +println("Current number of threads: ", Threads.nthreads()) +println("The environment is ready\n") +Profile.clear() From d9642216258bcfe323afc70004bbc9c65eff393b Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:36:05 +0200 Subject: [PATCH 47/53] Move autorun in the utils folder --- benchmark/{ => utils}/autorun_benchmarks.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename benchmark/{ => utils}/autorun_benchmarks.sh (100%) diff --git a/benchmark/autorun_benchmarks.sh b/benchmark/utils/autorun_benchmarks.sh similarity index 100% rename from benchmark/autorun_benchmarks.sh rename to benchmark/utils/autorun_benchmarks.sh From 182dd6d123ffb950ddb47e529b7d47309c74c1ef Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:59:40 +0200 Subject: [PATCH 48/53] Update autorun script --- benchmark/utils/autorun_benchmarks.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/utils/autorun_benchmarks.sh b/benchmark/utils/autorun_benchmarks.sh index 73ac906..2f021a4 100755 --- a/benchmark/utils/autorun_benchmarks.sh +++ b/benchmark/utils/autorun_benchmarks.sh @@ -66,11 +66,10 @@ if [ "$advection" == true ]; then benchmark_script="benchmark/benchmarks_advection.jl" command="benchpkg --rev=before_debug,after_debug \ -s $benchmark_script \ - --bench-on=before_debug \ --exeflags=\"--threads=$threads\"" else command="benchpkg --rev=before_debug,after_debug \ - --bench-on=before_debug \ + --bench-on=after_debug \ --exeflags=\"--threads=$threads\"" fi From b1f539e1afdcb1533afe15d86d433d95b1fc53ea Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 23 Aug 2024 10:46:32 +0200 Subject: [PATCH 49/53] Fix the autorun script to use hashes instead of tags --- benchmark/utils/autorun_benchmarks.sh | 28 ++++++++++----------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/benchmark/utils/autorun_benchmarks.sh b/benchmark/utils/autorun_benchmarks.sh index 2f021a4..58a0f0c 100755 --- a/benchmark/utils/autorun_benchmarks.sh +++ b/benchmark/utils/autorun_benchmarks.sh @@ -40,22 +40,13 @@ do esac done -# Check if the tags already exist and delete them if they do -if git rev-parse -q --verify "refs/tags/after_debug" >/dev/null; then - git tag -d after_debug -fi - -if git rev-parse -q --verify "refs/tags/before_debug" >/dev/null; then - git tag -d before_debug -fi - -# Tag the last commit as 'after_debug' -git tag after_debug HEAD -echo "Tagged the latest commit as 'after_debug'" +# Retrieve last two commit hashes +before_debug=$(git rev-parse HEAD~1) +after_debug=$(git rev-parse HEAD) -# Tag the second last commit as 'before_debug' -git tag before_debug HEAD~1 -echo -e "Tagged the previous commit as 'before_debug'\n" +# Tag the last two commits if they are not already tagged +git tag -f after_debug $after_debug +git tag -f before_debug $before_debug # Print the before and after tags with their messages git tag -n | grep -E 'before_debug|after_debug' | while IFS= 
read -r line; do echo "$line"; done ; echo "" @@ -64,12 +55,13 @@ git tag -n | grep -E 'before_debug|after_debug' | while IFS= read -r line; do ec if [ "$advection" == true ]; then # Set the benchmark script for advection benchmark_script="benchmark/benchmarks_advection.jl" - command="benchpkg --rev=before_debug,after_debug \ + command="benchpkg --rev=$before_debug,$after_debug \ -s $benchmark_script \ + --bench-on=$after_debug \ --exeflags=\"--threads=$threads\"" else - command="benchpkg --rev=before_debug,after_debug \ - --bench-on=after_debug \ + command="benchpkg --rev=$before_debug,$after_debug \ + --bench-on=$after_debug \ --exeflags=\"--threads=$threads\"" fi From 5b0f1dc6b12ae0de3457575a7f0303bbf03fb45c Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:02:23 +0200 Subject: [PATCH 50/53] Add gpu backend support for basic broadcast operation --- src/GridTools.jl | 41 +++++++++++++++++++++- src/embedded/builtins.jl | 24 ++++++++++--- src/embedded/cust_broadcast.jl | 63 ++++++++++++++++++++++++++++------ src/examples/example_gpu.jl | 39 +++++++++++++++++++++ test/gpu_test.jl | 44 ++++++++++++++++++++++++ 5 files changed, 194 insertions(+), 17 deletions(-) create mode 100644 src/examples/example_gpu.jl create mode 100644 test/gpu_test.jl diff --git a/src/GridTools.jl b/src/GridTools.jl index 580be0a..8f617f9 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -9,7 +9,7 @@ using Profile using Base: @propagate_inbounds using MacroTools using OffsetArrays: IdOffsetRange -using Debugger +using CUDA import Base.Broadcast: Extruded, Style, BroadcastStyle, ArrayStyle, Broadcasted @@ -157,6 +157,30 @@ julia> field = Field(Cell, ones(5)) julia> field(E2C) julia> field(E2C[1]) ``` + +GPU arrays are supported too. + +# Examples +```julia-repl +julia> using GridTools + +julia> using CUDA: CuArray + +julia> using GridTools.ExampleMeshes.Unstructured + + # Create a CuArray of data on the GPU + +julia> gpu_data = CuArray(reshape(collect(1.0:12.0), (3, 4))); + + # Create a Field passing data in the CuArray type + +julia> gpu_field = Field((Cell,K), gpu_data); + + # Check the type + +julia> Base.typeof(gpu_field.data) +CuArray{Float64, 2, CUDA.DeviceMemory} +``` """ struct Field{ B_Dim <: Tuple{Vararg{Dimension}}, @@ -609,6 +633,20 @@ function backend_execution( end end +# It is not currently working in all edge cases +function check_gpu_data(args::Tuple)::nothing + has_CuArray::Bool = false + for (i, arg) in enumerate(args) + if arg !== nothing && typeof(arg) <: AbstractArray && typeof(arg.data) <: CuArray + has_CuArray = true + end + + if has_CuArray + throw(ArgumentError("GPU Arrays (CuArray) are not supported by the Python backend. 
Error found in argument #$i: $(typeof(arg.data)).")) + end + end +end + function backend_execution( backend::Val{:py}, fo::FieldOp, @@ -624,6 +662,7 @@ function backend_execution( f = py_field_operator(fo) FIELD_OPERATORS[fo.name] = f end + # check_gpu_data(args) # TODO: throw an exception in case of gpu arrays passed to the python backend p_args, p_kwargs, p_out, p_offset_provider = py_args.((args, kwargs, out, GridTools.OFFSET_PROVIDER)) if is_outermost_fo diff --git a/src/embedded/builtins.jl b/src/embedded/builtins.jl index 6ddf639..bdb512b 100644 --- a/src/embedded/builtins.jl +++ b/src/embedded/builtins.jl @@ -40,17 +40,31 @@ function min_over(field_in::Field; axis::Dimension)::Field return reduction_master(field_in, axis, minimum) end +""" + reduction_master(field_in::Field, axis::Dimension, f::Function)::Field +Performs a reduction operation (`sum`, `minimum`, `maximum`, etc.) over a specific axis dimension. +This version supports both CPU and GPU fields. +""" function reduction_master(field_in::Field, axis::Dimension, f::Function) neutral_el = get_neutral(f, eltype(field_in)) dim = get_dim_ind(field_in.dims, axis) conn = OFFSET_PROVIDER[get_dim_name(axis)] - data = dropdims( - f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim), - dims = dim - ) - return Field((field_in.dims[1:dim-1]..., field_in.dims[dim+1:end]...), data) + + if isa(field_in.data, CuArray) + # GPU version using CUDA parallelization + reduced_data = CUDA.fill(neutral_el, size(field_in.data)) + CUDA.@sync reduced_data .= f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim) + reduced_data = dropdims(reduced_data, dims = dim) + else + # CPU version + reduced_data = dropdims( + f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim), + dims = dim + ) + end + return Field((field_in.dims[1:dim-1]..., field_in.dims[dim+1:end]...), reduced_data) end get_neutral(f::typeof(sum), type::DataType) = convert(type, 0) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 0b0ad16..5f9c807 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -230,15 +230,42 @@ end # ----------------------------------------------------------------------------------------------------------------------------------------- +function is_gpu_compatible(bc::Broadcasted{ArrayStyle{Field}})::Bool + is_all_CuArray::Bool = false + has_CuArray::Bool = false + has_CPUArray::Bool = false + + for arg in bc.args + if typeof(arg) <: AbstractArray + # Check if the argument is a CuArray + if typeof(arg.data) <: CuArray + has_CuArray = true + is_all_CuArray = true + # Check if the argument is a CPU array + elseif typeof(arg.data) <: Vector + has_CPUArray = true + end + end + + # If both a CuArray and a CPU Array are present, raise an error + if has_CuArray && has_CPUArray + throw(ErrorException("Cannot have both CuArray and CPU arrays in the same args.")) + end + end + + return is_all_CuArray +end + # Creates uninitialized output object function Base.similar(bc::Broadcasted{ArrayStyle{Field}}, ::Type{ElType}) where {ElType} offsets = getproperty.(axes(bc), :start) .- 1 + is_cuarray::Bool = is_gpu_compatible(bc) Field( - bc.axes.dims, - similar(Array{ElType}, getproperty.(axes(bc), :stop) .- offsets), - bc.axes.broadcast_dims, - offsets - ) + bc.axes.dims, + similar(is_cuarray ? 
CuArray{ElType} : Array{ElType}, getproperty.(axes(bc), :stop) .- offsets), + bc.axes.broadcast_dims, + offsets + ) end # ----------------------------------------------------------------------------------------------------------------------------------------- @@ -249,17 +276,31 @@ end if axes(dest) == axes(bc) && bc.f === identity && bc.args isa Tuple{AbstractArray} # only a single input argument to broadcast! A = bc.args[1] if axes(dest) == axes(A) - return copyto!(dest, A) + if isa(A.data, CuArray) + return CUDA.copyto!(dest.data, A.data) # Use @GPUArrays copyto! + else + return copyto!(dest, A) + end end end - bc′ = Base.Broadcast.preprocess(shape(dest), bc) + if isa(dest.data, CuArray) + # Extract the function and the arguments from the broadcasted expression + f = bc.f + args = bc.args - # Performance may vary depending on whether `@inbounds` is placed outside the - # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @simd for I in eachindex(dest) - dest[I] = bc′[I] + # Apply the function f element-wise to the arguments and store the result in dest.data + CUDA.map!(f, dest.data, map(arg -> arg.data, args)...) + else + bc′ = Base.Broadcast.preprocess(shape(dest), bc) + + # Performance may vary depending on whether `@inbounds` is placed outside the + # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) + @inbounds @simd for I in eachindex(dest) + dest[I] = bc′[I] + end end + return dest end diff --git a/src/examples/example_gpu.jl b/src/examples/example_gpu.jl new file mode 100644 index 0000000..8954a70 --- /dev/null +++ b/src/examples/example_gpu.jl @@ -0,0 +1,39 @@ +using GridTools +using GridTools.ExampleMeshes.Unstructured +using CUDA +using Profile +using Debugger +using BenchmarkTools + +# Cpu + +a_cpu = Field(Cell, collect(1:2e7)) +b_cpu = Field(Cell, collect(1:2e7)) + +out_cpu = similar(a_cpu) + +out_cpu = a_cpu .+ b_cpu + +# Gpu + +a_gpu = Field(Cell, CuArray(1:2e7)) +b_gpu = Field(Cell, CuArray(1:2e7)) + +out_gpu = similar_field(a_gpu) + +out_gpu .= a_gpu .+ b_gpu + +function bench_cpu!(a_cpu, b_cpu, out_cpu) + out_cpu = a_cpu .+ b_cpu +end + +function bench_gpu!(a_gpu, b_gpu, out_gpu) + # Wrapping the execution in a CUDA.@sync block will make + # the CPU block until the queued GPU tasks are done, similar to how Base.@sync waits for distributed CPU tasks + CUDA.@sync begin + out_gpu = a_gpu .+ b_gpu + end +end + +@btime bench_cpu!($a_cpu, $b_cpu, $out_cpu) +@btime bench_gpu!($a_gpu, $b_gpu, $out_gpu) \ No newline at end of file diff --git a/test/gpu_test.jl b/test/gpu_test.jl new file mode 100644 index 0000000..3f7fecb --- /dev/null +++ b/test/gpu_test.jl @@ -0,0 +1,44 @@ +using Test +using CUDA: CuArray +using GridTools +using GridTools.ExampleMeshes.Unstructured + +@testset "Testset Simple Broadcast Addition GPU" begin + a_gpu = Field(Cell, CuArray(1.0:15.0)) + b_gpu = Field(Cell, CuArray(-2.0:-1:-16.0)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a_gpu and b_gpu do not have the same size of data." + + out_gpu = similar_field(a_gpu) + out_gpu = a_gpu .+ b_gpu + + @test all(out_gpu.data .== -1) +end + +@testset "Testset Large Broadcast Addition GPU" begin + # Initialize two large GPU fields with CuArray + a_gpu = Field(Cell, CuArray(1:2e7)) + b_gpu = Field(Cell, CuArray(1:2e7)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a_gpu and b_gpu do not have the same size of data." 
+ + out_gpu = similar_field(a_gpu) + out_gpu .= a_gpu .+ b_gpu + + expected_result = CuArray(2:2:2e7*2) + + @test all(out_gpu.data .== expected_result) +end + +@testset "Testset Field Operator Addition GPU" begin + a_gpu = Field(Cell, CuArray(1.0:15.0)) + b_gpu = Field(Cell, CuArray(-2.0:-1:-16.0)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a and b do not have the same size of data." + + out_gpu = similar_field(a_gpu) + + @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b + end + + fo_addition(a_gpu, b_gpu, backend="embedded", out=out_gpu) + @test all(out_gpu.data .== -1) +end From 5f6d164656db2330ec310e6fdb235f4d29e7c3ee Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:53:51 +0200 Subject: [PATCH 51/53] Add benchmarking suite for gpu --- benchmark/benchmarks_gpu.jl | 160 ++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 benchmark/benchmarks_gpu.jl diff --git a/benchmark/benchmarks_gpu.jl b/benchmark/benchmarks_gpu.jl new file mode 100644 index 0000000..edfc438 --- /dev/null +++ b/benchmark/benchmarks_gpu.jl @@ -0,0 +1,160 @@ +using BenchmarkTools +using CUDA +using GridTools +using GridTools.ExampleMeshes.Unstructured + +# Data size +const global STREAM_SIZE = 10_000_000 + +""" + compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: The benchmark results containing timing information (`times`). +- `a, b`: The input fields or arrays used in the benchmark. +- `out`: The output field or array used in the benchmark. + +# Returns +- A tuple `(bandwidth, data_size)` where: + - `bandwidth`: The memory bandwidth in gigabytes per second (GB/s). + - `data_size`: The total size of the data processed in bytes. +""" +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + # Ensure the sizes of the data fields are consistent + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + + # Calculate the total size of data read and written in bytes + # Read from `a` and `b`, and write to `out` + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) + + # Compute the median execution time from benchmark results in seconds (convert from nanoseconds) + time_in_seconds = median(results.times) / 1e9 + + # Calculate memory bandwidth in GB/s + bandwidth = data_size / time_in_seconds / 1e9 + + return bandwidth, data_size +end + +# GPU Setup Functions ----------------------------------------------------------------------------------------- + +""" + gpu_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the GPU broadcast addition benchmark using CuArray. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the GPU arrays to be generated. + +# Returns +- `a, b`: Two CuArray GPU arrays of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function gpu_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{CuArray{Float64,1}, CuArray{Float64,1}, Int64} + a_gpu = CuArray(rand(Float64, ARRAY_SIZE)) + b_gpu = CuArray(rand(Float64, ARRAY_SIZE)) + data_size = sizeof(a_gpu) + sizeof(b_gpu) # Total bytes processed + return a_gpu, b_gpu, data_size +end + +""" + gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the GPU field broadcast addition benchmark using CuArray. 
+ +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of CuArray floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. +""" +function gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + a_gpu = Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) + b_gpu = Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) + out_gpu = GridTools.similar_field(a_gpu) + return a_gpu, b_gpu, out_gpu +end + +# CuArray only +function gpu_broadcast_addition_array(a::CuArray{Float64}, b::CuArray{Float64})::CuArray{Float64} + return a .+ b +end + +# Fields and broadcasting +function gpu_broadcast_addition_fields(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +function arr_add_wrapper(a, b) + CUDA.@sync begin + return gpu_broadcast_addition_array(a,b) + end +end + +function field_add_wrapper(a, b) + CUDA.@sync begin + return gpu_broadcast_addition_fields(a,b) + end +end + +@field_operator function gpu_fo_addition_with_wrapper(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + CUDA.@sync begin + return a .+ b + end +end + +# Benchmarks ------------------------------------------------------------------------------------------------- + +# Create the GPU benchmark SUITE +SUITE_GPU = BenchmarkGroup() + +# Define the GPU addition benchmarks +SUITE_GPU["gpu_addition"] = BenchmarkGroup() + +# GPU broadcast addition benchmark +a_gpu, b_gpu, data_size_gpu = gpu_broadcast_addition_setup(STREAM_SIZE) +SUITE_GPU["gpu_addition"]["gpu_array_broadcast_addition"] = @benchmarkable $arr_add_wrapper($a_gpu, $b_gpu) + +# GPU Field broadcast addition benchmark # TODO(lorenzovarese): fix the CUDA.@sync, results are unrealistic +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) +SUITE_GPU["gpu_addition"]["gpu_fields_broadcast_addition"] = @benchmarkable $field_add_wrapper($a_gpu, $b_gpu) + +# GPU Field Operator broadcast addition benchmark # TODO(lorenzovarese): fix the CUDA.@sync, results are unrealistic +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) +SUITE_GPU["gpu_addition"]["gpu_field_op_broadcast_addition"] = @benchmarkable $gpu_fo_addition($a_gpu, $b_gpu, backend="embedded", out=$out_gpu) + +# Running the GPU benchmark SUITE +println("Running the GPU benchmark SUITE...") +gpu_results = run(SUITE_GPU) + +# Process and print the GPU results +gpu_array_results = gpu_results["gpu_addition"]["gpu_array_broadcast_addition"] +gpu_fields_results = gpu_results["gpu_addition"]["gpu_fields_broadcast_addition"] +gpu_fo_results = gpu_results["gpu_addition"]["gpu_field_op_broadcast_addition"] + +# Compute memory bandwidth for GPU benchmarks +gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_results, a_gpu, b_gpu, a_gpu) +gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_results, a_gpu, b_gpu, a_gpu) +gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_results, a_gpu, b_gpu, out_gpu) + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +# Output results for GPU benchmarks +println("GPU Array broadcast addition:") +println("\tData size: $data_size_arr_gpu") +println("\tBandwidth: $gpu_array_bandwidth GB/s") +println("\tTime taken: 
$(ns_to_ms(median(gpu_array_results.times))) ms\n") + +println("GPU Fields data broadcast addition:") +println("\tData size: $data_size_fields_gpu") +println("\tBandwidth: $gpu_fields_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(gpu_fields_results.times))) ms\n") + +println("GPU Field Operator broadcast addition:") +println("\tData size: $data_size_fo_gpu") +println("\tBandwidth: $gpu_fo_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(gpu_fo_results.times))) ms\n") From 47dbe38b3e129a4bb79efaecdd5845a1721abd87 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:58:18 +0200 Subject: [PATCH 52/53] Fix gpu benchmark for memory bandwidth computation --- benchmark/benchmarks_gpu.jl | 135 +++++++++++++++--------------------- 1 file changed, 56 insertions(+), 79 deletions(-) diff --git a/benchmark/benchmarks_gpu.jl b/benchmark/benchmarks_gpu.jl index edfc438..2c9d658 100644 --- a/benchmark/benchmarks_gpu.jl +++ b/benchmark/benchmarks_gpu.jl @@ -4,33 +4,25 @@ using GridTools using GridTools.ExampleMeshes.Unstructured # Data size -const global STREAM_SIZE = 10_000_000 +const STREAM_SIZE::Int64 = 10_000_000 """ - compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + compute_memory_bandwidth_addition(time_in_seconds, a, b, out)::Tuple{Float64, Int64} Function to compute the memory bandwidth for the addition benchmarks. # Arguments -- `results`: The benchmark results containing timing information (`times`). -- `a, b`: The input fields or arrays used in the benchmark. -- `out`: The output field or array used in the benchmark. +- `time_in_seconds`: The execution time in seconds. +- `STREAM_SIZE`: the size used for the arrays # Returns - A tuple `(bandwidth, data_size)` where: - `bandwidth`: The memory bandwidth in gigabytes per second (GB/s). - `data_size`: The total size of the data processed in bytes. """ -function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} - # Ensure the sizes of the data fields are consistent - @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) - +function compute_memory_bandwidth_addition(time_in_seconds::Float64, STREAM_SIZE::Int64, data_type::Type)::Tuple{Float64, Int64} # Calculate the total size of data read and written in bytes - # Read from `a` and `b`, and write to `out` - data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) - - # Compute the median execution time from benchmark results in seconds (convert from nanoseconds) - time_in_seconds = median(results.times) / 1e9 + data_size = 3 * STREAM_SIZE * sizeof(data_type) # (a + b + out), each Float64 is 8 bytes # Calculate memory bandwidth in GB/s bandwidth = data_size / time_in_seconds / 1e9 @@ -49,14 +41,14 @@ Setup function for the GPU broadcast addition benchmark using CuArray. - `ARRAY_SIZE::Int64`: The size of the GPU arrays to be generated. # Returns -- `a, b`: Two CuArray GPU arrays of size `ARRAY_SIZE`. -- `data_size`: The total size of the data processed. +- `a_gpu`, `b_gpu`, `out_gpu`: Three CuArray GPU arrays of size `ARRAY_SIZE`. 
""" -function gpu_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{CuArray{Float64,1}, CuArray{Float64,1}, Int64} - a_gpu = CuArray(rand(Float64, ARRAY_SIZE)) - b_gpu = CuArray(rand(Float64, ARRAY_SIZE)) - data_size = sizeof(a_gpu) + sizeof(b_gpu) # Total bytes processed - return a_gpu, b_gpu, data_size +function gpu_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{CuArray{Float64,1}, CuArray{Float64,1}, CuArray{Float64,1}} + randcuarr = () -> CuArray(rand(Float64, ARRAY_SIZE)) + a_gpu = randcuarr() + b_gpu = randcuarr() + out_gpu = randcuarr() + return a_gpu, b_gpu, out_gpu end """ @@ -72,89 +64,74 @@ Setup function for the GPU field broadcast addition benchmark using CuArray. - `out`: An output field similar to `a`, used for storing operation results. """ function gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} - a_gpu = Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) - b_gpu = Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) - out_gpu = GridTools.similar_field(a_gpu) + randfieldcuarr = () -> Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) + a_gpu = randfieldcuarr() + b_gpu = randfieldcuarr() + out_gpu = randfieldcuarr() return a_gpu, b_gpu, out_gpu end # CuArray only -function gpu_broadcast_addition_array(a::CuArray{Float64}, b::CuArray{Float64})::CuArray{Float64} - return a .+ b -end - -# Fields and broadcasting -function gpu_broadcast_addition_fields(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} - return a .+ b -end - -function arr_add_wrapper(a, b) +function arr_add_wrapper!(out::CuArray{Float64,1}, a::CuArray{Float64,1}, b::CuArray{Float64,1}) CUDA.@sync begin - return gpu_broadcast_addition_array(a,b) + out = a .+ b end end -function field_add_wrapper(a, b) +# Fields only +function field_add_wrapper!(out::Field{Tuple{Cell_},Float64}, a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64}) CUDA.@sync begin - return gpu_broadcast_addition_fields(a,b) + out = a .+ b end end -@field_operator function gpu_fo_addition_with_wrapper(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} +# Field operator +@field_operator function gpu_fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +function gpu_fo_addition_wrapper!(out::Field{Tuple{Cell_},Float64}, a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64}) CUDA.@sync begin - return a .+ b + gpu_fo_addition(a, b, backend="embedded", out=out) end end -# Benchmarks ------------------------------------------------------------------------------------------------- +# Benchmarks with @belapsed -# Create the GPU benchmark SUITE -SUITE_GPU = BenchmarkGroup() +# CuArray ----------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_broadcast_addition_setup(STREAM_SIZE) -# Define the GPU addition benchmarks -SUITE_GPU["gpu_addition"] = BenchmarkGroup() +println("Benchmarking GPU array broadcast addition:") +gpu_array_time = @belapsed arr_add_wrapper!($out_gpu, $a_gpu, $b_gpu) -# GPU broadcast addition benchmark -a_gpu, b_gpu, data_size_gpu = gpu_broadcast_addition_setup(STREAM_SIZE) -SUITE_GPU["gpu_addition"]["gpu_array_broadcast_addition"] = @benchmarkable $arr_add_wrapper($a_gpu, $b_gpu) - -# GPU Field broadcast addition benchmark # TODO(lorenzovarese): fix the CUDA.@sync, results are unrealistic -a_gpu, b_gpu, out_gpu = 
gpu_fields_broadcast_addition_setup(STREAM_SIZE) -SUITE_GPU["gpu_addition"]["gpu_fields_broadcast_addition"] = @benchmarkable $field_add_wrapper($a_gpu, $b_gpu) +# Compute memory bandwidth for GPU array benchmark +gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_time, STREAM_SIZE, eltype(a_gpu)) +println("GPU Array broadcast addition:") +println("\tData size: $data_size_arr_gpu") +println("\tTime: $gpu_array_time") +println("\tBandwidth: $gpu_array_bandwidth GB/s\n") -# GPU Field Operator broadcast addition benchmark # TODO(lorenzovarese): fix the CUDA.@sync, results are unrealistic +# Fields ------------------------------------------------------------------------------------------------------------- a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) -SUITE_GPU["gpu_addition"]["gpu_field_op_broadcast_addition"] = @benchmarkable $gpu_fo_addition($a_gpu, $b_gpu, backend="embedded", out=$out_gpu) - -# Running the GPU benchmark SUITE -println("Running the GPU benchmark SUITE...") -gpu_results = run(SUITE_GPU) - -# Process and print the GPU results -gpu_array_results = gpu_results["gpu_addition"]["gpu_array_broadcast_addition"] -gpu_fields_results = gpu_results["gpu_addition"]["gpu_fields_broadcast_addition"] -gpu_fo_results = gpu_results["gpu_addition"]["gpu_field_op_broadcast_addition"] -# Compute memory bandwidth for GPU benchmarks -gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_results, a_gpu, b_gpu, a_gpu) -gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_results, a_gpu, b_gpu, a_gpu) -gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_results, a_gpu, b_gpu, out_gpu) +println("Benchmarking GPU fields broadcast addition:") +gpu_fields_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) -# Function to convert nanoseconds to milliseconds for clearer output -ns_to_ms(time_ns) = time_ns / 1e6 +# Compute memory bandwidth for GPU fields benchmark +gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_time, STREAM_SIZE, eltype(a_gpu.data)) +println("GPU Fields broadcast addition:") +println("\tData size: $data_size_fields_gpu") +println("\tTime: $gpu_fields_time") +println("\tBandwidth: $gpu_fields_bandwidth GB/s\n") -# Output results for GPU benchmarks -println("GPU Array broadcast addition:") -println("\tData size: $data_size_arr_gpu") -println("\tBandwidth: $gpu_array_bandwidth GB/s") -println("\tTime taken: $(ns_to_ms(median(gpu_array_results.times))) ms\n") +# Field operator ------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) -println("GPU Fields data broadcast addition:") -println("\tData size: $data_size_fields_gpu") -println("\tBandwidth: $gpu_fields_bandwidth GB/s") -println("\tTime taken: $(ns_to_ms(median(gpu_fields_results.times))) ms\n") +println("Benchmarking GPU field operator broadcast addition:") +gpu_fo_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) +# Compute memory bandwidth for GPU field operator benchmark +gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_time, STREAM_SIZE, eltype(a_gpu.data)) println("GPU Field Operator broadcast addition:") println("\tData size: $data_size_fo_gpu") -println("\tBandwidth: $gpu_fo_bandwidth GB/s") -println("\tTime taken: $(ns_to_ms(median(gpu_fo_results.times))) ms\n") 
+println("\tBandwidth: $gpu_fo_bandwidth GB/s\n") From b1fe0b21bc3c158749c7bb8c36f631c3b3a94a31 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:05:22 +0200 Subject: [PATCH 53/53] Improve the printing of the gpu benchmark results --- benchmark/benchmarks_gpu.jl | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmarks_gpu.jl b/benchmark/benchmarks_gpu.jl index 2c9d658..fa6e507 100644 --- a/benchmark/benchmarks_gpu.jl +++ b/benchmark/benchmarks_gpu.jl @@ -30,6 +30,11 @@ function compute_memory_bandwidth_addition(time_in_seconds::Float64, STREAM_SIZE return bandwidth, data_size end +# Util for pretty print the results +function format_number_with_dots(n::Int) + return reverse(join(Iterators.partition(reverse(string(n)), 3), ".")) +end + # GPU Setup Functions ----------------------------------------------------------------------------------------- """ @@ -107,8 +112,8 @@ gpu_array_time = @belapsed arr_add_wrapper!($out_gpu, $a_gpu, $b_gpu) # Compute memory bandwidth for GPU array benchmark gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_time, STREAM_SIZE, eltype(a_gpu)) println("GPU Array broadcast addition:") -println("\tData size: $data_size_arr_gpu") -println("\tTime: $gpu_array_time") +println("\tData size: $(format_number_with_dots(data_size_arr_gpu)) bytes") +println("\tTime: $gpu_array_time s") println("\tBandwidth: $gpu_array_bandwidth GB/s\n") # Fields ------------------------------------------------------------------------------------------------------------- @@ -120,8 +125,8 @@ gpu_fields_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) # Compute memory bandwidth for GPU fields benchmark gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_time, STREAM_SIZE, eltype(a_gpu.data)) println("GPU Fields broadcast addition:") -println("\tData size: $data_size_fields_gpu") -println("\tTime: $gpu_fields_time") +println("\tData size: $(format_number_with_dots(data_size_fields_gpu)) bytes") +println("\tTime: $gpu_fields_time s") println("\tBandwidth: $gpu_fields_bandwidth GB/s\n") # Field operator ------------------------------------------------------------------------------------------------------- @@ -133,5 +138,6 @@ gpu_fo_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) # Compute memory bandwidth for GPU field operator benchmark gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_time, STREAM_SIZE, eltype(a_gpu.data)) println("GPU Field Operator broadcast addition:") -println("\tData size: $data_size_fo_gpu") +println("\tData size: $(format_number_with_dots(data_size_fo_gpu)) bytes") +println("\tTime: $gpu_fo_time s") println("\tBandwidth: $gpu_fo_bandwidth GB/s\n")
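For reference, the STREAM-style bandwidth formula used in benchmarks_gpu.jl can be sanity-checked in isolation. The snippet below is a minimal standalone sketch that restates the formula with a hypothetical helper name (`bandwidth_check` is not part of the patch); like the benchmark, it assumes one addition reads `a` and `b` and writes `out`, i.e. three arrays of STREAM_SIZE elements of the given element type.

# Minimal sketch of the bandwidth computation (hypothetical helper, not in the patch)
function bandwidth_check(time_in_seconds::Float64, stream_size::Int, data_type::Type)
    data_size = 3 * stream_size * sizeof(data_type)  # bytes moved: read a, read b, write out
    return data_size / time_in_seconds / 1e9         # GB/s
end

# Worked example: 10_000_000 Float64 elements moved in 2 ms
# 3 * 10^7 * 8 bytes = 2.4e8 bytes; 2.4e8 / 0.002 / 1e9 = 120 GB/s
bandwidth_check(2e-3, 10_000_000, Float64)  # ≈ 120.0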