From 5c46d041011304160c6211c77a61ab5111c96a87 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:05:22 +0200 Subject: [PATCH 01/53] Fix mesh definitions in benchmarks --- benchmark/benchmarks.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 8255aca..119506b 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -4,6 +4,8 @@ push!(LOAD_PATH, path_to_package) using BenchmarkTools using GridTools +include("../test/mesh_definitions.jl") + SUITE = BenchmarkGroup() SUITE["arith_broadcast"] = BenchmarkGroup() From 1eea839026adea94dae4f590052dd8ea7bf8a3a8 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:05:46 +0200 Subject: [PATCH 02/53] Attempt to fix the benchmark PR --- .github/workflows/benchmark_pr.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 71b10e2..ed7f950 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -18,6 +18,14 @@ jobs: with: version: "1.8" - uses: julia-actions/cache@v1 + - name: Set up Python environment + run: | + sudo apt-get update + sudo apt-get install python3-pip + python3 -m pip install gt4py.next + - name: Configure PyCall + run: | + julia -e 'using Pkg; ENV["PYTHON"]="/usr/bin/python3"; Pkg.build("PyCall");' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -27,7 +35,6 @@ jobs: env: JULIA_NUM_THREADS: 2 run: | - # Lightweight build step, as sometimes the runner runs out of memory: julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(;url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.build("AirspeedVelocity")' - name: Add ~/.julia/bin to PATH From 02ee8b71922628bbbe6cdcbc628f60ca497520cb Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:11:56 +0200 Subject: [PATCH 03/53] Fix pip install gt4py --- .github/workflows/benchmark_pr.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index ed7f950..46e5b72 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -21,11 +21,18 @@ jobs: - name: Set up Python environment run: | sudo apt-get update - sudo apt-get install python3-pip - python3 -m pip install gt4py.next + sudo apt-get install python3-pip python3-venv + python3 -m venv ~/gt4py-venv + source ~/gt4py-venv/bin/activate + python3 -m pip install --upgrade pip + - name: Install GT4Py from specific branch + run: | + git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py + pip install -r ~/gt4py/requirements-dev.txt + pip install -e ~/gt4py - name: Configure PyCall run: | - julia -e 'using Pkg; ENV["PYTHON"]="/usr/bin/python3"; Pkg.build("PyCall");' + julia -e 'using Pkg; ENV["PYTHON"]="~/gt4py-venv/bin/python3"; Pkg.build("PyCall");' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -82,4 +89,4 @@ jobs: # comment-id: ${{ steps.fcbenchmark.outputs.comment-id }} issue-number: ${{ github.event.pull_request.number }} body-path: body.md - edit-mode: replace \ No newline at end of file + edit-mode: replace From 2ae9b772758b290186401327cc5befe7fc75983b Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:16:28 +0200 Subject: [PATCH 04/53] Fix 
PyCall installation in the benchmark_pr.yml --- .github/workflows/benchmark_pr.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 46e5b72..2c1911a 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -30,9 +30,9 @@ jobs: git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py pip install -r ~/gt4py/requirements-dev.txt pip install -e ~/gt4py - - name: Configure PyCall + - name: Install and Configure PyCall run: | - julia -e 'using Pkg; ENV["PYTHON"]="~/gt4py-venv/bin/python3"; Pkg.build("PyCall");' + julia --project -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="~/gt4py-venv/bin/python3"; Pkg.build("PyCall");' - name: Extract Package Name from Project.toml id: extract-package-name run: | From 86863261de448f3db32cadc55e0466690dbabc9e Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:29:28 +0200 Subject: [PATCH 05/53] Fix the PyCall invoke --- .github/workflows/benchmark_pr.yml | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 2c1911a..5dc9f78 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -28,11 +28,17 @@ jobs: - name: Install GT4Py from specific branch run: | git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py - pip install -r ~/gt4py/requirements-dev.txt - pip install -e ~/gt4py + cd ~/gt4py + pip install -r requirements-dev.txt + pip install -e . - name: Install and Configure PyCall run: | - julia --project -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="~/gt4py-venv/bin/python3"; Pkg.build("PyCall");' + source ~/gt4py-venv/bin/activate + julia --project -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="python"; Pkg.build("PyCall");' + - name: Check PyCall Configuration + run: | + source ~/gt4py-venv/bin/activate + julia -e 'using PyCall; @show PyCall.python' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -42,19 +48,21 @@ jobs: env: JULIA_NUM_THREADS: 2 run: | - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(;url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' + julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.build("AirspeedVelocity")' - name: Add ~/.julia/bin to PATH run: | echo "$HOME/.julia/bin" >> $GITHUB_PATH - name: Run benchmarks run: | + source ~/gt4py-venv/bin/activate echo $PATH ls -l ~/.julia/bin mkdir results benchpkg ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --url=${{ github.event.repository.clone_url }} --bench-on="${{github.event.repository.default_branch}}" --output-dir=results/ --tune - name: Create plots from benchmarks run: | + source ~/gt4py-venv/bin/activate mkdir -p plots benchpkgplot ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --npart=10 --format=png --input-dir=results/ --output-dir=plots/ - name: Upload plot as artifact @@ -64,6 +72,7 @@ jobs: path: plots - name: Create markdown table from benchmarks run: | + source ~/gt4py-venv/bin/activate 
benchpkgtable ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --input-dir=results/ --ratio > table.md echo '### Benchmark Results' > body.md echo '' >> body.md From e978b6ce0beb9075fa1d9ba7dd90bc7d8701f349 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:42:58 +0200 Subject: [PATCH 06/53] Add reference to julia env in benchmark_pr config --- .github/workflows/benchmark_pr.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 5dc9f78..064fb50 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -34,11 +34,11 @@ jobs: - name: Install and Configure PyCall run: | source ~/gt4py-venv/bin/activate - julia --project -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="python"; Pkg.build("PyCall");' + julia --project=. -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="python"; Pkg.build("PyCall");' - name: Check PyCall Configuration run: | source ~/gt4py-venv/bin/activate - julia -e 'using PyCall; @show PyCall.python' + julia --project=. -e 'using PyCall; @show PyCall.python' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -48,8 +48,7 @@ jobs: env: JULIA_NUM_THREADS: 2 run: | - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.build("AirspeedVelocity")' + julia --project=. -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; using Pkg; Pkg.add(url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git"); Pkg.build("AirspeedVelocity")' - name: Add ~/.julia/bin to PATH run: | echo "$HOME/.julia/bin" >> $GITHUB_PATH From ea7d32d9888b623d64abd49937bddb364e658619 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Thu, 18 Jul 2024 16:51:02 +0200 Subject: [PATCH 07/53] Add cache for python, and fix the pycall (again :/) --- .github/workflows/benchmark_pr.yml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 064fb50..b0a3d51 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -17,7 +17,20 @@ jobs: - uses: julia-actions/setup-julia@v1 with: version: "1.8" - - uses: julia-actions/cache@v1 + - uses: actions/cache@v2 + name: Cache Julia packages + with: + path: ~/.julia + key: ${{ runner.os }}-julia-${{ hashFiles('**/Project.toml', '**/Manifest.toml') }} + restore-keys: | + ${{ runner.os }}-julia- + - uses: actions/cache@v2 + name: Cache Python packages + with: + path: ~/gt4py-venv + key: ${{ runner.os }}-python-${{ hashFiles('**/requirements-dev.txt') }} + restore-keys: | + ${{ runner.os }}-python- - name: Set up Python environment run: | sudo apt-get update From 3d12f85374acae4fc033ccdb232a823aadfee67f Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Fri, 19 Jul 2024 15:26:37 +0200 Subject: [PATCH 08/53] Activate the env in the benchmark CI --- .github/workflows/benchmark_pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index b0a3d51..a8bee98 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -40,6 +40,7 @@ jobs: python3 -m pip install --upgrade pip - name: Install GT4Py from specific branch run: | + source 
~/gt4py-venv/bin/activate git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py cd ~/gt4py pip install -r requirements-dev.txt From 2ae25f26317187135c2bda01d1534bf028f9ffef Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Fri, 19 Jul 2024 17:12:34 +0200 Subject: [PATCH 09/53] Include the Cell and K definitions in the benchmark --- benchmark/benchmarks.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 119506b..ab0de20 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -4,7 +4,11 @@ push!(LOAD_PATH, path_to_package) using BenchmarkTools using GridTools -include("../test/mesh_definitions.jl") +# Mesh definitions ------------------------------------------------------------------------------------------- +const global Cell_ = Dimension{:Cell_, HORIZONTAL} +const global K_ = Dimension{:K_, HORIZONTAL} +const global Cell = Cell_() +const global K = K_() SUITE = BenchmarkGroup() From 5412e44644315be8671c8016c7534bf158d97c23 Mon Sep 17 00:00:00 2001 From: LorenzoVarese Date: Mon, 22 Jul 2024 11:41:57 +0200 Subject: [PATCH 10/53] Add readme to run benchmark example --- .gitignore | 5 +++- benchmark/README.md | 56 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 benchmark/README.md diff --git a/.gitignore b/.gitignore index 0e02d77..3d9056e 100644 --- a/.gitignore +++ b/.gitignore @@ -29,5 +29,8 @@ env_setup.sh .python-version # Misc -.DS_Store +**/.DS_Store .vscode + +# Ignore benchmark (benchpkg) results +results_GridTools@* diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..9ae7e7a --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,56 @@ +# Benchmark Guide 🧭📈 + +## Installation + +To install the benchmark CLI, execute the following command: + +```bash +julia -e 'using Pkg; Pkg.add("AirspeedVelocity"); Pkg.build("AirspeedVelocity")' +``` + +This installation will create three executables in the `~/.julia/bin` folder: `benchpkg`, `benchpkgplot`, and `benchpkgtable`. It is necessary to add them to your `$PATH` to use them from any terminal session. + +### Add to PATH Temporarily + +To temporarily add the path to your session: + +```bash +export PATH="$PATH:~/.julia/bin" +``` + +### Add to PATH Permanently + +To permanently add the executables to your path, append the following line to your `.zshrc` or `.bashrc` file: + +```bash +echo 'export PATH="$PATH:~/.julia/bin"' >> ~/.zshrc # For zsh users +echo 'export PATH="$PATH:~/.julia/bin"' >> ~/.bashrc # For bash users +``` + +## Running Benchmarks + +To run benchmarks, simply execute the following command in the shell: + +```bash +benchpkg +``` + +and it will: + +1. Figure out the package name (from Project.toml) +2. Figure out the default branch name to compare the dirty state of your repo against +3. Evaluate all the benchmarks in benchmarks/benchmark.jl (BenchmarkTools.jl format – i.e., const SUITE = BenchmarkGroup()) +4. Print the result in a nicely formatted markdown table + +You can use the `--filter` option to quickly check if the load time has worsened compared to the master branch: + +```bash +benchpkg --filter=time_to_load +``` + +The `benchpkg` was updated in June 2024 to automate the benchmark without specifying the parameters. +To specify additional condition in `benchpkg` and to work with `benchpkgplot` consult the help command (`--h`). 
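+
+For reference, the benchmark file that `benchpkg` evaluates uses the standard BenchmarkTools.jl layout mentioned above (`const SUITE = BenchmarkGroup()`). The snippet below is only a minimal, illustrative sketch; the group and benchmark names are made up and do not reflect this repository's actual suite:
+
+```julia
+# Minimal illustrative suite in the format that benchpkg expects.
+using BenchmarkTools
+
+const SUITE = BenchmarkGroup()
+SUITE["example"] = BenchmarkGroup()
+
+x = rand(1000); y = rand(1000)
+# Interpolate with `$` so that creating `x` and `y` is not part of the measured time.
+SUITE["example"]["broadcast_addition"] = @benchmarkable $x .+ $y
+```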
+ +## Creating New Benchmarks + +TODO: Instructions for adding new benchmarks to the suite. From 978e6b59e3d341e90f8d737623cb72658ce5937e Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 24 Jul 2024 13:50:01 +0200 Subject: [PATCH 11/53] Add commented benchmark for field operations --- benchmark/benchmarks.jl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index ab0de20..9daa4b6 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -19,4 +19,20 @@ af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)) SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf +# Benchmark for field operator addition + +# function benchmark_fo_addition() +# a = Field(Cell, collect(1.0:15.0)) +# b = Field(Cell, collect(-1.0:-1:-15.0)) +# out = Field(Cell, zeros(Float64, 15)) + +# @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} +# return a .+ b +# end + +# @benchmarkable fo_addition(a, b, backend="embedded", out=out) +# end + +# SUITE["field_operator"]["addition"] = benchmark_fo_addition() + run(SUITE, verbose = true, seconds = 1) From 8046538e8b4fcd28aabfe7a7d980ffa56ddd018d Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 26 Jul 2024 14:43:21 +0200 Subject: [PATCH 12/53] fix benchmarks --- benchmark/benchmarks.jl | 95 ++++++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 15 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 9daa4b6..f4248d7 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -9,30 +9,95 @@ const global Cell_ = Dimension{:Cell_, HORIZONTAL} const global K_ = Dimension{:K_, HORIZONTAL} const global Cell = Cell_() const global K = K_() +const global Edge_ = Dimension{:Edge_, HORIZONTAL} +const global Edge = Edge_() +const global E2CDim_ = Dimension{:E2CDim_, LOCAL} +const global E2CDim = E2CDim_() + + +function setup_simple_connectivity()::Dict{String,Connectivity} + edge_to_cell_table = [ + [1 -1]; + [3 -1]; + [3 -1]; + [4 -1]; + [5 -1]; + [6 -1]; + [1 6]; + [1 2]; + [2 3]; + [2 4]; + [4 5]; + [5 6] + ] + + cell_to_edge_table = [ + [1 7 8]; + [8 9 10]; + [2 3 9]; + [4 10 11]; + [5 11 12]; + [6 7 12] + ] + + E2C_offset_provider = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E_offset_provider = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + offset_provider = Dict{String,Connectivity}( + "E2C" => E2C_offset_provider, + "C2E" => C2E_offset_provider, + "E2CDim" => E2C_offset_provider # TODO(lorenzovarese): this is required for the embedded backend (note: python already uses E2C) + ) + + return offset_provider +end SUITE = BenchmarkGroup() -SUITE["arith_broadcast"] = BenchmarkGroup() +# Legacy Suite with first tests +# SUITE["arith_broadcast"] = BenchmarkGroup() + +# a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) +# af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) +# SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c +# SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf -a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) -af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = 
Field((Cell, K), rand(1000, 1000)) -SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c -SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf +SUITE["field_operator"] = BenchmarkGroup() # Benchmark for field operator addition +function benchmark_fo_addition() + a = Field(Cell, collect(1.0:15.0)) + b = Field(Cell, collect(-1.0:-1:-15.0)) + out = Field(Cell, zeros(Float64, 15)) + + @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b + end + + @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) #setup=( + # a = Field(Cell, collect(1.0:15.0)); + # b = Field(Cell, collect(-1.0:-1:-15.0)); + # out_field = Field(Cell, zeros(Float64, 15)); + # @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} return a .+ b end; + # ) +end + +SUITE["field_operator"]["addition"] = benchmark_fo_addition() -# function benchmark_fo_addition() -# a = Field(Cell, collect(1.0:15.0)) -# b = Field(Cell, collect(-1.0:-1:-15.0)) -# out = Field(Cell, zeros(Float64, 15)) +# Benchmark for neighbor sum +function benchmark_fo_neighbor_sum() + offset_provider = setup_simple_connectivity(); + a = Field(Cell, collect(5.0:17.0) * 3); + E2C = FieldOffset("E2C", source=Cell, target=(Edge, E2CDim)) + out_field = Field(Edge, zeros(Float64, 12)) -# @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} -# return a .+ b -# end + @field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) + end -# @benchmarkable fo_addition(a, b, backend="embedded", out=out) -# end + @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, out=$out_field) +end -# SUITE["field_operator"]["addition"] = benchmark_fo_addition() +SUITE["field_operator"]["neighbor_sum"] = benchmark_fo_neighbor_sum() run(SUITE, verbose = true, seconds = 1) From 8fea0cae311f1f20de9443f6b9c31ea283c59bfd Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:14:17 +0200 Subject: [PATCH 13/53] Add benchmark comparison between Julia's broadcast addition and the field operator one --- benchmark/benchsuite_fo.jl | 124 +++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 benchmark/benchsuite_fo.jl diff --git a/benchmark/benchsuite_fo.jl b/benchmark/benchsuite_fo.jl new file mode 100644 index 0000000..a208bc1 --- /dev/null +++ b/benchmark/benchsuite_fo.jl @@ -0,0 +1,124 @@ +using BenchmarkTools +using Statistics +using GridTools + +# Data size +const global STREAM_SIZE = 10000000 # 10 million + +# Mesh definitions +const global Cell_ = Dimension{:Cell_, HORIZONTAL} +const global Cell = Cell_() + +""" + julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. 
+""" +function julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + a = rand(Int, ARRAY_SIZE) + b = rand(Int, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size +end + +""" + julia_broadcast_addition_operation(a, b) + +Core operation for the Julia broadcast addition benchmark. + +# Arguments +- `a, b`: Two arrays to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +function julia_broadcast_addition_operation(a, b) + return a .+ b +end + +""" + fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the field operator broadcast addition benchmark. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out +end + +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. +""" +function compute_memory_bandwidth_addition(results, a, b, out) + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +# Create the benchmark suite +suite = BenchmarkGroup() + +# Julia broadcast addition benchmark +a, b, data_size = julia_broadcast_addition_setup(STREAM_SIZE) +suite["julia_broadcast_addition"] = @benchmarkable $julia_broadcast_addition_operation($a, $b) + +# FO broadcast addition benchmark +a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +suite["fo_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) + +# Run the benchmark suite +results = run(suite) + +# Process the results +julia_results = results["julia_broadcast_addition"] +fo_results = results["fo_broadcast_addition"] + +# Process and print the results +julia_bandwidth = compute_memory_bandwidth_addition(julia_results, a, b, a) # TODO: improve out +fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) + +println("Julia broadcast addition bandwidth: $julia_bandwidth GB/s") +println("FO broadcast addition bandwidth: $fo_bandwidth GB/s") From 8d0296bab23cb9976e711c527a2bd1c10aa4823f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:20:17 +0200 Subject: [PATCH 14/53] Update the Benchmark suite. Provide comparison between broadcast on array, on fields data and with field operator. 
--- benchmark/benchmarks.jl | 211 ++++++++++++++++++++++-------------- benchmark/benchmarks_old.jl | 103 ++++++++++++++++++ benchmark/benchsuite_fo.jl | 124 --------------------- 3 files changed, 232 insertions(+), 206 deletions(-) create mode 100644 benchmark/benchmarks_old.jl delete mode 100644 benchmark/benchsuite_fo.jl diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index f4248d7..2fcae28 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -1,103 +1,150 @@ -using Pkg -path_to_package = joinpath(@__DIR__, "..") # Assuming the benchmarks.jl file is in the "benchmark" directory -push!(LOAD_PATH, path_to_package) using BenchmarkTools +using Statistics using GridTools -# Mesh definitions ------------------------------------------------------------------------------------------- +# Data size +const global STREAM_SIZE = 10000000 # 10 million + +# Mesh definitions const global Cell_ = Dimension{:Cell_, HORIZONTAL} -const global K_ = Dimension{:K_, HORIZONTAL} const global Cell = Cell_() -const global K = K_() -const global Edge_ = Dimension{:Edge_, HORIZONTAL} -const global Edge = Edge_() -const global E2CDim_ = Dimension{:E2CDim_, LOCAL} -const global E2CDim = E2CDim_() - - -function setup_simple_connectivity()::Dict{String,Connectivity} - edge_to_cell_table = [ - [1 -1]; - [3 -1]; - [3 -1]; - [4 -1]; - [5 -1]; - [6 -1]; - [1 6]; - [1 2]; - [2 3]; - [2 4]; - [4 5]; - [5 6] - ] - - cell_to_edge_table = [ - [1 7 8]; - [8 9 10]; - [2 3 9]; - [4 10 11]; - [5 11 12]; - [6 7 12] - ] - - E2C_offset_provider = Connectivity(edge_to_cell_table, Cell, Edge, 2) - C2E_offset_provider = Connectivity(cell_to_edge_table, Edge, Cell, 3) - - offset_provider = Dict{String,Connectivity}( - "E2C" => E2C_offset_provider, - "C2E" => C2E_offset_provider, - "E2CDim" => E2C_offset_provider # TODO(lorenzovarese): this is required for the embedded backend (note: python already uses E2C) - ) - - return offset_provider + +""" + julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + a = rand(Float64, ARRAY_SIZE) + b = rand(Float64, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size end -SUITE = BenchmarkGroup() +""" + broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) -# Legacy Suite with first tests -# SUITE["arith_broadcast"] = BenchmarkGroup() +Core operation for the Julia broadcast addition benchmark. -# a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) -# af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) -# SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c -# SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf +# Arguments +- `a, b`: Two arrays to be added. -SUITE["field_operator"] = BenchmarkGroup() +# Returns +- The result of element-wise addition of `a` and `b`. 
+""" +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) + return a .+ b +end -# Benchmark for field operator addition -function benchmark_fo_addition() - a = Field(Cell, collect(1.0:15.0)) - b = Field(Cell, collect(-1.0:-1:-15.0)) - out = Field(Cell, zeros(Float64, 15)) +""" + broadcast_addition(a::Field, b::Field) - @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} - return a .+ b - end +Core operation for the broadcast addition of two Field benchmark. +Useful to asses and track possible overhead on fields. - @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) #setup=( - # a = Field(Cell, collect(1.0:15.0)); - # b = Field(Cell, collect(-1.0:-1:-15.0)); - # out_field = Field(Cell, zeros(Float64, 15)); - # @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} return a .+ b end; - # ) +# Arguments +- `a, b`: Two field to be added. + +# Returns +- The result of element-wise addition of the data of the fields `a` and `b`. +""" +function broadcast_addition_fields(a::Field, b::Field) + return a .+ b end -SUITE["field_operator"]["addition"] = benchmark_fo_addition() +""" + fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) -# Benchmark for neighbor sum -function benchmark_fo_neighbor_sum() - offset_provider = setup_simple_connectivity(); - a = Field(Cell, collect(5.0:17.0) * 3); - E2C = FieldOffset("E2C", source=Cell, target=(Edge, E2CDim)) - out_field = Field(Edge, zeros(Float64, 12)) +Setup function for the field operator broadcast addition benchmark. - @field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} - return neighbor_sum(a(E2C), axis=E2CDim) - end +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. - @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, out=$out_field) +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out end -SUITE["field_operator"]["neighbor_sum"] = benchmark_fo_neighbor_sum() +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. 
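+
+# Example
+With hypothetical sizes (`a`, `b`, and `out` of 80 MB, i.e. 8.0e7 bytes, each) and a median time of 10 ms, the computed value is (3 * 8.0e7) / 0.01 / 1e9 = 24 GB/s.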
+""" +function compute_memory_bandwidth_addition(results, a, b, out) + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +# Create the benchmark suite +suite = BenchmarkGroup() + +# Define the main groups +suite["addition"] = BenchmarkGroup() + +# Julia broadcast addition benchmark +a, b, data_size = julia_broadcast_addition_setup(STREAM_SIZE) +suite["addition"]["array_broadcast_addition"] = @benchmarkable $broadcast_addition_array($a, $b) + +# Field broadcast addition benchmark +a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +suite["addition"]["fields_broadcast_addition"] = @benchmarkable $broadcast_addition_fields($a, $b) + +# Field Operator broadcast addition benchmark +a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) + +# Run the benchmark suite +results = run(suite) + +# Process the results +array_results = results["addition"]["array_broadcast_addition"] +fields_results = results["addition"]["fields_broadcast_addition"] +fo_results = results["addition"]["field_op_broadcast_addition"] + +# Process and print the results +array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size a +fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) # Out is a temporary array with size a +fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) -run(SUITE, verbose = true, seconds = 1) +println("Array broadcast addition bandwidth: $array_bandwidth GB/s") +println("Fields data broadcast addition bandwidth: $fields_bandwidth GB/s") +println("Field Operator broadcast addition bandwidth: $fo_bandwidth GB/s") diff --git a/benchmark/benchmarks_old.jl b/benchmark/benchmarks_old.jl new file mode 100644 index 0000000..0bb429f --- /dev/null +++ b/benchmark/benchmarks_old.jl @@ -0,0 +1,103 @@ +using Pkg +path_to_package = joinpath(@__DIR__, "..") # Assuming the benchmarks.jl file is in the "benchmark" directory +push!(LOAD_PATH, path_to_package) +using BenchmarkTools +using GridTools + +# Mesh definitions ------------------------------------------------------------------------------------------- +# const global Cell_ = Dimension{:Cell_, HORIZONTAL} +# const global K_ = Dimension{:K_, HORIZONTAL} +# const global Cell = Cell_() +# const global K = K_() +# const global Edge_ = Dimension{:Edge_, HORIZONTAL} +# const global Edge = Edge_() +# const global E2CDim_ = Dimension{:E2CDim_, LOCAL} +# const global E2CDim = E2CDim_() + + +# function setup_simple_connectivity()::Dict{String,Connectivity} +# edge_to_cell_table = [ +# [1 -1]; +# [3 -1]; +# [3 -1]; +# [4 -1]; +# [5 -1]; +# [6 -1]; +# [1 6]; +# [1 2]; +# [2 3]; +# [2 4]; +# [4 5]; +# [5 6] +# ] + +# cell_to_edge_table = [ +# [1 7 8]; +# [8 9 10]; +# [2 3 9]; +# [4 10 11]; +# [5 11 12]; +# [6 7 12] +# ] + +# E2C_offset_provider = Connectivity(edge_to_cell_table, Cell, Edge, 2) +# C2E_offset_provider = Connectivity(cell_to_edge_table, Edge, Cell, 3) + +# offset_provider = Dict{String,Connectivity}( +# "E2C" => E2C_offset_provider, +# "C2E" => C2E_offset_provider, +# "E2CDim" => E2C_offset_provider # TODO(lorenzovarese): this is required for the embedded backend (note: python already uses E2C) +# ) + +# 
return offset_provider +# end + +SUITE = BenchmarkGroup() + +# Legacy Suite with first tests +SUITE["arith_broadcast"] = BenchmarkGroup() + +a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) +af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) +SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c +SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf + +# SUITE["field_operator"] = BenchmarkGroup() + +# # Benchmark for field operator addition +# function benchmark_fo_addition() +# a = Field(Cell, collect(1.0:15.0)) +# b = Field(Cell, collect(-1.0:-1:-15.0)) +# out = Field(Cell, zeros(Float64, 15)) + +# @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} +# return a .+ b +# end + +# @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) #setup=( +# # a = Field(Cell, collect(1.0:15.0)); +# # b = Field(Cell, collect(-1.0:-1:-15.0)); +# # out_field = Field(Cell, zeros(Float64, 15)); +# # @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} return a .+ b end; +# # ) +# end + +# SUITE["field_operator"]["addition"] = benchmark_fo_addition() + +# # Benchmark for neighbor sum +# function benchmark_fo_neighbor_sum() +# offset_provider = setup_simple_connectivity(); +# a = Field(Cell, collect(5.0:17.0) * 3); +# E2C = FieldOffset("E2C", source=Cell, target=(Edge, E2CDim)) +# out_field = Field(Edge, zeros(Float64, 12)) + +# @field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} +# return neighbor_sum(a(E2C), axis=E2CDim) +# end + +# @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, out=$out_field) +# end + +# SUITE["field_operator"]["neighbor_sum"] = benchmark_fo_neighbor_sum() + +run(SUITE, verbose = true, seconds = 1) diff --git a/benchmark/benchsuite_fo.jl b/benchmark/benchsuite_fo.jl deleted file mode 100644 index a208bc1..0000000 --- a/benchmark/benchsuite_fo.jl +++ /dev/null @@ -1,124 +0,0 @@ -using BenchmarkTools -using Statistics -using GridTools - -# Data size -const global STREAM_SIZE = 10000000 # 10 million - -# Mesh definitions -const global Cell_ = Dimension{:Cell_, HORIZONTAL} -const global Cell = Cell_() - -""" - julia_broadcast_addition_setup(ARRAY_SIZE::Int64) - -Setup function for the Julia broadcast addition benchmark. - -# Arguments -- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. - -# Returns -- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. -- `data_size`: The total size of the data processed. -""" -function julia_broadcast_addition_setup(ARRAY_SIZE::Int64) - a = rand(Int, ARRAY_SIZE) - b = rand(Int, ARRAY_SIZE) - data_size = sizeof(a) + sizeof(b) # Total bytes processed - return a, b, data_size -end - -""" - julia_broadcast_addition_operation(a, b) - -Core operation for the Julia broadcast addition benchmark. - -# Arguments -- `a, b`: Two arrays to be added. - -# Returns -- The result of element-wise addition of `a` and `b`. -""" -function julia_broadcast_addition_operation(a, b) - return a .+ b -end - -""" - fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) - -Setup function for the field operator broadcast addition benchmark. - -# Arguments -- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. - -# Returns -- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. 
-- `out`: An output field similar to `a`. -""" -function fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) - a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) - b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) - out = GridTools.similar_field(a) - return a, b, out -end - -""" - fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} - -Core operation for the field operator broadcast addition benchmark. - -# Arguments -- `a, b`: Two fields to be added. - -# Returns -- The result of element-wise addition of `a` and `b`. -""" -@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} - return a .+ b -end - -""" - compute_memory_bandwidth_addition(results, a, b, out) - -Function to compute the memory bandwidth for the addition benchmarks. - -# Arguments -- `results`: Benchmark results. -- `a, b`: The input arrays/fields used in the benchmark. -- `out`: The output array/field of the benchmark. - -# Returns -- The computed memory bandwidth in GB/s. -""" -function compute_memory_bandwidth_addition(results, a, b, out) - @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) - data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out - time_in_seconds = median(results.times) / 1e9 # Convert ns to s - bandwidth = data_size / time_in_seconds / 1e9 # GB/s - return bandwidth -end - -# Create the benchmark suite -suite = BenchmarkGroup() - -# Julia broadcast addition benchmark -a, b, data_size = julia_broadcast_addition_setup(STREAM_SIZE) -suite["julia_broadcast_addition"] = @benchmarkable $julia_broadcast_addition_operation($a, $b) - -# FO broadcast addition benchmark -a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) -suite["fo_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) - -# Run the benchmark suite -results = run(suite) - -# Process the results -julia_results = results["julia_broadcast_addition"] -fo_results = results["fo_broadcast_addition"] - -# Process and print the results -julia_bandwidth = compute_memory_bandwidth_addition(julia_results, a, b, a) # TODO: improve out -fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) - -println("Julia broadcast addition bandwidth: $julia_bandwidth GB/s") -println("FO broadcast addition bandwidth: $fo_bandwidth GB/s") From b9d4e2e52d0721e2114e6f3b3149b4a65b4620ea Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:36:11 +0200 Subject: [PATCH 15/53] Improve naming and type checking --- benchmark/benchmarks.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 2fcae28..f381dc3 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -10,7 +10,7 @@ const global Cell_ = Dimension{:Cell_, HORIZONTAL} const global Cell = Cell_() """ - julia_broadcast_addition_setup(ARRAY_SIZE::Int64) + array_broadcast_addition_setup(ARRAY_SIZE::Int64) Setup function for the Julia broadcast addition benchmark. @@ -21,7 +21,7 @@ Setup function for the Julia broadcast addition benchmark. - `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. - `data_size`: The total size of the data processed. 
""" -function julia_broadcast_addition_setup(ARRAY_SIZE::Int64) +function array_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{Array{Float64,1}, Array{Float64,1}, Int64} a = rand(Float64, ARRAY_SIZE) b = rand(Float64, ARRAY_SIZE) data_size = sizeof(a) + sizeof(b) # Total bytes processed @@ -39,7 +39,7 @@ Core operation for the Julia broadcast addition benchmark. # Returns - The result of element-wise addition of `a` and `b`. """ -function broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64})::Array{Float64,1} return a .+ b end @@ -55,12 +55,12 @@ Useful to asses and track possible overhead on fields. # Returns - The result of element-wise addition of the data of the fields `a` and `b`. """ -function broadcast_addition_fields(a::Field, b::Field) +function broadcast_addition_fields(a::Field, b::Field)::Field return a .+ b end """ - fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) Setup function for the field operator broadcast addition benchmark. @@ -71,7 +71,7 @@ Setup function for the field operator broadcast addition benchmark. - `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. - `out`: An output field similar to `a`. """ -function fo_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) +function fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) out = GridTools.similar_field(a) @@ -106,7 +106,7 @@ Function to compute the memory bandwidth for the addition benchmarks. # Returns - The computed memory bandwidth in GB/s. """ -function compute_memory_bandwidth_addition(results, a, b, out) +function compute_memory_bandwidth_addition(results, a, b, out)::Float64 @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out time_in_seconds = median(results.times) / 1e9 # Convert ns to s @@ -121,15 +121,15 @@ suite = BenchmarkGroup() suite["addition"] = BenchmarkGroup() # Julia broadcast addition benchmark -a, b, data_size = julia_broadcast_addition_setup(STREAM_SIZE) +a, b, data_size = array_broadcast_addition_setup(STREAM_SIZE) suite["addition"]["array_broadcast_addition"] = @benchmarkable $broadcast_addition_array($a, $b) # Field broadcast addition benchmark -a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) suite["addition"]["fields_broadcast_addition"] = @benchmarkable $broadcast_addition_fields($a, $b) # Field Operator broadcast addition benchmark -a, b, out = fo_broadcast_addition_setup(STREAM_SIZE) +a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) # Run the benchmark suite @@ -145,6 +145,6 @@ array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Ou fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) # Out is a temporary array with size a fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) -println("Array broadcast addition bandwidth: $array_bandwidth GB/s") -println("Fields data broadcast addition bandwidth: $fields_bandwidth GB/s") -println("Field Operator broadcast addition bandwidth: $fo_bandwidth GB/s") +println("Array broadcast addition 
bandwidth:\t\t$array_bandwidth GB/s") +println("Fields data broadcast addition bandwidth:\t$fields_bandwidth GB/s") +println("Field Operator broadcast addition bandwidth:\t$fo_bandwidth GB/s") From 2ab1421efe0c404c1f49f5dab3b4952bffd51bca Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 13 Aug 2024 11:14:07 +0200 Subject: [PATCH 16/53] Add draft of neighbour_sum benchmark --- benchmark/benchmarks_neighbour_sum.jl | 47 +++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 benchmark/benchmarks_neighbour_sum.jl diff --git a/benchmark/benchmarks_neighbour_sum.jl b/benchmark/benchmarks_neighbour_sum.jl new file mode 100644 index 0000000..3016d43 --- /dev/null +++ b/benchmark/benchmarks_neighbour_sum.jl @@ -0,0 +1,47 @@ + +using BenchmarkTools +using Statistics +using GridTools + +const N = 1_000_000 +const DIM_SIZE = sqrt(N) |> floor |> Int + +include("../test/mesh_definitions.jl") + +function create_large_connectivity(size::Int) + edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) + cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # Using the same for simplicity # TODO: to be removed + ) +end + +offset_provider = create_large_connectivity(DIM_SIZE) + +a = Field(Cell, collect(1.0:N)) +out_field = GridTools.similar_field(a) + +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + +# Benchmark the field operation +fo_benchmark = @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out_field) + +# Run the benchmark +results = run(fo_benchmark) + +# Memory bandwidth calculation +time_in_seconds = median(results.times) / 1e9 # convert ns to s +data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written +bandwidth = data_size / time_in_seconds / 1e9 # GB/s + +# Output results +println("Time taken: ", median(results.times) / 1e6, " ms") +println("Memory bandwidth: ", bandwidth, " GB/s") From 3062bdf7a23db8543c57aa1fc360f79862dec010 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:24:47 +0200 Subject: [PATCH 17/53] Add the benchmarks for sine and cosine field operators --- benchmark/benchmarks.jl | 141 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 3 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index f381dc3..65799c6 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -3,12 +3,30 @@ using Statistics using GridTools # Data size -const global STREAM_SIZE = 10000000 # 10 million +const global STREAM_SIZE = 10_000_000 # Mesh definitions const global Cell_ = Dimension{:Cell_, HORIZONTAL} const global Cell = Cell_() +""" + single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + +Setup function to create a field and a similar output field for benchmarking operations that require a single input field. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the field to be generated. + +# Returns +- `a`: A randomly generated field of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. 
+""" +function single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, out +end + """ array_broadcast_addition_setup(ARRAY_SIZE::Int64) @@ -93,6 +111,93 @@ Core operation for the field operator broadcast addition benchmark. return a .+ b end +""" + sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the sine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +function sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the cosine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +function cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the sine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +@field_operator function fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the cosine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +@field_operator function fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. + +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. 
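+
+For instance, with hypothetical numbers (one 80 MB input read and one 80 MB output written in a median time of 20 ms): 1.6e8 B / 0.02 s / 1e9 = 8 GB/s.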
+""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + """ compute_memory_bandwidth_addition(results, a, b, out) @@ -132,6 +237,22 @@ suite["addition"]["fields_broadcast_addition"] = @benchmarkable $broadcast_addit a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) +# Sine without field operator benchmark +a, out = single_field_setup(STREAM_SIZE) +suite["trigonometry"]["sin"] = @benchmarkable $sin_without_fo($a) + +# Field operator sine benchmark +a, out = single_field_setup(STREAM_SIZE) +suite["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embedded", out=$out) + +# Cosine without field operator benchmark +a, out = single_field_setup(STREAM_SIZE) +suite["trigonometry"]["cos"] = @benchmarkable $cos_without_fo($a) + +# Field operator cosine benchmark +a, out = single_field_setup(STREAM_SIZE) +suite["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) + # Run the benchmark suite results = run(suite) @@ -139,12 +260,26 @@ results = run(suite) array_results = results["addition"]["array_broadcast_addition"] fields_results = results["addition"]["fields_broadcast_addition"] fo_results = results["addition"]["field_op_broadcast_addition"] +sin_results = results["trigonometry"]["sin"] +fo_sin_results = results["trigonometry"]["field_op_sin"] +cos_results = results["trigonometry"]["cos"] +fo_cos_results = results["trigonometry"]["field_op_cos"] # Process and print the results -array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size a -fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) # Out is a temporary array with size a +array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a +fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) +sin_bandwidth = compute_memory_bandwidth_single(sin_results, a) +fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) +cos_bandwidth = compute_memory_bandwidth_single(cos_results, a) +fo_cos_bandwidth = compute_memory_bandwidth_single(fo_cos_results, a) + +# Print the results println("Array broadcast addition bandwidth:\t\t$array_bandwidth GB/s") println("Fields data broadcast addition bandwidth:\t$fields_bandwidth GB/s") println("Field Operator broadcast addition bandwidth:\t$fo_bandwidth GB/s") +println("Sine operation bandwidth (no field operator):\t$sin_bandwidth GB/s") +println("Field Operator sine bandwidth:\t$fo_sin_bandwidth GB/s") +println("Cosine operation bandwidth (no field operator):\t$cos_bandwidth GB/s") +println("Field Operator cosine bandwidth:\t$fo_cos_bandwidth GB/s") From 323c269839cec847734a44c1f4190605193c32ea Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:04:15 +0200 Subject: [PATCH 18/53] Add benchmarks for remapping --- benchmark/benchmarks_remapping.jl | 71 +++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 benchmark/benchmarks_remapping.jl diff 
--git a/benchmark/benchmarks_remapping.jl b/benchmark/benchmarks_remapping.jl new file mode 100644 index 0000000..66470a6 --- /dev/null +++ b/benchmark/benchmarks_remapping.jl @@ -0,0 +1,71 @@ +using BenchmarkTools +using Statistics +using GridTools + +const N = 10_000_000 |> floor |> Int # Adjust as needed (10 millions is the SLURM test size) + +include("../test/mesh_definitions.jl") # Ensure all necessary mesh and dimension definitions are loaded + +# Unstructured Mesh ------------------------------------------------------------------------------------------ + +function create_large_connectivity(size::Int) + edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) + cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +offset_provider = create_large_connectivity(N) + +a = Field(Cell, collect(1.0:N)) +out_field = GridTools.similar_field(a) + +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) +end + +# Benchmark the field remapping operation +remapping_benchmark = @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out_field) + +# Run the benchmark +results = run(remapping_benchmark) + +# Memory bandwidth calculation +unstr_time_in_seconds = median(results.times) / 1e9 # convert ns to s +unstr_data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written +unstr_bandwidth = unstr_data_size / unstr_time_in_seconds / 1e9 # GB/s + +# Output results +println("Time taken: ", median(results.times) / 1e6, " ms") +println("Memory bandwidth for Unstructured Mesh Remapping: ", unstr_bandwidth, " GB/s") + +# Cartesian Mesh --------------------------------------------------------------------------------------------- + +# Cartesian Offset Field Operator +@field_operator function fo_cartesian_offset(inp::Field{Tuple{K_},Float64})::Field{Tuple{K_},Float64} + return inp(Koff[1]) +end + +# Create and benchmark the Cartesian offset operation +a = Field(K, collect(1.0:N)) +out_field = Field(K, zeros(Float64, N-1)) +cartesian_offset_provider = Dict("Koff" => K) + +cartesian_benchmark = @benchmarkable $fo_cartesian_offset($a, backend="embedded", out=$out_field, offset_provider=$cartesian_offset_provider) +cartesian_results = run(cartesian_benchmark) + +# Memory bandwidth calculation +cartesian_time_in_seconds = median(cartesian_results.times) / 1e9 # convert ns to s +cartesian_data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written +cartesian_bandwidth = cartesian_data_size / cartesian_time_in_seconds / 1e9 # GB/s + +# Output results +println("Time taken for Cartesian Mesh Offset: ", median(cartesian_results.times) / 1e6, " ms") +println("Memory bandwidth for Cartesian Mesh Offset: ", cartesian_bandwidth, " GB/s") From 9c138c2216c481c3e73d8a631fac559d4cb1d82e Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 14 Aug 2024 15:30:13 +0200 Subject: [PATCH 19/53] Add draft mpdata --- benchmark/benchmark_mpdata.jl | 94 +++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 benchmark/benchmark_mpdata.jl diff --git a/benchmark/benchmark_mpdata.jl b/benchmark/benchmark_mpdata.jl new file mode 100644 index 0000000..34a1e49 --- /dev/null +++ 
b/benchmark/benchmark_mpdata.jl @@ -0,0 +1,94 @@ +# benchmark_mpdata.jl - Benchmarking for atlas advection code + +using BenchmarkTools +using GridTools # Assuming all necessary functionality like Field, Dimension are defined here +using Statistics +using Printf + +Cell_ = Dimension{:Cell_, HORIZONTAL} +Edge_ = Dimension{:Edge_, HORIZONTAL} +Vertex_ = Dimension{:Vertex_, HORIZONTAL} +K_ = Dimension{:K_, VERTICAL} +V2VDim_ = Dimension{:V2V_, LOCAL} +V2EDim_ = Dimension{:V2E_, LOCAL} +E2VDim_ = Dimension{:E2V_, LOCAL} +Cell = Cell_() +K = K_() +Edge = Edge_() +Vertex = Vertex_() +V2VDim = V2VDim_() +V2EDim = V2EDim_() +E2VDim = E2VDim_() + +V2V = FieldOffset("V2V", source = Vertex, target = (Vertex, V2VDim)) +E2V = FieldOffset("E2V", source = Vertex, target = (Edge, E2VDim)) +V2E = FieldOffset("V2E", source = Edge, target = (Vertex, V2EDim)) +Koff = FieldOffset("Koff", source = K, target = K) + +include("../src/atlas/atlas_mesh.jl") +include("../src/atlas/state_container.jl") +include("../src/atlas/metric.jl") +include("../src/atlas/advection.jl") + +# Function to set up and run the benchmark +function benchmark_mpdata() + # Set up the environment or load data + grid = atlas.StructuredGrid("O50") + mesh = AtlasMesh(grid, num_level = 30) + + # Define dimensions based on the mesh properties + vertex_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[Vertex]) + k_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[K]) + edge_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[Edge]) + + # Set parameters + δt = 1800.0 # time step in s + eps = 1.0e-8 + niter = 50 # Adjust based on how long you want the benchmark to run + + # Initialize fields and metrics + state = sc_from_mesh(mesh) + state_next = sc_from_mesh(mesh) + tmp_fields = Dict{String, Field}() + for i = 1:6 + tmp_fields[@sprintf("tmp_vertex_%d", i)] = Field((Vertex, K), zeros(vertex_dim, k_dim)) + end + for j = 1:3 + tmp_fields[@sprintf("tmp_edge_%d", j)] = Field((Edge, K), zeros(edge_dim, k_dim)) + end + + # Benchmark the mpdata_program + println("Starting the benchmark for mpdata_program...") + bench_result = @benchmark begin + mpdata_program( + state.rho, + δt, + eps, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + tmp_fields["tmp_vertex_1"], + tmp_fields["tmp_vertex_2"], + tmp_fields["tmp_vertex_3"], + tmp_fields["tmp_vertex_4"], + tmp_fields["tmp_vertex_5"], + tmp_fields["tmp_vertex_6"], + tmp_fields["tmp_edge_1"], + tmp_fields["tmp_edge_2"], + tmp_fields["tmp_edge_3"] + ) + end + + # Output benchmark results + println("Benchmark completed.") + display(bench_result) +end + +# Run the benchmark function +benchmark_mpdata() From 700a545f2bc481bc5d14cab254fec35c25e0a8a5 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 09:47:35 +0200 Subject: [PATCH 20/53] Clear benchmarks.jl and add remapping --- benchmark/benchmarks.jl | 169 ++++++++++++++++++++++++++++------------ 1 file changed, 121 insertions(+), 48 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 65799c6..18b7743 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -1,13 +1,78 @@ using BenchmarkTools using Statistics using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian # Data size const global STREAM_SIZE = 10_000_000 -# Mesh definitions -const global Cell_ = Dimension{:Cell_, 
HORIZONTAL} -const global Cell = Cell_() +# Utils ------------------------------------------------------------------------------------------------------ + +# Useful for the benchmark of the field remapping operation +function create_large_connectivity(size::Int) + edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) + cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. + +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. +""" +function compute_memory_bandwidth_addition(results, a, b, out)::Float64 + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +# Operations ------------------------------------------------------------------------------------------------- """ single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} @@ -172,52 +237,23 @@ Field operator that applies the cosine function element-wise to the data of a fi end """ - compute_memory_bandwidth_single(results, a, out)::Float64 + fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} -Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. +Field operator that performs remapping from cell-based data to edge-based data. -This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. 
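# A worked instance of the bandwidth formula described above, with assumed (not measured)
# numbers: one field of 10_000_000 Float64 values occupies 8.0e7 bytes, so reading it once
# and writing the result once moves 1.6e8 bytes; at a median time of 20 ms (2.0e-2 s) the
# estimate is 1.6e8 / 2.0e-2 / 1e9 = 8.0 GB/s.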
+This operator utilizes a connectivity table (`E2C`) to map the values from cells to edges, implying a transformation from the cell-centered field to an edge-centered field based on predefined relationships in the connectivity table. # Arguments -- `results`: The benchmark results object containing timing and other performance data. -- `a`: The input field used in the benchmark. -- `out`: The output field produced by the benchmark. +- `a`: Input field containing Float64 data structured around cells. # Returns -- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. - -# Calculation Details -- `data_size`: Sum of the sizes of the input and output data in bytes. -- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. -- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +- A new field where each element represents data remapped from cells to edges, structured as specified by the edge-to-cell connectivity. """ -function compute_memory_bandwidth_single(results, a, out=a)::Float64 - data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out - time_in_seconds = median(results.times) / 1e9 # Convert ns to s - bandwidth = data_size / time_in_seconds / 1e9 # GB/s - return bandwidth +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) end -""" - compute_memory_bandwidth_addition(results, a, b, out) - -Function to compute the memory bandwidth for the addition benchmarks. - -# Arguments -- `results`: Benchmark results. -- `a, b`: The input arrays/fields used in the benchmark. -- `out`: The output array/field of the benchmark. - -# Returns -- The computed memory bandwidth in GB/s. 
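# A minimal sketch of how one entry of such a suite is tuned and queried with the generic
# BenchmarkTools API (`demo` and `"axpy"` are placeholder names, not part of this package):
demo = BenchmarkGroup()
demo["axpy"] = @benchmarkable x .+ y setup=(x = rand(10^6); y = rand(10^6))
tune!(demo)                          # choose evals/samples per benchmark
trials = run(demo, verbose = false)
median(trials["axpy"].times)         # median time in ns, the statistic fed to the bandwidth helpers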
-""" -function compute_memory_bandwidth_addition(results, a, b, out)::Float64 - @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) - data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out - time_in_seconds = median(results.times) / 1e9 # Convert ns to s - bandwidth = data_size / time_in_seconds / 1e9 # GB/s - return bandwidth -end +# Benchmark -------------------------------------------------------------------------------------------------- # Create the benchmark suite suite = BenchmarkGroup() @@ -253,6 +289,12 @@ suite["trigonometry"]["cos"] = @benchmarkable $cos_without_fo($a) a, out = single_field_setup(STREAM_SIZE) suite["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) +# Benchmark the field remapping operation +offset_provider = create_large_connectivity(STREAM_SIZE) +a, out = single_field_setup(STREAM_SIZE) +suite["remapping"]["field_operator"] = + @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out) + # Run the benchmark suite results = run(suite) @@ -264,6 +306,7 @@ sin_results = results["trigonometry"]["sin"] fo_sin_results = results["trigonometry"]["field_op_sin"] cos_results = results["trigonometry"]["cos"] fo_cos_results = results["trigonometry"]["field_op_cos"] +remapping_results = results["remapping"]["field_operator"] # Process and print the results array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a @@ -275,11 +318,41 @@ fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) cos_bandwidth = compute_memory_bandwidth_single(cos_results, a) fo_cos_bandwidth = compute_memory_bandwidth_single(fo_cos_results, a) -# Print the results -println("Array broadcast addition bandwidth:\t\t$array_bandwidth GB/s") -println("Fields data broadcast addition bandwidth:\t$fields_bandwidth GB/s") -println("Field Operator broadcast addition bandwidth:\t$fo_bandwidth GB/s") -println("Sine operation bandwidth (no field operator):\t$sin_bandwidth GB/s") -println("Field Operator sine bandwidth:\t$fo_sin_bandwidth GB/s") -println("Cosine operation bandwidth (no field operator):\t$cos_bandwidth GB/s") -println("Field Operator cosine bandwidth:\t$fo_cos_bandwidth GB/s") +remapping_bandwidth = compute_memory_bandwidth_single(remapping_results, a) + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +# Process and print the results along with the time taken for each +println("Array broadcast addition:") +println("\tBandwidth: $array_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(array_results.times))) ms\n") + +println("Fields data broadcast addition:") +println("\tBandwidth: $fields_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fields_results.times))) ms\n") + +println("Field Operator broadcast addition:") +println("\tBandwidth: $fo_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_results.times))) ms\n") + +println("Sine operation (no field operator):") +println("\tBandwidth: $sin_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(sin_results.times))) ms\n") + +println("Field Operator sine operation:") +println("\tBandwidth: $fo_sin_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_sin_results.times))) ms\n") + +println("Cosine operation (no field operator):") +println("\tBandwidth: $cos_bandwidth GB/s") +println("\tTime taken: 
$(ns_to_ms(median(cos_results.times))) ms\n") + +println("Field Operator cosine operation:") +println("\tBandwidth: $fo_cos_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_cos_results.times))) ms\n") + +println("Field Operator Remapping:") +println("\tBandwidth: $remapping_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") + From 35902807cb788be4de1e691ce7a38dbc0ac83a8a Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:00:30 +0200 Subject: [PATCH 21/53] Add neighbor sum benchmark to the suite --- benchmark/benchmarks.jl | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 18b7743..f5ff83e 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -253,6 +253,23 @@ This operator utilizes a connectivity table (`E2C`) to map the values from cells return a(E2C[1]) end +""" + fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that computes the sum of neighboring cell values for each edge. This function leverages the connectivity table (`E2C`), which defines the relationship between edges and cells, to sum the values of cells that are connected to each edge. + +The summation is performed across the dimension specified by `E2CDim`, ensuring that each edge aggregates values from its associated cells correctly. + +# Arguments +- `a`: Input field containing Float64 data, where each cell contains a numerical value. + +# Returns +- A new field where each edge holds the summed value of its neighboring cells, based on the edge-to-cell connectivity defined in `E2C`. +""" +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + # Benchmark -------------------------------------------------------------------------------------------------- # Create the benchmark suite @@ -295,6 +312,12 @@ a, out = single_field_setup(STREAM_SIZE) suite["remapping"]["field_operator"] = @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out) +# Benchmark the field neighbor sum operation +offset_provider = create_large_connectivity(STREAM_SIZE) +a, out = single_field_setup(STREAM_SIZE) +suite["neighbor_sum"]["field_operator"] = + @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out) + # Run the benchmark suite results = run(suite) @@ -307,6 +330,7 @@ fo_sin_results = results["trigonometry"]["field_op_sin"] cos_results = results["trigonometry"]["cos"] fo_cos_results = results["trigonometry"]["field_op_cos"] remapping_results = results["remapping"]["field_operator"] +neighbor_sum_results = results["neighbor_sum"]["field_operator"] # Process and print the results array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a @@ -318,8 +342,6 @@ fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) cos_bandwidth = compute_memory_bandwidth_single(cos_results, a) fo_cos_bandwidth = compute_memory_bandwidth_single(fo_cos_results, a) -remapping_bandwidth = compute_memory_bandwidth_single(remapping_results, a) - # Function to convert nanoseconds to milliseconds for clearer output ns_to_ms(time_ns) = time_ns / 1e6 @@ -353,6 +375,7 @@ println("\tBandwidth: $fo_cos_bandwidth GB/s") 
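# A small worked example of the two reductions benchmarked above, under the assumed
# semantics spelled out in the docstrings (one E2C row per edge, listing its two cells):
#
#   a (values per cell)  = [10.0, 20.0, 30.0]
#   E2C row for edge 1   = (1, 3)
#   a(E2C[1])                          at edge 1  ->  a[1]        = 10.0  (first neighbour only)
#   neighbor_sum(a(E2C), axis=E2CDim)  at edge 1  ->  a[1] + a[3] = 40.0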
println("\tTime taken: $(ns_to_ms(median(fo_cos_results.times))) ms\n") println("Field Operator Remapping:") -println("\tBandwidth: $remapping_bandwidth GB/s") println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") +println("Field Operator Neighbor Sum:") +println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") From 0be7ec1218719fe814dbe1d9290d8db46237fb2a Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:00:53 +0200 Subject: [PATCH 22/53] Fix dependencies in benchmarks --- benchmark/benchmarks_neighbour_sum.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/benchmarks_neighbour_sum.jl b/benchmark/benchmarks_neighbour_sum.jl index 3016d43..a374f40 100644 --- a/benchmark/benchmarks_neighbour_sum.jl +++ b/benchmark/benchmarks_neighbour_sum.jl @@ -2,12 +2,11 @@ using BenchmarkTools using Statistics using GridTools +using GridTools.ExampleMeshes.Unstructured const N = 1_000_000 const DIM_SIZE = sqrt(N) |> floor |> Int -include("../test/mesh_definitions.jl") - function create_large_connectivity(size::Int) edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) From 62276b057b0f0ff38897b88c3825a5e226bb5f29 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:20:13 +0200 Subject: [PATCH 23/53] Add verbose flag to avoid printing --- advection/advection_miniapp.jl | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/advection/advection_miniapp.jl b/advection/advection_miniapp.jl index b4230a9..9baf6a5 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_miniapp.jl @@ -8,6 +8,7 @@ using Profile using GridTools const global VISUALIZATION_FLAG::Bool=false +const global VERBOSE_FLAG::Bool=true # Mesh Definitions -------------------------------------------------------------------------------------------- # Define dimensions for the mesh @@ -215,7 +216,9 @@ for i = 1:niter ) # Print the current timestep - println("Timestep $i") + if VERBOSE_FLAG + println("Timestep $i") + end if VISUALIZATION_FLAG # Print the current state as ASCII art every 5 timesteps @@ -232,9 +235,11 @@ for i = 1:niter update_periodic_layers(mesh, state.rho) end -# Output the final statistics for the scalar field (rho) and velocity fields -println( - "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" -) -println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") -println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") +if VERBOSE_FLAG + # Output the final statistics for the scalar field (rho) and velocity fields + println( + "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" + ) + println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") + println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") +end From 1bbb1e64d8d88aa374d91dc1418c35bc9a8449fb Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:56:53 +0200 Subject: [PATCH 24/53] Quick fix to the unary/binary operation support --- advection/advection.jl | 12 ++---------- src/gt2py/jast_to_foast.jl | 1 + 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/advection/advection.jl 
b/advection/advection.jl index 19f7741..35cbece 100644 --- a/advection/advection.jl +++ b/advection/advection.jl @@ -149,7 +149,8 @@ end )::Field{Tuple{Vertex_, K_}, Float64} zrhin = (1.0 ./ vol) .* neighbor_sum( - -min.(0.0, flux(V2E)) .* max.(0.0, dual_face_orientation) - + # TODO: fix the 0-min workaround due to the binary/unary operation issue + (broadcast(0., (Vertex, V2EDim, K)) .- min.(0.0, flux(V2E))) .* max.(0.0, dual_face_orientation) - max.(0.0, flux(V2E)) .* min.(0.0, dual_face_orientation), axis = V2EDim, ) @@ -227,15 +228,6 @@ end dual_face_orientation::Field{Tuple{Vertex_, V2EDim_}, Float64}, dual_face_normal_weighted_x::Field{Tuple{Edge_}, Float64}, dual_face_normal_weighted_y::Field{Tuple{Edge_}, Float64}, - tmp_vertex_1::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_2::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_3::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_4::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_5::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_6::Field{Tuple{Vertex_, K_}, Float64}, - tmp_edge_1::Field{Tuple{Edge_, K_}, Float64}, - tmp_edge_2::Field{Tuple{Edge_, K_}, Float64}, - tmp_edge_3::Field{Tuple{Edge_, K_}, Float64}, ) tmp_edge_1 = advector_normal( diff --git a/src/gt2py/jast_to_foast.jl b/src/gt2py/jast_to_foast.jl index c843059..f0663c7 100644 --- a/src/gt2py/jast_to_foast.jl +++ b/src/gt2py/jast_to_foast.jl @@ -266,6 +266,7 @@ end function visit_(sym::Val{:call}, args::Array, outer_loc) if args[1] in bin_op + # TODO: check the case where a unary expression, that is at the same time binary operation is encountered: i.e. -x @assert length(args)==3 "Expected a binary operation. AST must be canonicalized using `canonicalize_arithmetic_ops` first." return foast.BinOp( op = visit(args[1]), From 17a55f8af2b655a526ba0127142df8a6035860f2 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:57:42 +0200 Subject: [PATCH 25/53] Use the ExampleMeshes in Atlas miniapp (with workaround on the offset_provider) --- advection/advection_miniapp.jl | 28 ++-------------------------- src/atlas/atlas_mesh.jl | 6 +++++- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/advection/advection_miniapp.jl b/advection/advection_miniapp.jl index 9baf6a5..0073430 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_miniapp.jl @@ -6,35 +6,11 @@ using Debugger using Statistics using Profile using GridTools +using GridTools.ExampleMeshes.Unstructured const global VISUALIZATION_FLAG::Bool=false const global VERBOSE_FLAG::Bool=true -# Mesh Definitions -------------------------------------------------------------------------------------------- -# Define dimensions for the mesh -Cell_ = Dimension{:Cell_, HORIZONTAL} -Edge_ = Dimension{:Edge_, HORIZONTAL} -Vertex_ = Dimension{:Vertex_, HORIZONTAL} -K_ = Dimension{:K_, VERTICAL} -V2VDim_ = Dimension{:V2V_, LOCAL} -V2EDim_ = Dimension{:V2E_, LOCAL} -E2VDim_ = Dimension{:E2V_, LOCAL} - -# Instantiate dimension objects -Cell = Cell_() -K = K_() -Edge = Edge_() -Vertex = Vertex_() -V2VDim = V2VDim_() -V2EDim = V2EDim_() -E2VDim = E2VDim_() - -# Define field offsets to describe the relationships between different dimensions -V2V = FieldOffset("V2V", source = Vertex, target = (Vertex, V2VDim)) -E2V = FieldOffset("E2V", source = Vertex, target = (Edge, E2VDim)) -V2E = FieldOffset("V2E", source = Edge, target = (Vertex, V2EDim)) -Koff = FieldOffset("Koff", source = K, target = K) - # Include additional necessary files for mesh, state 
container, metric calculations, and advection operations include("../src/atlas/atlas_mesh.jl") include("state_container.jl") @@ -50,7 +26,7 @@ mesh = AtlasMesh(grid, num_level = 30) # Simulation Parameters --------------------------------------------------------------------------------------- δt = 1800.0 # time step in s niter = 50 -ε = 1.0e-8 +ϵ = 1.0e-8 # Calculate metric properties from the mesh metric = m_from_mesh(mesh) diff --git a/src/atlas/atlas_mesh.jl b/src/atlas/atlas_mesh.jl index be45be3..d8f947b 100644 --- a/src/atlas/atlas_mesh.jl +++ b/src/atlas/atlas_mesh.jl @@ -260,7 +260,11 @@ struct AtlasMesh "V2V" => v2v, "V2E" => v2e, "E2V" => e2v, - "Koff" => K + "Koff" => K, + # TODO: cleanup + "V2VDim" => v2v, + "V2EDim" => v2e, + "E2VDim" => e2v, ) remote_indices = Dict{Dimension, Array}( From fcb44cd09e06d02b219db3dda9e480ab6cf9652f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:58:11 +0200 Subject: [PATCH 26/53] Add benchmark for mp_data --- benchmark/benchmarks.jl | 59 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index f5ff83e..2ebf5f0 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -270,7 +270,7 @@ The summation is performed across the dimension specified by `E2CDim`, ensuring return neighbor_sum(a(E2C), axis=E2CDim) end -# Benchmark -------------------------------------------------------------------------------------------------- +# Benchmarks ------------------------------------------------------------------------------------------------- # Create the benchmark suite suite = BenchmarkGroup() @@ -379,3 +379,60 @@ println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") println("Field Operator Neighbor Sum:") println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") + +# Advection Benchmarks + +include("../advection/advection_miniapp.jl") + +println("Starting julia embedded benchmark") + +suite["advection"]["mpdata_program_julia_embedded"] = @benchmark mpdata_program( + state.rho, + δt, + ϵ, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + +println("Finished Julia embedded benchmark") + +# TODO: disabled because the backend is not currently supporting it (the backend is too slow) +# println("Starting julia python benchmark") + +# suite["advection"]["mpdata_program_julia_pyback"] = @benchmark mpdata_program( +# state.rho, +# δt, +# ϵ, +# mesh.vol, +# metric.gac, +# state.vel[1], +# state.vel[2], +# state.vel[3], +# mesh.pole_edge_mask, +# mesh.dual_face_orientation, +# mesh.dual_face_normal_weighted_x, +# mesh.dual_face_normal_weighted_y, +# out = state_next.rho, +# offset_provider = mesh.offset_provider, +# backend = "py" +# ) + +# println("Finished Julia python backend benchmark") + +mpdata_emb_results = results["advection"]["mpdata_program_julia_embedded"] +# mpdata_pyback_results = results["advection"]["mpdata_program_julia_pyback"] + +println("mpdata_program julia embedded version:") +println("\tTime taken: $(ns_to_ms(median(mpdata_emb_results.times))) ms\n") + +# println("mpdata_program julia with python backend:") +# println("\tTime taken: $(ns_to_ms(median(mpdata_pyback_results.times))) ms\n") From 
22d1257051617c2376a95652b7a1d78f9e42341b Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:07:36 +0200 Subject: [PATCH 27/53] Fix slicing operation in the advection_miniapp --- advection/advection.jl | 5 ++-- advection/advection_miniapp.jl | 4 +-- benchmark/benchmarks.jl | 54 ++++++++++++++++++++-------------- src/GridTools.jl | 5 ++-- src/embedded/cust_broadcast.jl | 2 +- test/gt2py_fo_exec.jl | 19 ++++++++++++ 6 files changed, 59 insertions(+), 30 deletions(-) diff --git a/advection/advection.jl b/advection/advection.jl index 35cbece..159a2ef 100644 --- a/advection/advection.jl +++ b/advection/advection.jl @@ -6,11 +6,10 @@ level_indices::Field{Tuple{K_}, Int64}, num_level::Int64 )::Field{Tuple{Vertex_, K_}, Float64} - return where( - level_indices .== num_level - 1, + level_indices .== 0, lower, - where(slice(level_indices .== 0, 1:29), upper, interior) + where(slice(level_indices .== 29, 2:30), upper, interior) ) end diff --git a/advection/advection_miniapp.jl b/advection/advection_miniapp.jl index 0073430..cdc72e0 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_miniapp.jl @@ -9,7 +9,7 @@ using GridTools using GridTools.ExampleMeshes.Unstructured const global VISUALIZATION_FLAG::Bool=false -const global VERBOSE_FLAG::Bool=true +const global VERBOSE_FLAG::Bool=false # Include additional necessary files for mesh, state container, metric calculations, and advection operations include("../src/atlas/atlas_mesh.jl") @@ -20,7 +20,7 @@ include("visualization_utils.jl") # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation -grid = atlas.StructuredGrid("O50") +grid = atlas.StructuredGrid("O10") mesh = AtlasMesh(grid, num_level = 30) # Simulation Parameters --------------------------------------------------------------------------------------- diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 2ebf5f0..5e61886 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -5,14 +5,14 @@ using GridTools.ExampleMeshes.Unstructured using GridTools.ExampleMeshes.Cartesian # Data size -const global STREAM_SIZE = 10_000_000 +const global STREAM_SIZE = 100 # Utils ------------------------------------------------------------------------------------------------------ # Useful for the benchmark of the field remapping operation function create_large_connectivity(size::Int) - edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) - cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) + edge_to_cell_table = vcat([rand(1:size, (1, 2)) for _ in 1:size]...) + cell_to_edge_table = vcat([rand(1:size, (1, 3)) for _ in 1:size]...) E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) @@ -64,12 +64,12 @@ Function to compute the memory bandwidth for the addition benchmarks. # Returns - The computed memory bandwidth in GB/s. 
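# On the connectivity fix above: hcat of N length-2 vectors produces a 2×N matrix, while
# vcat of N 1×2 rows produces the intended N×2 table with one row per edge. A quick shape
# check (plain Base, illustrative sizes only):
#
#   julia> size(hcat([rand(1:5, 2) for _ in 1:4]...))
#   (2, 4)
#
#   julia> size(vcat([rand(1:5, (1, 2)) for _ in 1:4]...))
#   (4, 2)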
""" -function compute_memory_bandwidth_addition(results, a, b, out)::Float64 +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out time_in_seconds = median(results.times) / 1e9 # Convert ns to s bandwidth = data_size / time_in_seconds / 1e9 # GB/s - return bandwidth + return bandwidth, data_size end # Operations ------------------------------------------------------------------------------------------------- @@ -280,11 +280,11 @@ suite["addition"] = BenchmarkGroup() # Julia broadcast addition benchmark a, b, data_size = array_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["array_broadcast_addition"] = @benchmarkable $broadcast_addition_array($a, $b) +suite["addition"]["array_broadcast_addition"] = @benchmarkable broadcast_addition_array(a, b) setup=((a, b, data_size) = $array_broadcast_addition_setup($STREAM_SIZE); ) #a=$a; b=$b) # Field broadcast addition benchmark a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["fields_broadcast_addition"] = @benchmarkable $broadcast_addition_fields($a, $b) +suite["addition"]["fields_broadcast_addition"] = @benchmarkable broadcast_addition_fields($a, $b) # Field Operator broadcast addition benchmark a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) @@ -292,7 +292,7 @@ suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($ # Sine without field operator benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["sin"] = @benchmarkable $sin_without_fo($a) +suite["trigonometry"]["sin"] = @benchmarkable sin_without_fo($a) # Field operator sine benchmark a, out = single_field_setup(STREAM_SIZE) @@ -300,7 +300,7 @@ suite["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embe # Cosine without field operator benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["cos"] = @benchmarkable $cos_without_fo($a) +suite["trigonometry"]["cos"] = @benchmarkable cos_without_fo($a) # Field operator cosine benchmark a, out = single_field_setup(STREAM_SIZE) @@ -319,6 +319,7 @@ suite["neighbor_sum"]["field_operator"] = @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out) # Run the benchmark suite +println("Running the benchmark suite...") results = run(suite) # Process the results @@ -332,10 +333,10 @@ fo_cos_results = results["trigonometry"]["field_op_cos"] remapping_results = results["remapping"]["field_operator"] neighbor_sum_results = results["neighbor_sum"]["field_operator"] -# Process and print the results -array_bandwidth = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a -fields_bandwidth = compute_memory_bandwidth_addition(fields_results, a, b, a) -fo_bandwidth = compute_memory_bandwidth_addition(fo_results, a, b, out) +# Compute memory bandwidth +array_bandwidth, data_size_arr = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a +fields_bandwidth, data_size_fields = compute_memory_bandwidth_addition(fields_results, a, b, a) +fo_bandwidth, data_size_fo = compute_memory_bandwidth_addition(fo_results, a, b, out) sin_bandwidth = compute_memory_bandwidth_single(sin_results, a) fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) @@ -347,14 +348,17 @@ ns_to_ms(time_ns) = 
time_ns / 1e6 # Process and print the results along with the time taken for each println("Array broadcast addition:") +println("\tData size: $data_size_arr") println("\tBandwidth: $array_bandwidth GB/s") println("\tTime taken: $(ns_to_ms(median(array_results.times))) ms\n") println("Fields data broadcast addition:") +println("\tData size: $data_size_fields") println("\tBandwidth: $fields_bandwidth GB/s") println("\tTime taken: $(ns_to_ms(median(fields_results.times))) ms\n") println("Field Operator broadcast addition:") +println("\tData size: $data_size_fo") println("\tBandwidth: $fo_bandwidth GB/s") println("\tTime taken: $(ns_to_ms(median(fo_results.times))) ms\n") @@ -384,9 +388,11 @@ println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") include("../advection/advection_miniapp.jl") -println("Starting julia embedded benchmark") +avection_suite = BenchmarkGroup() + +println("Starting Advection Benchmark (julia embedded)") -suite["advection"]["mpdata_program_julia_embedded"] = @benchmark mpdata_program( +avection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmark $mpdata_program( state.rho, δt, ϵ, @@ -403,12 +409,12 @@ suite["advection"]["mpdata_program_julia_embedded"] = @benchmark mpdata_program( offset_provider = mesh.offset_provider ) -println("Finished Julia embedded benchmark") +println("Finished Advection Benchmark (julia embedded)") # TODO: disabled because the backend is not currently supporting it (the backend is too slow) -# println("Starting julia python benchmark") +# println("Starting Advection Benchmark (julia-python)") -# suite["advection"]["mpdata_program_julia_pyback"] = @benchmark mpdata_program( +# advection_suite["advection"]["mpdata_program_julia_pyback"] = @benchmark mpdata_program( # state.rho, # δt, # ϵ, @@ -426,13 +432,17 @@ println("Finished Julia embedded benchmark") # backend = "py" # ) -# println("Finished Julia python backend benchmark") +# println("Finished Advection Benchmark (julia-python)") + +# Run the benchmark suite +println("Running the advection suite...") +# advection_results = run(avection_suite) -mpdata_emb_results = results["advection"]["mpdata_program_julia_embedded"] +# mpdata_emb_results = advection_results["advection"]["mpdata_program_julia_embedded"] # mpdata_pyback_results = results["advection"]["mpdata_program_julia_pyback"] -println("mpdata_program julia embedded version:") -println("\tTime taken: $(ns_to_ms(median(mpdata_emb_results.times))) ms\n") +# println("mpdata_program julia embedded version:") +# println("\tTime taken: $(ns_to_ms(median(mpdata_emb_results.times))) ms\n") # println("mpdata_program julia with python backend:") # println("\tTime taken: $(ns_to_ms(median(mpdata_pyback_results.times))) ms\n") diff --git a/src/GridTools.jl b/src/GridTools.jl index 580be0a..083cbd7 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -475,6 +475,7 @@ Base.convert(t::Type{T}, F::Field) where {T <: Number} = inds::Vararg{Int, N} ) where {BD, T, N} new_inds = inds .- F.origin + # @assert Tuple(1 for i in 1:length(new_inds)) <= new_inds <= size(F.data) "Error: $new_inds, $(size(F.data)), $(F.origin)" return F.data[new_inds...] 
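    # The subtraction above makes indexing origin-aware: for the field produced by
    # slice(a, 2:4) in test_slice (data == [2, 3, 4], origin == (1,)), F[2] resolves to
    # F.data[2 - 1] == 2, i.e. the element at global index 2 of the parent field.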
end @propagate_inbounds function Base.setindex!( @@ -488,8 +489,9 @@ end Base.showarg(io::IO, @nospecialize(F::Field), toplevel) = print(io, eltype(F), " Field with dimensions ", get_dim_name.(F.broadcast_dims)) function slice(F::Field, inds...)::Field + @assert all(typeof(x) <: UnitRange{Int64} for x in inds) # TODO: understand why the line below is filtering the UnitRange only dim_ind = findall(x -> typeof(x) <: UnitRange{Int64}, inds) - return Field(F.dims[dim_ind], view(F.data, inds...), F.broadcast_dims) + return Field(F.dims[dim_ind], view(F.data, inds...), F.broadcast_dims, origin=Dict(d=>ind[1]-1 for (d,ind) in zip(F.dims, inds))) end # Connectivity struct ------------------------------------------------------------ @@ -561,7 +563,6 @@ function (fo::FieldOp)( out = nothing, kwargs... ) - is_outermost_fo = isnothing(OFFSET_PROVIDER) if is_outermost_fo @assert !isnothing(out) "Must provide an out field." diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 0b0ad16..66cb372 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -66,7 +66,7 @@ function get_size_ifelse(mask::FieldShape, branch::FieldShape) out_size = [branch.axes...] ind_mask = findall(x -> x in branch.dims, mask.dims) ind_out = findall(x -> x in mask.dims, branch.dims) - + # TODO: this is not correct if the mask has an origin out_size[ind_out] .= mask.axes[ind_mask] return FieldShape(branch.dims, Tuple(out_size), branch.broadcast_dims) diff --git a/test/gt2py_fo_exec.jl b/test/gt2py_fo_exec.jl index ec0014f..187eb23 100644 --- a/test/gt2py_fo_exec.jl +++ b/test/gt2py_fo_exec.jl @@ -564,6 +564,23 @@ function test_lap_lap(offset_provider::Dict{String, Dimension}, backend::String, # TODO: add in the future the test for the border values end +""" + test_slice() + +This test checks the `slice` function, which should correctly extract a subset of data from a larger field and properly adjust the origin to reflect the new sliced field's starting point. + +# Expected Behavior +- The sliced data should match the expected subset from the original field. +- The origin of the sliced field should be adjusted correctly to match the new starting index of the sliced data. 
+""" +function test_slice() + a::Field = Field((IDim,), [1; 2; 3; 4; 5]) + sliced_a = slice(a, 2:4) + @test sliced_a.data == [2; 3; 4] + @test sliced_a.origin == (2-1,) + @test sliced_a.dims == (IDim,) +end + # Test Executions -------------------------------------------------------------------------------------------- function test_gt4py_fo_exec() @@ -638,6 +655,8 @@ function test_gt4py_fo_exec() # testwrapper(setup_cartesian_offset_provider, test_lap_lap, "embedded", simple_cartesian_field) testwrapper(setup_cartesian_offset_provider, test_lap_lap, "py", simple_cartesian_field) + + testwrapper(nothing, test_slice) end @testset "Testset GT2Py fo exec" test_gt4py_fo_exec() From 30d92c1f1621817f45a6e1feea3c2b0676d7c95f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:11:32 +0200 Subject: [PATCH 28/53] Fix benchmarking size --- benchmark/benchmarks.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 5e61886..faa4468 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -5,7 +5,7 @@ using GridTools.ExampleMeshes.Unstructured using GridTools.ExampleMeshes.Cartesian # Data size -const global STREAM_SIZE = 100 +const global STREAM_SIZE = 10_000_000 # Utils ------------------------------------------------------------------------------------------------------ From 9928e143f20a15c0be8f26db49838fd9c1e1766e Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:22:03 +0200 Subject: [PATCH 29/53] Remove deprecated benchmark files --- benchmark/benchmarks_neighbour_sum.jl | 46 ----------------- benchmark/benchmarks_remapping.jl | 71 --------------------------- 2 files changed, 117 deletions(-) delete mode 100644 benchmark/benchmarks_neighbour_sum.jl delete mode 100644 benchmark/benchmarks_remapping.jl diff --git a/benchmark/benchmarks_neighbour_sum.jl b/benchmark/benchmarks_neighbour_sum.jl deleted file mode 100644 index a374f40..0000000 --- a/benchmark/benchmarks_neighbour_sum.jl +++ /dev/null @@ -1,46 +0,0 @@ - -using BenchmarkTools -using Statistics -using GridTools -using GridTools.ExampleMeshes.Unstructured - -const N = 1_000_000 -const DIM_SIZE = sqrt(N) |> floor |> Int - -function create_large_connectivity(size::Int) - edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) - cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) 
- - E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) - C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) - - Dict( - "E2C" => E2C, - "C2E" => C2E, - "E2CDim" => E2C # Using the same for simplicity # TODO: to be removed - ) -end - -offset_provider = create_large_connectivity(DIM_SIZE) - -a = Field(Cell, collect(1.0:N)) -out_field = GridTools.similar_field(a) - -@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} - return neighbor_sum(a(E2C), axis=E2CDim) -end - -# Benchmark the field operation -fo_benchmark = @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out_field) - -# Run the benchmark -results = run(fo_benchmark) - -# Memory bandwidth calculation -time_in_seconds = median(results.times) / 1e9 # convert ns to s -data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written -bandwidth = data_size / time_in_seconds / 1e9 # GB/s - -# Output results -println("Time taken: ", median(results.times) / 1e6, " ms") -println("Memory bandwidth: ", bandwidth, " GB/s") diff --git a/benchmark/benchmarks_remapping.jl b/benchmark/benchmarks_remapping.jl deleted file mode 100644 index 66470a6..0000000 --- a/benchmark/benchmarks_remapping.jl +++ /dev/null @@ -1,71 +0,0 @@ -using BenchmarkTools -using Statistics -using GridTools - -const N = 10_000_000 |> floor |> Int # Adjust as needed (10 millions is the SLURM test size) - -include("../test/mesh_definitions.jl") # Ensure all necessary mesh and dimension definitions are loaded - -# Unstructured Mesh ------------------------------------------------------------------------------------------ - -function create_large_connectivity(size::Int) - edge_to_cell_table = hcat([rand(1:size, 2) for _ in 1:size]...) - cell_to_edge_table = hcat([rand(1:size, 3) for _ in 1:size]...) 
- - E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) - C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) - - Dict( - "E2C" => E2C, - "C2E" => C2E, - "E2CDim" => E2C # TODO: remove it - ) -end - -offset_provider = create_large_connectivity(N) - -a = Field(Cell, collect(1.0:N)) -out_field = GridTools.similar_field(a) - -@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} - return a(E2C[1]) -end - -# Benchmark the field remapping operation -remapping_benchmark = @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out_field) - -# Run the benchmark -results = run(remapping_benchmark) - -# Memory bandwidth calculation -unstr_time_in_seconds = median(results.times) / 1e9 # convert ns to s -unstr_data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written -unstr_bandwidth = unstr_data_size / unstr_time_in_seconds / 1e9 # GB/s - -# Output results -println("Time taken: ", median(results.times) / 1e6, " ms") -println("Memory bandwidth for Unstructured Mesh Remapping: ", unstr_bandwidth, " GB/s") - -# Cartesian Mesh --------------------------------------------------------------------------------------------- - -# Cartesian Offset Field Operator -@field_operator function fo_cartesian_offset(inp::Field{Tuple{K_},Float64})::Field{Tuple{K_},Float64} - return inp(Koff[1]) -end - -# Create and benchmark the Cartesian offset operation -a = Field(K, collect(1.0:N)) -out_field = Field(K, zeros(Float64, N-1)) -cartesian_offset_provider = Dict("Koff" => K) - -cartesian_benchmark = @benchmarkable $fo_cartesian_offset($a, backend="embedded", out=$out_field, offset_provider=$cartesian_offset_provider) -cartesian_results = run(cartesian_benchmark) - -# Memory bandwidth calculation -cartesian_time_in_seconds = median(cartesian_results.times) / 1e9 # convert ns to s -cartesian_data_size = sizeof(a.data) + sizeof(out_field.data) # total bytes read and written -cartesian_bandwidth = cartesian_data_size / cartesian_time_in_seconds / 1e9 # GB/s - -# Output results -println("Time taken for Cartesian Mesh Offset: ", median(cartesian_results.times) / 1e6, " ms") -println("Memory bandwidth for Cartesian Mesh Offset: ", cartesian_bandwidth, " GB/s") From 177f8babbcae606614d96ff11fecb2af685a41d4 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:50:26 +0200 Subject: [PATCH 30/53] Fix advection benchmarks and place them in a separate script --- benchmark/benchmark_advection.jl | 104 +++++++++++++++++++++++++++++++ benchmark/benchmark_mpdata.jl | 94 ---------------------------- benchmark/benchmarks.jl | 63 ------------------- 3 files changed, 104 insertions(+), 157 deletions(-) create mode 100644 benchmark/benchmark_advection.jl delete mode 100644 benchmark/benchmark_mpdata.jl diff --git a/benchmark/benchmark_advection.jl b/benchmark/benchmark_advection.jl new file mode 100644 index 0000000..8432aea --- /dev/null +++ b/benchmark/benchmark_advection.jl @@ -0,0 +1,104 @@ +using BenchmarkTools +using Statistics +using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian + +include("../advection/advection_miniapp.jl") + +# Advection Benchmarks + +advection_suite = BenchmarkGroup() +advection_suite["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + 
mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + # embedded backend + ) + +# advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( +# state.rho, +# δt, +# mesh.vol, +# metric.gac, +# state.vel[1], +# state.vel[2], +# state.vel[3], +# mesh.pole_edge_mask, +# mesh.dual_face_orientation, +# mesh.dual_face_normal_weighted_x, +# mesh.dual_face_normal_weighted_y, +# out = state_next.rho, +# offset_provider = mesh.offset_provider, +# backend = "py" +# ) + +advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( + state.rho, + δt, + ϵ, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + +# TODO: disabled because the backend is not currently supporting it (the backend is too slow) +# advection_suite["advection"]["mpdata_program_python_backend"] = @benchmarkable mpdata_program( +# state.rho, +# δt, +# ϵ, +# mesh.vol, +# metric.gac, +# state.vel[1], +# state.vel[2], +# state.vel[3], +# mesh.pole_edge_mask, +# mesh.dual_face_orientation, +# mesh.dual_face_normal_weighted_x, +# mesh.dual_face_normal_weighted_y, +# out = state_next.rho, +# offset_provider = mesh.offset_provider, +# backend = "py" +# ) + +# Run the benchmark suite +println("Running the advection suite...") +advection_results = run(advection_suite) + +upwind_embedded_results = advection_results["advection"]["upwind_julia_embedded"] +# upwind_python_backend_results = results["advection"]["upwind_python_backend"] +mpdata_embedded_results = advection_results["advection"]["mpdata_program_julia_embedded"] +# mpdata_python_backend_results = results["advection"]["mpdata_program_python_backend"] + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +println("Upwind scheme julia (embedded):") +println("\tTime taken: $(ns_to_ms(median(upwind_embedded_results.times))) ms\n") + +# println("Upwind scheme julia (python backend):") +# println("\tTime taken: $(ns_to_ms(median(upwind_python_backend_results.times))) ms\n") + +println("Mpdata program julia (embedded):") +println("\tTime taken: $(ns_to_ms(median(mpdata_embedded_results.times))) ms\n") + +# println("Mpdata program julia (python backend):") +# println("\tTime taken: $(ns_to_ms(median(mpdata_python_backend_results.times))) ms\n") diff --git a/benchmark/benchmark_mpdata.jl b/benchmark/benchmark_mpdata.jl deleted file mode 100644 index 34a1e49..0000000 --- a/benchmark/benchmark_mpdata.jl +++ /dev/null @@ -1,94 +0,0 @@ -# benchmark_mpdata.jl - Benchmarking for atlas advection code - -using BenchmarkTools -using GridTools # Assuming all necessary functionality like Field, Dimension are defined here -using Statistics -using Printf - -Cell_ = Dimension{:Cell_, HORIZONTAL} -Edge_ = Dimension{:Edge_, HORIZONTAL} -Vertex_ = Dimension{:Vertex_, HORIZONTAL} -K_ = Dimension{:K_, VERTICAL} -V2VDim_ = Dimension{:V2V_, LOCAL} -V2EDim_ = Dimension{:V2E_, LOCAL} -E2VDim_ = Dimension{:E2V_, LOCAL} -Cell = Cell_() -K = K_() -Edge = Edge_() -Vertex = Vertex_() -V2VDim = V2VDim_() -V2EDim = V2EDim_() -E2VDim = E2VDim_() - -V2V = FieldOffset("V2V", source = Vertex, target = (Vertex, V2VDim)) -E2V = FieldOffset("E2V", source = Vertex, target = (Edge, E2VDim)) -V2E 
= FieldOffset("V2E", source = Edge, target = (Vertex, V2EDim)) -Koff = FieldOffset("Koff", source = K, target = K) - -include("../src/atlas/atlas_mesh.jl") -include("../src/atlas/state_container.jl") -include("../src/atlas/metric.jl") -include("../src/atlas/advection.jl") - -# Function to set up and run the benchmark -function benchmark_mpdata() - # Set up the environment or load data - grid = atlas.StructuredGrid("O50") - mesh = AtlasMesh(grid, num_level = 30) - - # Define dimensions based on the mesh properties - vertex_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[Vertex]) - k_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[K]) - edge_dim = getproperty(mesh, DIMENSION_TO_SIZE_ATTR[Edge]) - - # Set parameters - δt = 1800.0 # time step in s - eps = 1.0e-8 - niter = 50 # Adjust based on how long you want the benchmark to run - - # Initialize fields and metrics - state = sc_from_mesh(mesh) - state_next = sc_from_mesh(mesh) - tmp_fields = Dict{String, Field}() - for i = 1:6 - tmp_fields[@sprintf("tmp_vertex_%d", i)] = Field((Vertex, K), zeros(vertex_dim, k_dim)) - end - for j = 1:3 - tmp_fields[@sprintf("tmp_edge_%d", j)] = Field((Edge, K), zeros(edge_dim, k_dim)) - end - - # Benchmark the mpdata_program - println("Starting the benchmark for mpdata_program...") - bench_result = @benchmark begin - mpdata_program( - state.rho, - δt, - eps, - mesh.vol, - metric.gac, - state.vel[1], - state.vel[2], - state.vel[3], - mesh.pole_edge_mask, - mesh.dual_face_orientation, - mesh.dual_face_normal_weighted_x, - mesh.dual_face_normal_weighted_y, - tmp_fields["tmp_vertex_1"], - tmp_fields["tmp_vertex_2"], - tmp_fields["tmp_vertex_3"], - tmp_fields["tmp_vertex_4"], - tmp_fields["tmp_vertex_5"], - tmp_fields["tmp_vertex_6"], - tmp_fields["tmp_edge_1"], - tmp_fields["tmp_edge_2"], - tmp_fields["tmp_edge_3"] - ) - end - - # Output benchmark results - println("Benchmark completed.") - display(bench_result) -end - -# Run the benchmark function -benchmark_mpdata() diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index faa4468..a1c8136 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -383,66 +383,3 @@ println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") println("Field Operator Neighbor Sum:") println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") - -# Advection Benchmarks - -include("../advection/advection_miniapp.jl") - -avection_suite = BenchmarkGroup() - -println("Starting Advection Benchmark (julia embedded)") - -avection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmark $mpdata_program( - state.rho, - δt, - ϵ, - mesh.vol, - metric.gac, - state.vel[1], - state.vel[2], - state.vel[3], - mesh.pole_edge_mask, - mesh.dual_face_orientation, - mesh.dual_face_normal_weighted_x, - mesh.dual_face_normal_weighted_y, - out = state_next.rho, - offset_provider = mesh.offset_provider - ) - -println("Finished Advection Benchmark (julia embedded)") - -# TODO: disabled because the backend is not currently supporting it (the backend is too slow) -# println("Starting Advection Benchmark (julia-python)") - -# advection_suite["advection"]["mpdata_program_julia_pyback"] = @benchmark mpdata_program( -# state.rho, -# δt, -# ϵ, -# mesh.vol, -# metric.gac, -# state.vel[1], -# state.vel[2], -# state.vel[3], -# mesh.pole_edge_mask, -# mesh.dual_face_orientation, -# mesh.dual_face_normal_weighted_x, -# mesh.dual_face_normal_weighted_y, -# out = state_next.rho, -# offset_provider = mesh.offset_provider, -# backend = "py" -# ) - -# 
println("Finished Advection Benchmark (julia-python)") - -# Run the benchmark suite -println("Running the advection suite...") -# advection_results = run(avection_suite) - -# mpdata_emb_results = advection_results["advection"]["mpdata_program_julia_embedded"] -# mpdata_pyback_results = results["advection"]["mpdata_program_julia_pyback"] - -# println("mpdata_program julia embedded version:") -# println("\tTime taken: $(ns_to_ms(median(mpdata_emb_results.times))) ms\n") - -# println("mpdata_program julia with python backend:") -# println("\tTime taken: $(ns_to_ms(median(mpdata_pyback_results.times))) ms\n") From 391dc0a269a5254d7e8fc47e61665bf7b44ae082 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:53:21 +0200 Subject: [PATCH 31/53] Fix K dimension in advection meshes --- ...k_advection.jl => benchmarks_advection.jl} | 40 +++++++++---------- src/ExampleMeshes.jl | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) rename benchmark/{benchmark_advection.jl => benchmarks_advection.jl} (76%) diff --git a/benchmark/benchmark_advection.jl b/benchmark/benchmarks_advection.jl similarity index 76% rename from benchmark/benchmark_advection.jl rename to benchmark/benchmarks_advection.jl index 8432aea..f6411b1 100644 --- a/benchmark/benchmark_advection.jl +++ b/benchmark/benchmarks_advection.jl @@ -26,22 +26,22 @@ advection_suite["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_sc # embedded backend ) -# advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( -# state.rho, -# δt, -# mesh.vol, -# metric.gac, -# state.vel[1], -# state.vel[2], -# state.vel[3], -# mesh.pole_edge_mask, -# mesh.dual_face_orientation, -# mesh.dual_face_normal_weighted_x, -# mesh.dual_face_normal_weighted_y, -# out = state_next.rho, -# offset_provider = mesh.offset_provider, -# backend = "py" -# ) +advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider, + backend = "py" + ) advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( state.rho, @@ -84,9 +84,9 @@ println("Running the advection suite...") advection_results = run(advection_suite) upwind_embedded_results = advection_results["advection"]["upwind_julia_embedded"] -# upwind_python_backend_results = results["advection"]["upwind_python_backend"] +upwind_python_backend_results = advection_results["advection"]["upwind_python_backend"] mpdata_embedded_results = advection_results["advection"]["mpdata_program_julia_embedded"] -# mpdata_python_backend_results = results["advection"]["mpdata_program_python_backend"] +# mpdata_python_backend_results = advection_results["advection"]["mpdata_program_python_backend"] # Function to convert nanoseconds to milliseconds for clearer output ns_to_ms(time_ns) = time_ns / 1e6 @@ -94,8 +94,8 @@ ns_to_ms(time_ns) = time_ns / 1e6 println("Upwind scheme julia (embedded):") println("\tTime taken: $(ns_to_ms(median(upwind_embedded_results.times))) ms\n") -# println("Upwind scheme julia (python backend):") -# println("\tTime taken: $(ns_to_ms(median(upwind_python_backend_results.times))) ms\n") +println("Upwind scheme julia (python backend):") +println("\tTime taken: 
$(ns_to_ms(median(upwind_python_backend_results.times))) ms\n") println("Mpdata program julia (embedded):") println("\tTime taken: $(ns_to_ms(median(mpdata_embedded_results.times))) ms\n") diff --git a/src/ExampleMeshes.jl b/src/ExampleMeshes.jl index 96612cf..6d5d237 100644 --- a/src/ExampleMeshes.jl +++ b/src/ExampleMeshes.jl @@ -11,7 +11,7 @@ export Cell, K, Edge, Vertex, V2VDim, V2EDim, E2VDim, E2CDim, C2EDim export V2V, E2V, V2E, E2C, C2E, Koff const global Cell_ = Dimension{:Cell_, HORIZONTAL} -const global K_ = Dimension{:K_, HORIZONTAL} +const global K_ = Dimension{:K_, VERTICAL} const global Edge_ = Dimension{:Edge_, HORIZONTAL} const global Vertex_ = Dimension{:Vertex_, HORIZONTAL} const global V2VDim_ = Dimension{:V2VDim_, LOCAL} From 904866621b486fb1e3c77fa25868f49e139f65f5 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:54:49 +0200 Subject: [PATCH 32/53] Add multi-threads optimization on broadcasting operation --- src/embedded/cust_broadcast.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 66cb372..ff0aa9f 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -1,3 +1,6 @@ + +using Base.Threads: @threads + Base.BroadcastStyle(::Type{<:Field}) = Broadcast.ArrayStyle{Field}() # TODO(tehrengruber): Implement a range with an attached dimension instead of this single object @@ -257,7 +260,7 @@ end # Performance may vary depending on whether `@inbounds` is placed outside the # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @simd for I in eachindex(dest) + @inbounds @threads for I in eachindex(dest) dest[I] = bc′[I] end return dest From e2ce6012037404cf02859e5985c575c71b8cf5f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:13:05 +0200 Subject: [PATCH 33/53] Change benchmark SUITE for compatibility with AirSpeedVelocity --- benchmark/benchmarks.jl | 30 +++++++++++++++--------------- benchmark/benchmarks_advection.jl | 12 ++++++------ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index a1c8136..87c404f 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -272,55 +272,55 @@ end # Benchmarks ------------------------------------------------------------------------------------------------- -# Create the benchmark suite -suite = BenchmarkGroup() +# Create the benchmark SUITE +SUITE = BenchmarkGroup() # Define the main groups -suite["addition"] = BenchmarkGroup() +SUITE["addition"] = BenchmarkGroup() # Julia broadcast addition benchmark a, b, data_size = array_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["array_broadcast_addition"] = @benchmarkable broadcast_addition_array(a, b) setup=((a, b, data_size) = $array_broadcast_addition_setup($STREAM_SIZE); ) #a=$a; b=$b) +SUITE["addition"]["array_broadcast_addition"] = @benchmarkable broadcast_addition_array(a, b) setup=((a, b, data_size) = $array_broadcast_addition_setup($STREAM_SIZE); ) #a=$a; b=$b) # Field broadcast addition benchmark a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["fields_broadcast_addition"] = @benchmarkable broadcast_addition_fields($a, $b) +SUITE["addition"]["fields_broadcast_addition"] = @benchmarkable broadcast_addition_fields($a, $b) # Field Operator broadcast addition benchmark a, b, out = 
fields_broadcast_addition_setup(STREAM_SIZE) -suite["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) +SUITE["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) # Sine without field operator benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["sin"] = @benchmarkable sin_without_fo($a) +SUITE["trigonometry"]["sin"] = @benchmarkable sin_without_fo($a) # Field operator sine benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embedded", out=$out) +SUITE["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embedded", out=$out) # Cosine without field operator benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["cos"] = @benchmarkable cos_without_fo($a) +SUITE["trigonometry"]["cos"] = @benchmarkable cos_without_fo($a) # Field operator cosine benchmark a, out = single_field_setup(STREAM_SIZE) -suite["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) +SUITE["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) # Benchmark the field remapping operation offset_provider = create_large_connectivity(STREAM_SIZE) a, out = single_field_setup(STREAM_SIZE) -suite["remapping"]["field_operator"] = +SUITE["remapping"]["field_operator"] = @benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out) # Benchmark the field neighbor sum operation offset_provider = create_large_connectivity(STREAM_SIZE) a, out = single_field_setup(STREAM_SIZE) -suite["neighbor_sum"]["field_operator"] = +SUITE["neighbor_sum"]["field_operator"] = @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out) -# Run the benchmark suite -println("Running the benchmark suite...") -results = run(suite) +# Run the benchmark SUITE +println("Running the benchmark SUITE...") +results = run(SUITE) # Process the results array_results = results["addition"]["array_broadcast_addition"] diff --git a/benchmark/benchmarks_advection.jl b/benchmark/benchmarks_advection.jl index f6411b1..c40d809 100644 --- a/benchmark/benchmarks_advection.jl +++ b/benchmark/benchmarks_advection.jl @@ -8,8 +8,8 @@ include("../advection/advection_miniapp.jl") # Advection Benchmarks -advection_suite = BenchmarkGroup() -advection_suite["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_scheme( +SUITE = BenchmarkGroup() +SUITE["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_scheme( state.rho, δt, mesh.vol, @@ -26,7 +26,7 @@ advection_suite["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_sc # embedded backend ) -advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( +SUITE["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( state.rho, δt, mesh.vol, @@ -43,7 +43,7 @@ advection_suite["advection"]["upwind_python_backend"] = @benchmarkable upwind_sc backend = "py" ) -advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( +SUITE["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( state.rho, δt, ϵ, @@ -61,7 +61,7 @@ advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable m ) # TODO: disabled because the backend is not currently supporting it (the backend is too slow) -# 
advection_suite["advection"]["mpdata_program_python_backend"] = @benchmarkable mpdata_program( +# SUITE["advection"]["mpdata_program_python_backend"] = @benchmarkable mpdata_program( # state.rho, # δt, # ϵ, @@ -81,7 +81,7 @@ advection_suite["advection"]["mpdata_program_julia_embedded"] = @benchmarkable m # Run the benchmark suite println("Running the advection suite...") -advection_results = run(advection_suite) +advection_results = run(SUITE) upwind_embedded_results = advection_results["advection"]["upwind_julia_embedded"] upwind_python_backend_results = advection_results["advection"]["upwind_python_backend"] From 45cf97a27f7680032a7db4955c0724f856c9346d Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:13:28 +0200 Subject: [PATCH 34/53] Add multi-threads optimization --- src/embedded/cust_broadcast.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 66cb372..ff0aa9f 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -1,3 +1,6 @@ + +using Base.Threads: @threads + Base.BroadcastStyle(::Type{<:Field}) = Broadcast.ArrayStyle{Field}() # TODO(tehrengruber): Implement a range with an attached dimension instead of this single object @@ -257,7 +260,7 @@ end # Performance may vary depending on whether `@inbounds` is placed outside the # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @simd for I in eachindex(dest) + @inbounds @threads for I in eachindex(dest) dest[I] = bc′[I] end return dest From 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:24:17 +0200 Subject: [PATCH 35/53] Restoring SIMD loop in broadcast --- src/embedded/cust_broadcast.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index ff0aa9f..e76c3fa 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -260,7 +260,7 @@ end # Performance may vary depending on whether `@inbounds` is placed outside the # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @threads for I in eachindex(dest) + @inbounds @simd for I in eachindex(dest) dest[I] = bc′[I] end return dest From be385b7b32164868b81dda520dc62b17bcc9c341 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:04:33 +0200 Subject: [PATCH 36/53] Add benchmark readme on how to run benchmarks on separate revisions --- benchmark/README.md | 110 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 9ae7e7a..e898b2a 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -51,6 +51,112 @@ benchpkg --filter=time_to_load The `benchpkg` was updated in June 2024 to automate the benchmark without specifying the parameters. To specify additional condition in `benchpkg` and to work with `benchpkgplot` consult the help command (`--h`). -## Creating New Benchmarks +Here’s an improved and completed version of your README section, with the necessary definitions, examples, and explanations: -TODO: Instructions for adding new benchmarks to the suite. 
+--- + +## Comparing Two or More Different Revisions (States) + +To compare two or more different states of your codebase, you can use revisions. In this context, a **revision** refers to a specific state of the repository, which can be identified by a commit hash or a tag. + +### (Reminder) What is a Revision? + +A **revision** in Git is an identifier that refers to a specific state of the repository at a particular point in time. Revisions can be specified using: +- **Commit Hashes**: A unique SHA-1 identifier for each commit, e.g., `8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd`. +- **Tags**: Human-readable names assigned to specific commits, often used to mark release points (e.g., `v1.0.0`). + +### How to Add a Tag + +You can create a tag in Git by using the following command: + +```bash +git tag -a -m "Tag message" +``` + +For example, to tag the current commit with `v1.0.0`, you would run: + +```bash +git tag -a v1.0.0 -m "Release version 1.0.0" +``` + +To push the tag to the remote repository, use: + +```bash +git push origin +``` + +For example: + +```bash +git push origin v1.0.0 +``` + +To see information about all tags, such as the commit they point to and the tag messages, use: + +```bash +git show-ref --tags && git tag -n | while IFS= read -r line; do echo "$line"; done +``` + +### Example: Using Commit Hashes to Compare Revisions + +Here is an example of how to use commit hashes to compare different revisions: + +```bash +benchpkg --rev=8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd,6fb48706f988613860c6c98beef32c32e900737b \ + --bench-on=8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd --exeflags="--threads=8" +``` + +In this example, `benchpkg` compares the two specified revisions, with the first hash being the baseline for comparison. + +### Example: Using Tags to Compare Revisions + +Here’s how you can use tags instead of commit hashes: + +1. **Create Tags**: + Suppose you want to tag the two commits: + + ```bash + git tag -a v1.0.0 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd -m "Tagging v1.0.0" + git tag -a v1.1.0 6fb48706f988613860c6c98beef32c32e900737b -m "Tagging v1.1.0" + ``` + +2. **Use Tags in `benchpkg`**: + Once the tags are set, you can use them in the comparison: + + ```bash + benchpkg --rev=v1.0.0,v1.1.0 --bench-on=v1.0.0 --exeflags="--threads=8" + ``` + +### How to Remove a Tag + +If you need to remove a tag from your repository, you can do so with the following commands: + +1. **Delete the tag locally**: + + ```bash + git tag -d + ``` + + For example: + + ```bash + git tag -d v1.0.0 + ``` + +2. **Delete the tag from the remote repository**: + + ```bash + git push origin --delete + ``` + + For example: + + ```bash + git push origin --delete v1.0.0 + ``` + +## Developer Notes + +1. The `benchpkg` tool compares different revisions, allowing you to specify the commits or tags you wish to compare. It is crucial to ensure that both commits include all necessary dependencies; otherwise, the dependencies might not be resolved. + +2. **AirSpeedVelocity**: Note that AirSpeedVelocity requires the benchmarking suite to be named `SUITE`. Any other names will not be recognized, which could lead to errors in your benchmarking process. 
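
For reference, a minimal `benchmark/benchmarks.jl` skeleton that `benchpkg`/AirspeedVelocity can pick up looks like the sketch below. This is only an illustration of the `SUITE` naming requirement: the field size and the `addition`/`fields_broadcast_addition` labels mirror the real suite in this repository, but the body is a reduced sketch, not the actual benchmark file, and it assumes GridTools and its example meshes are available on the load path.

```julia
using BenchmarkTools
using GridTools
using GridTools.ExampleMeshes.Unstructured

# AirspeedVelocity only discovers a top-level BenchmarkGroup bound to the name SUITE.
SUITE = BenchmarkGroup()
SUITE["addition"] = BenchmarkGroup()

# Illustrative data size; the real suite uses STREAM_SIZE.
const N = 1_000_000
a = Field(Cell, rand(Float64, N))
b = Field(Cell, rand(Float64, N))

# Interpolate the fields with $ so setup cost is not measured as part of the benchmark.
SUITE["addition"]["fields_broadcast_addition"] = @benchmarkable $a .+ $b
```

`benchpkg` then loads this file for each revision passed via `--rev` and runs the `SUITE` it finds; as noted above, any other binding name will not be recognized.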
From b9ebf8e32f4ba159be68d0485156ca9fc4d93187 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:21:23 +0200 Subject: [PATCH 37/53] Create an AtlasMeshes module and resolve issues with atlas4py import --- advection/advection_miniapp.jl | 8 ++++---- benchmark/benchmarks_advection.jl | 2 -- src/GridTools.jl | 1 + src/atlas/{atlas_mesh.jl => AtlasMeshes.jl} | 14 +++++++++++++- 4 files changed, 18 insertions(+), 7 deletions(-) rename src/atlas/{atlas_mesh.jl => AtlasMeshes.jl} (97%) diff --git a/advection/advection_miniapp.jl b/advection/advection_miniapp.jl index cdc72e0..dae9794 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_miniapp.jl @@ -7,17 +7,17 @@ using Statistics using Profile using GridTools using GridTools.ExampleMeshes.Unstructured - -const global VISUALIZATION_FLAG::Bool=false -const global VERBOSE_FLAG::Bool=false +using GridTools.AtlasMeshes # Include additional necessary files for mesh, state container, metric calculations, and advection operations -include("../src/atlas/atlas_mesh.jl") include("state_container.jl") include("metric.jl") include("advection.jl") include("visualization_utils.jl") +const global VISUALIZATION_FLAG::Bool=false +const global VERBOSE_FLAG::Bool=true + # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation grid = atlas.StructuredGrid("O10") diff --git a/benchmark/benchmarks_advection.jl b/benchmark/benchmarks_advection.jl index c40d809..638d744 100644 --- a/benchmark/benchmarks_advection.jl +++ b/benchmark/benchmarks_advection.jl @@ -1,8 +1,6 @@ using BenchmarkTools using Statistics using GridTools -using GridTools.ExampleMeshes.Unstructured -using GridTools.ExampleMeshes.Cartesian include("../advection/advection_miniapp.jl") diff --git a/src/GridTools.jl b/src/GridTools.jl index 083cbd7..70873e3 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -758,5 +758,6 @@ end generate_unique_name(name::Symbol, value::Integer = 0) = Symbol("$(name)ᐞ$(value)") include("ExampleMeshes.jl") +include("atlas/AtlasMeshes.jl") end diff --git a/src/atlas/atlas_mesh.jl b/src/atlas/AtlasMeshes.jl similarity index 97% rename from src/atlas/atlas_mesh.jl rename to src/atlas/AtlasMeshes.jl index d8f947b..dbce49d 100644 --- a/src/atlas/atlas_mesh.jl +++ b/src/atlas/AtlasMeshes.jl @@ -1,9 +1,19 @@ # ENV["PYCALL_JL_RUNTIME_PYTHON"] = Sys.which("python3.10") # ENV["PYTHONBREAKPOINT"] = "pdb.set_trace" +module AtlasMeshes + +using GridTools +using GridTools.ExampleMeshes.Unstructured using PyCall -atlas = pyimport("atlas4py") +export AtlasMesh, atlas, update_periodic_layers, DIMENSION_TO_SIZE_ATTR + +const atlas = PyNULL() + +function __init__() + copy!(atlas, pyimport("atlas4py")) +end const rpi = 2.0 * asin(1.0) const _deg2rad = 2.0 * rpi / 360.0 @@ -361,3 +371,5 @@ function update_periodic_layers(mesh::AtlasMesh, field::Field) ) field[periodic_indices, :] .= field[remote_indices[periodic_indices], :] end + +end # AtlasMeshes module \ No newline at end of file From 085877d39e0e942ef6ff899a23c87cac896e4a5a Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:22:04 +0200 Subject: [PATCH 38/53] Fix embedded test with the new K dimension definition in example meshes --- test/embedded_test.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/embedded_test.jl b/test/embedded_test.jl 
index 770cfed..b61db94 100644 --- a/test/embedded_test.jl +++ b/test/embedded_test.jl @@ -135,8 +135,8 @@ end # Broadcast ------------------------- - @test typeof(broadcast(cell_values, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, HORIZONTAL}}, Float64, 1, Tuple{Dimension{:Cell_, HORIZONTAL}}, Vector{Float64}} - @test typeof(broadcast(5.0, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, HORIZONTAL}}, Float64, 0, Tuple{}, Array{Float64, 0}} + @test typeof(broadcast(cell_values, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, VERTICAL}}, Float64, 1, Tuple{Dimension{:Cell_, HORIZONTAL}}, Vector{Float64}} + @test typeof(broadcast(5.0, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, VERTICAL}}, Float64, 0, Tuple{}, Array{Float64, 0}} # Where ----------------------------------------- From 89572e10205090bd9c1f2c17ddbb343090983199 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:38:28 +0200 Subject: [PATCH 39/53] Separate the simulation loop from the Advection Setup of the miniapp --- advection/README.md | 20 +++--- ...dvection_miniapp.jl => advection_setup.jl} | 64 +------------------ advection/run_simulation_loop.jl | 62 ++++++++++++++++++ benchmark/benchmarks_advection.jl | 2 +- notes/Benchmarks.jl | 4 +- 5 files changed, 77 insertions(+), 75 deletions(-) rename advection/{advection_miniapp.jl => advection_setup.jl} (74%) create mode 100644 advection/run_simulation_loop.jl diff --git a/advection/README.md b/advection/README.md index b838658..cfae700 100644 --- a/advection/README.md +++ b/advection/README.md @@ -1,6 +1,6 @@ -### README for Running `advection_miniapp.jl` +### README for Running `advection_setup.jl` using `run_simulation_loop.jl` -This README provides instructions on how to run the `advection_miniapp.jl` script for simulating advection using the Atlas library. The script allows for terminal visualization, which can be enabled as described below. +This README provides instructions on how to run the `run_simulation_loop.jl` script for simulating advection using the Atlas library. The script allows for terminal visualization, which can be enabled as described below. #### Prerequisites @@ -15,23 +15,23 @@ This README provides instructions on how to run the `advection_miniapp.jl` scrip ``` 2. **Enabling Visualization** (optional): - - The script has a `VISUALIZATION_FLAG` that can be set to enable or disable visualization on the terminal. Ensure that this flag is set to `true` in the `advection_miniapp.jl` script if you wish to enable visualization. - - Note: Other parameters such as the number of iterations can be changed in the `# Simulation Parameters` section of the script. + - The script has a `VISUALIZATION_FLAG` that can be set to enable or disable visualization on the terminal. Ensure that this flag is set to `true` in the `run_simulation_loop.jl` script if you wish to enable visualization. + - Note: Other parameters such as the number of iterations can be changed in the `# Simulation Parameters` section of the `advection_setup.jl` script. #### Running the Simulation 1. 
**Running the Script**: - - Use the following command to run the `advection_miniapp.jl` script with Julia: + - Use the following command to run the `run_simulation_loop.jl` script with Julia: ```sh - julia --color=yes --project=$GRIDTOOLS_JL_PATH/GridTools.jl $GRIDTOOLS_JL_PATH/GridTools.jl/src/examples/advection/advection_miniapp.jl + julia --color=yes --project=$GRIDTOOLS_JL_PATH/GridTools.jl $GRIDTOOLS_JL_PATH/GridTools.jl/src/examples/advection/run_simulation_loop.jl ``` #### Example -Here is an example of how to set the `VISUALIZATION_FLAG` in the `advection_miniapp.jl` script and run the simulation: +Here is an example of how to set the `VISUALIZATION_FLAG` in the `run_simulation_loop.jl` script and run the simulation: 1. **Setting the Visualization Flag**: - - Open the `advection_miniapp.jl` script. + - Open the `run_simulation_loop.jl` script. - Set the `VISUALIZATION_FLAG` to `true`: ```julia const VISUALIZATION_FLAG = true @@ -42,7 +42,7 @@ Here is an example of how to set the `VISUALIZATION_FLAG` in the `advection_mini - Run the script with the following command: ```sh export GRIDTOOLS_JL_PATH=... - julia --color=yes --project=. $GRIDTOOLS_JL_PATH/src/examples/advection/advection_miniapp.jl + julia --color=yes --project=. $GRIDTOOLS_JL_PATH/src/examples/advection/run_simulation_loop.jl ``` -By following these steps, you should be able to run the `advection_miniapp.jl` script and visualize the advection simulation results on your terminal. +By following these steps, you should be able to run the `run_simulation_loop.jl` script and visualize the advection simulation results on your terminal. diff --git a/advection/advection_miniapp.jl b/advection/advection_setup.jl similarity index 74% rename from advection/advection_miniapp.jl rename to advection/advection_setup.jl index dae9794..89ffd38 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_setup.jl @@ -1,10 +1,8 @@ -# Advection Miniapp -# This script demonstrates an advection simulation using the Atlas library. +# Advection Setup +# This script demonstrates the setup of an advection simulation using the Atlas library. 
using Printf -using Debugger using Statistics -using Profile using GridTools using GridTools.ExampleMeshes.Unstructured using GridTools.AtlasMeshes @@ -13,10 +11,6 @@ using GridTools.AtlasMeshes include("state_container.jl") include("metric.jl") include("advection.jl") -include("visualization_utils.jl") - -const global VISUALIZATION_FLAG::Bool=false -const global VERBOSE_FLAG::Bool=true # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation @@ -165,57 +159,3 @@ nabla_z( out = tmp_fields["tmp_vertex_2"], offset_provider = mesh.offset_provider ) - -if VISUALIZATION_FLAG - # Precompute the mapping between the unstructured domain to the structured one for ASCII art visualization - grid_size = 50 - mapping = precompute_mapping(mesh, xlim, ylim, grid_size) -end - -# Main Simulation Loop ---------------------------------------------------------------------------------------- -for i = 1:niter - # Perform the upwind advection scheme to update the scalar field (rho) - upwind_scheme( - state.rho, - δt, - mesh.vol, - metric.gac, - state.vel[1], - state.vel[2], - state.vel[3], - mesh.pole_edge_mask, - mesh.dual_face_orientation, - mesh.dual_face_normal_weighted_x, - mesh.dual_face_normal_weighted_y, - out = state_next.rho, - offset_provider = mesh.offset_provider - ) - - # Print the current timestep - if VERBOSE_FLAG - println("Timestep $i") - end - - if VISUALIZATION_FLAG - # Print the current state as ASCII art every 5 timesteps - print_state_ascii(state, mesh, mapping, i, grid_size) - end - - # TODO: make a function out of this switch - # Swap the current and next state - temp = state - global state = state_next - global state_next = temp - - # Update the periodic boundary layers - update_periodic_layers(mesh, state.rho) -end - -if VERBOSE_FLAG - # Output the final statistics for the scalar field (rho) and velocity fields - println( - "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" - ) - println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") - println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") -end diff --git a/advection/run_simulation_loop.jl b/advection/run_simulation_loop.jl new file mode 100644 index 0000000..2c034a6 --- /dev/null +++ b/advection/run_simulation_loop.jl @@ -0,0 +1,62 @@ +# Run Advection Miniapp Simulation +# This script demonstrates an advection simulation using the Atlas library. 
+ +include("visualization_utils.jl") +include("advection_setup.jl") + +const global VISUALIZATION_FLAG::Bool=false +const global VERBOSE_FLAG::Bool=true + +if VISUALIZATION_FLAG + # Precompute the mapping between the unstructured domain to the structured one for ASCII art visualization + grid_size = 50 + mapping = precompute_mapping(mesh, xlim, ylim, grid_size) +end + +# Main Simulation Loop ---------------------------------------------------------------------------------------- +for i = 1:niter + # Perform the upwind advection scheme to update the scalar field (rho) + upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + + # Print the current timestep + if VERBOSE_FLAG + println("Timestep $i") + end + + if VISUALIZATION_FLAG + # Print the current state as ASCII art every 5 timesteps + print_state_ascii(state, mesh, mapping, i, grid_size) + end + + # TODO: make a function out of this switch + # Swap the current and next state + temp = state + global state = state_next + global state_next = temp + + # Update the periodic boundary layers + update_periodic_layers(mesh, state.rho) +end + +if VERBOSE_FLAG + # Output the final statistics for the scalar field (rho) and velocity fields + println( + "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" + ) + println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") + println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") +end diff --git a/benchmark/benchmarks_advection.jl b/benchmark/benchmarks_advection.jl index 638d744..d0e5da3 100644 --- a/benchmark/benchmarks_advection.jl +++ b/benchmark/benchmarks_advection.jl @@ -2,7 +2,7 @@ using BenchmarkTools using Statistics using GridTools -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") # Advection Benchmarks diff --git a/notes/Benchmarks.jl b/notes/Benchmarks.jl index 5d390ec..b271d89 100644 --- a/notes/Benchmarks.jl +++ b/notes/Benchmarks.jl @@ -59,7 +59,7 @@ using Profile # Benchmark for Julia and Python implementations of advection ############################################################################################################## -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") println("Starting julia embedded benchmark") @@ -81,7 +81,7 @@ bench_julia_embedded = @benchmark upwind_scheme( println("Finished Julia embedded benchmark") -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") println("Starting julia python benchmark") From 4d71e0b73103f4ad73cc8b4433ab6fddb90a0256 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:47:15 +0200 Subject: [PATCH 40/53] Small changes in benchmark documentation --- benchmark/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index e898b2a..35fcc3b 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -76,7 +76,7 @@ git tag -a -m "Tag message" For example, to tag the current commit with `v1.0.0`, you would run: ```bash -git tag -a v1.0.0 -m "Release version 1.0.0" +git tag -a v1.0.0 -m "Improvement using @threads instead of @simd in broadcasting" 
``` To push the tag to the remote repository, use: @@ -116,8 +116,8 @@ Here’s how you can use tags instead of commit hashes: Suppose you want to tag the two commits: ```bash - git tag -a v1.0.0 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd -m "Tagging v1.0.0" - git tag -a v1.1.0 6fb48706f988613860c6c98beef32c32e900737b -m "Tagging v1.1.0" + git tag -a v1.0.0 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd -m "Tag message for v1.0.0" + git tag -a v1.1.0 6fb48706f988613860c6c98beef32c32e900737b -m "Tag message for v1.1.0" ``` 2. **Use Tags in `benchpkg`**: From 26fe90031d4d675c2b8e21c57da385b7a68145a6 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:46:41 +0200 Subject: [PATCH 41/53] Fix the names retrieval of the modules automatically generated by AirSpeedVelocity when running the advection benchmark --- src/GridTools.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/GridTools.jl b/src/GridTools.jl index 70873e3..8b7edb0 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -706,7 +706,7 @@ macro module_vars() name => Core.eval(Base, name) for name in [:Int64, :Int32, :Float32, :Float64] ) - all_names = names(@__MODULE__) + all_names = names(@__MODULE__, all=true) used_modules = ccall(:jl_module_usings, Any, (Any,), @__MODULE__) for m in used_modules append!(all_names, names(m)) From 7cac41c364e1f0de5d35e19bcc5260e68e591869 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:58:44 +0200 Subject: [PATCH 42/53] Ignore plot files by AirSpeedVelocity --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index ca7a8dc..dee06b9 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,5 @@ env_setup.sh # Ignore benchmark (benchpkg) results results_GridTools@* +plot_*.png +plot_*.pdf From 6cb5585827ad97ba967e7f66570f979d964e343d Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:30:45 +0200 Subject: [PATCH 43/53] Add Polyester to the dependencies --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index ff7f05d..2ab63f0 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" From 96f416f9547978bb8a4f642a604c0c9e42a2f81f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 14:40:51 +0200 Subject: [PATCH 44/53] Increase the size of the Atlas Mesh for benchmarking purposes --- advection/advection_setup.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/advection/advection_setup.jl b/advection/advection_setup.jl index 89ffd38..3153416 100644 --- a/advection/advection_setup.jl +++ b/advection/advection_setup.jl @@ -14,7 +14,7 @@ include("advection.jl") # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation -grid = atlas.StructuredGrid("O10") +grid = atlas.StructuredGrid("O90") mesh = AtlasMesh(grid, 
num_level = 30) # Simulation Parameters --------------------------------------------------------------------------------------- From 426f9369bc58be2a79eded80cf93a205d41b581f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:07:05 +0200 Subject: [PATCH 45/53] Add script to automate the benchmark comparison between the last two commits --- benchmark/autorun_benchmarks.sh | 79 +++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100755 benchmark/autorun_benchmarks.sh diff --git a/benchmark/autorun_benchmarks.sh b/benchmark/autorun_benchmarks.sh new file mode 100755 index 0000000..73ac906 --- /dev/null +++ b/benchmark/autorun_benchmarks.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# This script automates the process of benchmarking recent changes by tagging +# the last two commits and running benchmarks using the AirspeedVelocity package. +# It supports conditional execution based on user input to include specific benchmarks +# for advection and allows dynamic configuration of execution threads. +# +# Usage: +# ./autorun_benchmarks.sh [--advection] [--threads=NUM] +# --advection: Optional. If specified, runs advection-specific benchmarks. +# --threads=NUM: Optional. Specifies the number of threads to use. Default is 8. + +# Default number of threads +threads=8 + +# Function to display usage +usage() { + echo "Usage: $0 [--advection] [--threads=NUM]" + echo " --advection: Run the advection comparison with specific benchmark script." + echo " --threads=NUM: Specify the number of threads (default is 8)." + exit 1 +} + +# Parse command-line arguments +for arg in "$@" +do + case $arg in + --advection) + advection=true + shift # Remove --advection from processing + ;; + --threads=*) + threads="${arg#*=}" + shift # Remove --threads=NUM from processing + ;; + *) + # Unknown option + usage + ;; + esac +done + +# Check if the tags already exist and delete them if they do +if git rev-parse -q --verify "refs/tags/after_debug" >/dev/null; then + git tag -d after_debug +fi + +if git rev-parse -q --verify "refs/tags/before_debug" >/dev/null; then + git tag -d before_debug +fi + +# Tag the last commit as 'after_debug' +git tag after_debug HEAD +echo "Tagged the latest commit as 'after_debug'" + +# Tag the second last commit as 'before_debug' +git tag before_debug HEAD~1 +echo -e "Tagged the previous commit as 'before_debug'\n" + +# Print the before and after tags with their messages +git tag -n | grep -E 'before_debug|after_debug' | while IFS= read -r line; do echo "$line"; done ; echo "" + +# Conditional command based on the --advection flag +if [ "$advection" == true ]; then + # Set the benchmark script for advection + benchmark_script="benchmark/benchmarks_advection.jl" + command="benchpkg --rev=before_debug,after_debug \ + -s $benchmark_script \ + --bench-on=before_debug \ + --exeflags=\"--threads=$threads\"" +else + command="benchpkg --rev=before_debug,after_debug \ + --bench-on=before_debug \ + --exeflags=\"--threads=$threads\"" +fi + +# Print and execute the command +echo "Executing command: $command" +eval $command From c8a08bb9f137a617f8735afcb03efc24e4872f1f Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:35:50 +0200 Subject: [PATCH 46/53] Add utilis for benchmark/profiling in the interactive REPL --- .../utils/setup_benchmark_interactive.jl | 299 ++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 
benchmark/utils/setup_benchmark_interactive.jl diff --git a/benchmark/utils/setup_benchmark_interactive.jl b/benchmark/utils/setup_benchmark_interactive.jl new file mode 100644 index 0000000..7e2aad1 --- /dev/null +++ b/benchmark/utils/setup_benchmark_interactive.jl @@ -0,0 +1,299 @@ +# setup_benchmark_interactive.jl + +# This script is intended for interactive usage during development and benchmarking sessions. +# It sets up a Julia environment with necessary packages and predefined functions for running various benchmarks. +# This allows developers to interactively profile and debug performance issues in real-time. +# +# Usage Example: +# Start Julia with the appropriate project settings and thread configuration: +# $ julia --project=. --threads 8 +# +# Inside the Julia REPL, load the benchmark setup: +# julia> include("setup_benchmark_interactive.jl") +# This will load all necessary modules and display the current thread usage. +# +# To run and profile a specific operation, use: +# julia> a, out = single_field_setup(STREAM_SIZE) +# julia> @profile fo_sin(a, backend="embedded", out=out) +# This will profile the `fo_sin` operation and print profiling results. + +include("../../advection/advection_setup.jl") + +using BenchmarkTools +using Statistics +using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian +using Profile +using Base.Threads + +# Data size +const global STREAM_SIZE = 10_000_000 + +# Utils ------------------------------------------------------------------------------------------------------ + +# Useful for the benchmark of the field remapping operation +function create_large_connectivity(size::Int) + edge_to_cell_table = vcat([rand(1:size, (1, 2)) for _ in 1:size]...) + cell_to_edge_table = vcat([rand(1:size, (1, 3)) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. + +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. 
+- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. +""" +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth, data_size +end + +# Operations ------------------------------------------------------------------------------------------------- + +""" + single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + +Setup function to create a field and a similar output field for benchmarking operations that require a single input field. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the field to be generated. + +# Returns +- `a`: A randomly generated field of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. +""" +function single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, out +end + +""" + array_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function array_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{Array{Float64,1}, Array{Float64,1}, Int64} + a = rand(Float64, ARRAY_SIZE) + b = rand(Float64, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size +end + +""" + broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) + +Core operation for the Julia broadcast addition benchmark. + +# Arguments +- `a, b`: Two arrays to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64})::Array{Float64,1} + return a .+ b +end + +""" + broadcast_addition(a::Field, b::Field) + +Core operation for the broadcast addition of two Field benchmark. +Useful to asses and track possible overhead on fields. + +# Arguments +- `a, b`: Two field to be added. + +# Returns +- The result of element-wise addition of the data of the fields `a` and `b`. +""" +function broadcast_addition_fields(a::Field, b::Field)::Field + return a .+ b +end + +""" + fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the field operator broadcast addition benchmark. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out +end + +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. 
+ +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the sine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +function sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the cosine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +function cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the sine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +@field_operator function fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the cosine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +@field_operator function fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that performs remapping from cell-based data to edge-based data. + +This operator utilizes a connectivity table (`E2C`) to map the values from cells to edges, implying a transformation from the cell-centered field to an edge-centered field based on predefined relationships in the connectivity table. + +# Arguments +- `a`: Input field containing Float64 data structured around cells. + +# Returns +- A new field where each element represents data remapped from cells to edges, structured as specified by the edge-to-cell connectivity. +""" +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) +end + +""" + fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that computes the sum of neighboring cell values for each edge. This function leverages the connectivity table (`E2C`), which defines the relationship between edges and cells, to sum the values of cells that are connected to each edge. + +The summation is performed across the dimension specified by `E2CDim`, ensuring that each edge aggregates values from its associated cells correctly. + +# Arguments +- `a`: Input field containing Float64 data, where each cell contains a numerical value. 
+ +# Returns +- A new field where each edge holds the summed value of its neighboring cells, based on the edge-to-cell connectivity defined in `E2C`. +""" +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + +# Start ------------------------------------------------------------------------------------------------------ +println("Current number of threads: ", Threads.nthreads()) +println("The environment is ready\n") +Profile.clear() From d9642216258bcfe323afc70004bbc9c65eff393b Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:36:05 +0200 Subject: [PATCH 47/53] Move autorun in the utils folder --- benchmark/{ => utils}/autorun_benchmarks.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename benchmark/{ => utils}/autorun_benchmarks.sh (100%) diff --git a/benchmark/autorun_benchmarks.sh b/benchmark/utils/autorun_benchmarks.sh similarity index 100% rename from benchmark/autorun_benchmarks.sh rename to benchmark/utils/autorun_benchmarks.sh From 182dd6d123ffb950ddb47e529b7d47309c74c1ef Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 22 Aug 2024 15:59:40 +0200 Subject: [PATCH 48/53] Update autorun script --- benchmark/utils/autorun_benchmarks.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/utils/autorun_benchmarks.sh b/benchmark/utils/autorun_benchmarks.sh index 73ac906..2f021a4 100755 --- a/benchmark/utils/autorun_benchmarks.sh +++ b/benchmark/utils/autorun_benchmarks.sh @@ -66,11 +66,10 @@ if [ "$advection" == true ]; then benchmark_script="benchmark/benchmarks_advection.jl" command="benchpkg --rev=before_debug,after_debug \ -s $benchmark_script \ - --bench-on=before_debug \ --exeflags=\"--threads=$threads\"" else command="benchpkg --rev=before_debug,after_debug \ - --bench-on=before_debug \ + --bench-on=after_debug \ --exeflags=\"--threads=$threads\"" fi From b1f539e1afdcb1533afe15d86d433d95b1fc53ea Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Fri, 23 Aug 2024 10:46:32 +0200 Subject: [PATCH 49/53] Fix the autorun script to use hashes instead of tags --- benchmark/utils/autorun_benchmarks.sh | 28 ++++++++++----------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/benchmark/utils/autorun_benchmarks.sh b/benchmark/utils/autorun_benchmarks.sh index 2f021a4..58a0f0c 100755 --- a/benchmark/utils/autorun_benchmarks.sh +++ b/benchmark/utils/autorun_benchmarks.sh @@ -40,22 +40,13 @@ do esac done -# Check if the tags already exist and delete them if they do -if git rev-parse -q --verify "refs/tags/after_debug" >/dev/null; then - git tag -d after_debug -fi - -if git rev-parse -q --verify "refs/tags/before_debug" >/dev/null; then - git tag -d before_debug -fi - -# Tag the last commit as 'after_debug' -git tag after_debug HEAD -echo "Tagged the latest commit as 'after_debug'" +# Retrieve last two commit hashes +before_debug=$(git rev-parse HEAD~1) +after_debug=$(git rev-parse HEAD) -# Tag the second last commit as 'before_debug' -git tag before_debug HEAD~1 -echo -e "Tagged the previous commit as 'before_debug'\n" +# Tag the last two commits if they are not already tagged +git tag -f after_debug $after_debug +git tag -f before_debug $before_debug # Print the before and after tags with their messages git tag -n | grep -E 'before_debug|after_debug' | while IFS= 
read -r line; do echo "$line"; done ; echo "" @@ -64,12 +55,13 @@ git tag -n | grep -E 'before_debug|after_debug' | while IFS= read -r line; do ec if [ "$advection" == true ]; then # Set the benchmark script for advection benchmark_script="benchmark/benchmarks_advection.jl" - command="benchpkg --rev=before_debug,after_debug \ + command="benchpkg --rev=$before_debug,$after_debug \ -s $benchmark_script \ + --bench-on=$after_debug \ --exeflags=\"--threads=$threads\"" else - command="benchpkg --rev=before_debug,after_debug \ - --bench-on=after_debug \ + command="benchpkg --rev=$before_debug,$after_debug \ + --bench-on=$after_debug \ --exeflags=\"--threads=$threads\"" fi From 5b0f1dc6b12ae0de3457575a7f0303bbf03fb45c Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:02:23 +0200 Subject: [PATCH 50/53] Add gpu backend support for basic broadcast operation --- src/GridTools.jl | 41 +++++++++++++++++++++- src/embedded/builtins.jl | 24 ++++++++++--- src/embedded/cust_broadcast.jl | 63 ++++++++++++++++++++++++++++------ src/examples/example_gpu.jl | 39 +++++++++++++++++++++ test/gpu_test.jl | 44 ++++++++++++++++++++++++ 5 files changed, 194 insertions(+), 17 deletions(-) create mode 100644 src/examples/example_gpu.jl create mode 100644 test/gpu_test.jl diff --git a/src/GridTools.jl b/src/GridTools.jl index 580be0a..8f617f9 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -9,7 +9,7 @@ using Profile using Base: @propagate_inbounds using MacroTools using OffsetArrays: IdOffsetRange -using Debugger +using CUDA import Base.Broadcast: Extruded, Style, BroadcastStyle, ArrayStyle, Broadcasted @@ -157,6 +157,30 @@ julia> field = Field(Cell, ones(5)) julia> field(E2C) julia> field(E2C[1]) ``` + +GPU arrays are supported too. + +# Examples +```julia-repl +julia> using GridTools + +julia> using CUDA: CuArray + +julia> using GridTools.ExampleMeshes.Unstructured + + # Create a CuArray of data on the GPU + +julia> gpu_data = CuArray(reshape(collect(1.0:12.0), (3, 4))); + + # Create a Field passing data in the CuArray type + +julia> gpu_field = Field((Cell,K), gpu_data); + + # Check the type + +julia> Base.typeof(gpu_field.data) +CuArray{Float64, 2, CUDA.DeviceMemory} +``` """ struct Field{ B_Dim <: Tuple{Vararg{Dimension}}, @@ -609,6 +633,20 @@ function backend_execution( end end +# It is not currently working in all edge cases +function check_gpu_data(args::Tuple)::nothing + has_CuArray::Bool = false + for (i, arg) in enumerate(args) + if arg !== nothing && typeof(arg) <: AbstractArray && typeof(arg.data) <: CuArray + has_CuArray = true + end + + if has_CuArray + throw(ArgumentError("GPU Arrays (CuArray) are not supported by the Python backend. 
Error found in argument #$i: $(typeof(arg.data)).")) + end + end +end + function backend_execution( backend::Val{:py}, fo::FieldOp, @@ -624,6 +662,7 @@ function backend_execution( f = py_field_operator(fo) FIELD_OPERATORS[fo.name] = f end + # check_gpu_data(args) # TODO: throw an exception in case of gpu arrays passed to the python backend p_args, p_kwargs, p_out, p_offset_provider = py_args.((args, kwargs, out, GridTools.OFFSET_PROVIDER)) if is_outermost_fo diff --git a/src/embedded/builtins.jl b/src/embedded/builtins.jl index 6ddf639..bdb512b 100644 --- a/src/embedded/builtins.jl +++ b/src/embedded/builtins.jl @@ -40,17 +40,31 @@ function min_over(field_in::Field; axis::Dimension)::Field return reduction_master(field_in, axis, minimum) end +""" + reduction_master(field_in::Field, axis::Dimension, f::Function)::Field +Performs a reduction operation (`sum`, `minimum`, `maximum`, etc.) over a specific axis dimension. +This version supports both CPU and GPU fields. +""" function reduction_master(field_in::Field, axis::Dimension, f::Function) neutral_el = get_neutral(f, eltype(field_in)) dim = get_dim_ind(field_in.dims, axis) conn = OFFSET_PROVIDER[get_dim_name(axis)] - data = dropdims( - f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim), - dims = dim - ) - return Field((field_in.dims[1:dim-1]..., field_in.dims[dim+1:end]...), data) + + if isa(field_in.data, CuArray) + # GPU version using CUDA parallelization + reduced_data = CUDA.fill(neutral_el, size(field_in.data)) + CUDA.@sync reduced_data .= f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim) + reduced_data = dropdims(reduced_data, dims = dim) + else + # CPU version + reduced_data = dropdims( + f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim), + dims = dim + ) + end + return Field((field_in.dims[1:dim-1]..., field_in.dims[dim+1:end]...), reduced_data) end get_neutral(f::typeof(sum), type::DataType) = convert(type, 0) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 0b0ad16..5f9c807 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -230,15 +230,42 @@ end # ----------------------------------------------------------------------------------------------------------------------------------------- +function is_gpu_compatible(bc::Broadcasted{ArrayStyle{Field}})::Bool + is_all_CuArray::Bool = false + has_CuArray::Bool = false + has_CPUArray::Bool = false + + for arg in bc.args + if typeof(arg) <: AbstractArray + # Check if the argument is a CuArray + if typeof(arg.data) <: CuArray + has_CuArray = true + is_all_CuArray = true + # Check if the argument is a CPU array + elseif typeof(arg.data) <: Vector + has_CPUArray = true + end + end + + # If both a CuArray and a CPU Array are present, raise an error + if has_CuArray && has_CPUArray + throw(ErrorException("Cannot have both CuArray and CPU arrays in the same args.")) + end + end + + return is_all_CuArray +end + # Creates uninitialized output object function Base.similar(bc::Broadcasted{ArrayStyle{Field}}, ::Type{ElType}) where {ElType} offsets = getproperty.(axes(bc), :start) .- 1 + is_cuarray::Bool = is_gpu_compatible(bc) Field( - bc.axes.dims, - similar(Array{ElType}, getproperty.(axes(bc), :stop) .- offsets), - bc.axes.broadcast_dims, - offsets - ) + bc.axes.dims, + similar(is_cuarray ? 
CuArray{ElType} : Array{ElType}, getproperty.(axes(bc), :stop) .- offsets), + bc.axes.broadcast_dims, + offsets + ) end # ----------------------------------------------------------------------------------------------------------------------------------------- @@ -249,17 +276,31 @@ end if axes(dest) == axes(bc) && bc.f === identity && bc.args isa Tuple{AbstractArray} # only a single input argument to broadcast! A = bc.args[1] if axes(dest) == axes(A) - return copyto!(dest, A) + if isa(A.data, CuArray) + return CUDA.copyto!(dest.data, A.data) # Use @GPUArrays copyto! + else + return copyto!(dest, A) + end end end - bc′ = Base.Broadcast.preprocess(shape(dest), bc) + if isa(dest.data, CuArray) + # Extract the function and the arguments from the broadcasted expression + f = bc.f + args = bc.args - # Performance may vary depending on whether `@inbounds` is placed outside the - # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @simd for I in eachindex(dest) - dest[I] = bc′[I] + # Apply the function f element-wise to the arguments and store the result in dest.data + CUDA.map!(f, dest.data, map(arg -> arg.data, args)...) + else + bc′ = Base.Broadcast.preprocess(shape(dest), bc) + + # Performance may vary depending on whether `@inbounds` is placed outside the + # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) + @inbounds @simd for I in eachindex(dest) + dest[I] = bc′[I] + end end + return dest end diff --git a/src/examples/example_gpu.jl b/src/examples/example_gpu.jl new file mode 100644 index 0000000..8954a70 --- /dev/null +++ b/src/examples/example_gpu.jl @@ -0,0 +1,39 @@ +using GridTools +using GridTools.ExampleMeshes.Unstructured +using CUDA +using Profile +using Debugger +using BenchmarkTools + +# Cpu + +a_cpu = Field(Cell, collect(1:2e7)) +b_cpu = Field(Cell, collect(1:2e7)) + +out_cpu = similar(a_cpu) + +out_cpu = a_cpu .+ b_cpu + +# Gpu + +a_gpu = Field(Cell, CuArray(1:2e7)) +b_gpu = Field(Cell, CuArray(1:2e7)) + +out_gpu = similar_field(a_gpu) + +out_gpu .= a_gpu .+ b_gpu + +function bench_cpu!(a_cpu, b_cpu, out_cpu) + out_cpu = a_cpu .+ b_cpu +end + +function bench_gpu!(a_gpu, b_gpu, out_gpu) + # Wrapping the execution in a CUDA.@sync block will make + # the CPU block until the queued GPU tasks are done, similar to how Base.@sync waits for distributed CPU tasks + CUDA.@sync begin + out_gpu = a_gpu .+ b_gpu + end +end + +@btime bench_cpu!($a_cpu, $b_cpu, $out_cpu) +@btime bench_gpu!($a_gpu, $b_gpu, $out_gpu) \ No newline at end of file diff --git a/test/gpu_test.jl b/test/gpu_test.jl new file mode 100644 index 0000000..3f7fecb --- /dev/null +++ b/test/gpu_test.jl @@ -0,0 +1,44 @@ +using Test +using CUDA: CuArray +using GridTools +using GridTools.ExampleMeshes.Unstructured + +@testset "Testset Simple Broadcast Addition GPU" begin + a_gpu = Field(Cell, CuArray(1.0:15.0)) + b_gpu = Field(Cell, CuArray(-2.0:-1:-16.0)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a_gpu and b_gpu do not have the same size of data." + + out_gpu = similar_field(a_gpu) + out_gpu = a_gpu .+ b_gpu + + @test all(out_gpu.data .== -1) +end + +@testset "Testset Large Broadcast Addition GPU" begin + # Initialize two large GPU fields with CuArray + a_gpu = Field(Cell, CuArray(1:2e7)) + b_gpu = Field(Cell, CuArray(1:2e7)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a_gpu and b_gpu do not have the same size of data." 
+ + out_gpu = similar_field(a_gpu) + out_gpu .= a_gpu .+ b_gpu + + expected_result = CuArray(2:2:2e7*2) + + @test all(out_gpu.data .== expected_result) +end + +@testset "Testset Field Operator Addition GPU" begin + a_gpu = Field(Cell, CuArray(1.0:15.0)) + b_gpu = Field(Cell, CuArray(-2.0:-1:-16.0)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a and b do not have the same size of data." + + out_gpu = similar_field(a_gpu) + + @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b + end + + fo_addition(a_gpu, b_gpu, backend="embedded", out=out_gpu) + @test all(out_gpu.data .== -1) +end From 5f6d164656db2330ec310e6fdb235f4d29e7c3ee Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:53:51 +0200 Subject: [PATCH 51/53] Add benchmarking suite for gpu --- benchmark/benchmarks_gpu.jl | 160 ++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 benchmark/benchmarks_gpu.jl diff --git a/benchmark/benchmarks_gpu.jl b/benchmark/benchmarks_gpu.jl new file mode 100644 index 0000000..edfc438 --- /dev/null +++ b/benchmark/benchmarks_gpu.jl @@ -0,0 +1,160 @@ +using BenchmarkTools +using CUDA +using GridTools +using GridTools.ExampleMeshes.Unstructured + +# Data size +const global STREAM_SIZE = 10_000_000 + +""" + compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: The benchmark results containing timing information (`times`). +- `a, b`: The input fields or arrays used in the benchmark. +- `out`: The output field or array used in the benchmark. + +# Returns +- A tuple `(bandwidth, data_size)` where: + - `bandwidth`: The memory bandwidth in gigabytes per second (GB/s). + - `data_size`: The total size of the data processed in bytes. +""" +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + # Ensure the sizes of the data fields are consistent + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + + # Calculate the total size of data read and written in bytes + # Read from `a` and `b`, and write to `out` + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) + + # Compute the median execution time from benchmark results in seconds (convert from nanoseconds) + time_in_seconds = median(results.times) / 1e9 + + # Calculate memory bandwidth in GB/s + bandwidth = data_size / time_in_seconds / 1e9 + + return bandwidth, data_size +end + +# GPU Setup Functions ----------------------------------------------------------------------------------------- + +""" + gpu_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the GPU broadcast addition benchmark using CuArray. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the GPU arrays to be generated. + +# Returns +- `a, b`: Two CuArray GPU arrays of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function gpu_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{CuArray{Float64,1}, CuArray{Float64,1}, Int64} + a_gpu = CuArray(rand(Float64, ARRAY_SIZE)) + b_gpu = CuArray(rand(Float64, ARRAY_SIZE)) + data_size = sizeof(a_gpu) + sizeof(b_gpu) # Total bytes processed + return a_gpu, b_gpu, data_size +end + +""" + gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the GPU field broadcast addition benchmark using CuArray. 
+ +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of CuArray floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. +""" +function gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + a_gpu = Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) + b_gpu = Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) + out_gpu = GridTools.similar_field(a_gpu) + return a_gpu, b_gpu, out_gpu +end + +# CuArray only +function gpu_broadcast_addition_array(a::CuArray{Float64}, b::CuArray{Float64})::CuArray{Float64} + return a .+ b +end + +# Fields and broadcasting +function gpu_broadcast_addition_fields(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +function arr_add_wrapper(a, b) + CUDA.@sync begin + return gpu_broadcast_addition_array(a,b) + end +end + +function field_add_wrapper(a, b) + CUDA.@sync begin + return gpu_broadcast_addition_fields(a,b) + end +end + +@field_operator function gpu_fo_addition_with_wrapper(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + CUDA.@sync begin + return a .+ b + end +end + +# Benchmarks ------------------------------------------------------------------------------------------------- + +# Create the GPU benchmark SUITE +SUITE_GPU = BenchmarkGroup() + +# Define the GPU addition benchmarks +SUITE_GPU["gpu_addition"] = BenchmarkGroup() + +# GPU broadcast addition benchmark +a_gpu, b_gpu, data_size_gpu = gpu_broadcast_addition_setup(STREAM_SIZE) +SUITE_GPU["gpu_addition"]["gpu_array_broadcast_addition"] = @benchmarkable $arr_add_wrapper($a_gpu, $b_gpu) + +# GPU Field broadcast addition benchmark # TODO(lorenzovarese): fix the CUDA.@sync, results are unrealistic +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) +SUITE_GPU["gpu_addition"]["gpu_fields_broadcast_addition"] = @benchmarkable $field_add_wrapper($a_gpu, $b_gpu) + +# GPU Field Operator broadcast addition benchmark # TODO(lorenzovarese): fix the CUDA.@sync, results are unrealistic +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) +SUITE_GPU["gpu_addition"]["gpu_field_op_broadcast_addition"] = @benchmarkable $gpu_fo_addition($a_gpu, $b_gpu, backend="embedded", out=$out_gpu) + +# Running the GPU benchmark SUITE +println("Running the GPU benchmark SUITE...") +gpu_results = run(SUITE_GPU) + +# Process and print the GPU results +gpu_array_results = gpu_results["gpu_addition"]["gpu_array_broadcast_addition"] +gpu_fields_results = gpu_results["gpu_addition"]["gpu_fields_broadcast_addition"] +gpu_fo_results = gpu_results["gpu_addition"]["gpu_field_op_broadcast_addition"] + +# Compute memory bandwidth for GPU benchmarks +gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_results, a_gpu, b_gpu, a_gpu) +gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_results, a_gpu, b_gpu, a_gpu) +gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_results, a_gpu, b_gpu, out_gpu) + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +# Output results for GPU benchmarks +println("GPU Array broadcast addition:") +println("\tData size: $data_size_arr_gpu") +println("\tBandwidth: $gpu_array_bandwidth GB/s") +println("\tTime taken: 
$(ns_to_ms(median(gpu_array_results.times))) ms\n") + +println("GPU Fields data broadcast addition:") +println("\tData size: $data_size_fields_gpu") +println("\tBandwidth: $gpu_fields_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(gpu_fields_results.times))) ms\n") + +println("GPU Field Operator broadcast addition:") +println("\tData size: $data_size_fo_gpu") +println("\tBandwidth: $gpu_fo_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(gpu_fo_results.times))) ms\n") From 47dbe38b3e129a4bb79efaecdd5845a1721abd87 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:58:18 +0200 Subject: [PATCH 52/53] Fix gpu benchmark for memory bandwidth computation --- benchmark/benchmarks_gpu.jl | 135 +++++++++++++++--------------------- 1 file changed, 56 insertions(+), 79 deletions(-) diff --git a/benchmark/benchmarks_gpu.jl b/benchmark/benchmarks_gpu.jl index edfc438..2c9d658 100644 --- a/benchmark/benchmarks_gpu.jl +++ b/benchmark/benchmarks_gpu.jl @@ -4,33 +4,25 @@ using GridTools using GridTools.ExampleMeshes.Unstructured # Data size -const global STREAM_SIZE = 10_000_000 +const STREAM_SIZE::Int64 = 10_000_000 """ - compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + compute_memory_bandwidth_addition(time_in_seconds, a, b, out)::Tuple{Float64, Int64} Function to compute the memory bandwidth for the addition benchmarks. # Arguments -- `results`: The benchmark results containing timing information (`times`). -- `a, b`: The input fields or arrays used in the benchmark. -- `out`: The output field or array used in the benchmark. +- `time_in_seconds`: The execution time in seconds. +- `STREAM_SIZE`: the size used for the arrays # Returns - A tuple `(bandwidth, data_size)` where: - `bandwidth`: The memory bandwidth in gigabytes per second (GB/s). - `data_size`: The total size of the data processed in bytes. """ -function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} - # Ensure the sizes of the data fields are consistent - @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) - +function compute_memory_bandwidth_addition(time_in_seconds::Float64, STREAM_SIZE::Int64, data_type::Type)::Tuple{Float64, Int64} # Calculate the total size of data read and written in bytes - # Read from `a` and `b`, and write to `out` - data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) - - # Compute the median execution time from benchmark results in seconds (convert from nanoseconds) - time_in_seconds = median(results.times) / 1e9 + data_size = 3 * STREAM_SIZE * sizeof(data_type) # (a + b + out), each Float64 is 8 bytes # Calculate memory bandwidth in GB/s bandwidth = data_size / time_in_seconds / 1e9 @@ -49,14 +41,14 @@ Setup function for the GPU broadcast addition benchmark using CuArray. - `ARRAY_SIZE::Int64`: The size of the GPU arrays to be generated. # Returns -- `a, b`: Two CuArray GPU arrays of size `ARRAY_SIZE`. -- `data_size`: The total size of the data processed. +- `a_gpu`, `b_gpu`, `out_gpu`: Three CuArray GPU arrays of size `ARRAY_SIZE`. 
""" -function gpu_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{CuArray{Float64,1}, CuArray{Float64,1}, Int64} - a_gpu = CuArray(rand(Float64, ARRAY_SIZE)) - b_gpu = CuArray(rand(Float64, ARRAY_SIZE)) - data_size = sizeof(a_gpu) + sizeof(b_gpu) # Total bytes processed - return a_gpu, b_gpu, data_size +function gpu_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{CuArray{Float64,1}, CuArray{Float64,1}, CuArray{Float64,1}} + randcuarr = () -> CuArray(rand(Float64, ARRAY_SIZE)) + a_gpu = randcuarr() + b_gpu = randcuarr() + out_gpu = randcuarr() + return a_gpu, b_gpu, out_gpu end """ @@ -72,89 +64,74 @@ Setup function for the GPU field broadcast addition benchmark using CuArray. - `out`: An output field similar to `a`, used for storing operation results. """ function gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} - a_gpu = Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) - b_gpu = Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) - out_gpu = GridTools.similar_field(a_gpu) + randfieldcuarr = () -> Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) + a_gpu = randfieldcuarr() + b_gpu = randfieldcuarr() + out_gpu = randfieldcuarr() return a_gpu, b_gpu, out_gpu end # CuArray only -function gpu_broadcast_addition_array(a::CuArray{Float64}, b::CuArray{Float64})::CuArray{Float64} - return a .+ b -end - -# Fields and broadcasting -function gpu_broadcast_addition_fields(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} - return a .+ b -end - -function arr_add_wrapper(a, b) +function arr_add_wrapper!(out::CuArray{Float64,1}, a::CuArray{Float64,1}, b::CuArray{Float64,1}) CUDA.@sync begin - return gpu_broadcast_addition_array(a,b) + out = a .+ b end end -function field_add_wrapper(a, b) +# Fields only +function field_add_wrapper!(out::Field{Tuple{Cell_},Float64}, a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64}) CUDA.@sync begin - return gpu_broadcast_addition_fields(a,b) + out = a .+ b end end -@field_operator function gpu_fo_addition_with_wrapper(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} +# Field operator +@field_operator function gpu_fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +function gpu_fo_addition_wrapper!(out::Field{Tuple{Cell_},Float64}, a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64}) CUDA.@sync begin - return a .+ b + gpu_fo_addition(a, b, backend="embedded", out=out) end end -# Benchmarks ------------------------------------------------------------------------------------------------- +# Benchmarks with @belapsed -# Create the GPU benchmark SUITE -SUITE_GPU = BenchmarkGroup() +# CuArray ----------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_broadcast_addition_setup(STREAM_SIZE) -# Define the GPU addition benchmarks -SUITE_GPU["gpu_addition"] = BenchmarkGroup() +println("Benchmarking GPU array broadcast addition:") +gpu_array_time = @belapsed arr_add_wrapper!($out_gpu, $a_gpu, $b_gpu) -# GPU broadcast addition benchmark -a_gpu, b_gpu, data_size_gpu = gpu_broadcast_addition_setup(STREAM_SIZE) -SUITE_GPU["gpu_addition"]["gpu_array_broadcast_addition"] = @benchmarkable $arr_add_wrapper($a_gpu, $b_gpu) - -# GPU Field broadcast addition benchmark # TODO(lorenzovarese): fix the CUDA.@sync, results are unrealistic -a_gpu, b_gpu, out_gpu = 
gpu_fields_broadcast_addition_setup(STREAM_SIZE) -SUITE_GPU["gpu_addition"]["gpu_fields_broadcast_addition"] = @benchmarkable $field_add_wrapper($a_gpu, $b_gpu) +# Compute memory bandwidth for GPU array benchmark +gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_time, STREAM_SIZE, eltype(a_gpu)) +println("GPU Array broadcast addition:") +println("\tData size: $data_size_arr_gpu") +println("\tTime: $gpu_array_time") +println("\tBandwidth: $gpu_array_bandwidth GB/s\n") -# GPU Field Operator broadcast addition benchmark # TODO(lorenzovarese): fix the CUDA.@sync, results are unrealistic +# Fields ------------------------------------------------------------------------------------------------------------- a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) -SUITE_GPU["gpu_addition"]["gpu_field_op_broadcast_addition"] = @benchmarkable $gpu_fo_addition($a_gpu, $b_gpu, backend="embedded", out=$out_gpu) - -# Running the GPU benchmark SUITE -println("Running the GPU benchmark SUITE...") -gpu_results = run(SUITE_GPU) - -# Process and print the GPU results -gpu_array_results = gpu_results["gpu_addition"]["gpu_array_broadcast_addition"] -gpu_fields_results = gpu_results["gpu_addition"]["gpu_fields_broadcast_addition"] -gpu_fo_results = gpu_results["gpu_addition"]["gpu_field_op_broadcast_addition"] -# Compute memory bandwidth for GPU benchmarks -gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_results, a_gpu, b_gpu, a_gpu) -gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_results, a_gpu, b_gpu, a_gpu) -gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_results, a_gpu, b_gpu, out_gpu) +println("Benchmarking GPU fields broadcast addition:") +gpu_fields_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) -# Function to convert nanoseconds to milliseconds for clearer output -ns_to_ms(time_ns) = time_ns / 1e6 +# Compute memory bandwidth for GPU fields benchmark +gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_time, STREAM_SIZE, eltype(a_gpu.data)) +println("GPU Fields broadcast addition:") +println("\tData size: $data_size_fields_gpu") +println("\tTime: $gpu_fields_time") +println("\tBandwidth: $gpu_fields_bandwidth GB/s\n") -# Output results for GPU benchmarks -println("GPU Array broadcast addition:") -println("\tData size: $data_size_arr_gpu") -println("\tBandwidth: $gpu_array_bandwidth GB/s") -println("\tTime taken: $(ns_to_ms(median(gpu_array_results.times))) ms\n") +# Field operator ------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) -println("GPU Fields data broadcast addition:") -println("\tData size: $data_size_fields_gpu") -println("\tBandwidth: $gpu_fields_bandwidth GB/s") -println("\tTime taken: $(ns_to_ms(median(gpu_fields_results.times))) ms\n") +println("Benchmarking GPU field operator broadcast addition:") +gpu_fo_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) +# Compute memory bandwidth for GPU field operator benchmark +gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_time, STREAM_SIZE, eltype(a_gpu.data)) println("GPU Field Operator broadcast addition:") println("\tData size: $data_size_fo_gpu") -println("\tBandwidth: $gpu_fo_bandwidth GB/s") -println("\tTime taken: $(ns_to_ms(median(gpu_fo_results.times))) ms\n") 
+println("\tBandwidth: $gpu_fo_bandwidth GB/s\n") From b1fe0b21bc3c158749c7bb8c36f631c3b3a94a31 Mon Sep 17 00:00:00 2001 From: Lorenzo Varese <55581163+lorenzovarese@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:05:22 +0200 Subject: [PATCH 53/53] Improve the printing of the gpu benchmark results --- benchmark/benchmarks_gpu.jl | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmarks_gpu.jl b/benchmark/benchmarks_gpu.jl index 2c9d658..fa6e507 100644 --- a/benchmark/benchmarks_gpu.jl +++ b/benchmark/benchmarks_gpu.jl @@ -30,6 +30,11 @@ function compute_memory_bandwidth_addition(time_in_seconds::Float64, STREAM_SIZE return bandwidth, data_size end +# Util for pretty print the results +function format_number_with_dots(n::Int) + return reverse(join(Iterators.partition(reverse(string(n)), 3), ".")) +end + # GPU Setup Functions ----------------------------------------------------------------------------------------- """ @@ -107,8 +112,8 @@ gpu_array_time = @belapsed arr_add_wrapper!($out_gpu, $a_gpu, $b_gpu) # Compute memory bandwidth for GPU array benchmark gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_time, STREAM_SIZE, eltype(a_gpu)) println("GPU Array broadcast addition:") -println("\tData size: $data_size_arr_gpu") -println("\tTime: $gpu_array_time") +println("\tData size: $(format_number_with_dots(data_size_arr_gpu)) bytes") +println("\tTime: $gpu_array_time s") println("\tBandwidth: $gpu_array_bandwidth GB/s\n") # Fields ------------------------------------------------------------------------------------------------------------- @@ -120,8 +125,8 @@ gpu_fields_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) # Compute memory bandwidth for GPU fields benchmark gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_time, STREAM_SIZE, eltype(a_gpu.data)) println("GPU Fields broadcast addition:") -println("\tData size: $data_size_fields_gpu") -println("\tTime: $gpu_fields_time") +println("\tData size: $(format_number_with_dots(data_size_fields_gpu)) bytes") +println("\tTime: $gpu_fields_time s") println("\tBandwidth: $gpu_fields_bandwidth GB/s\n") # Field operator ------------------------------------------------------------------------------------------------------- @@ -133,5 +138,6 @@ gpu_fo_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) # Compute memory bandwidth for GPU field operator benchmark gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_time, STREAM_SIZE, eltype(a_gpu.data)) println("GPU Field Operator broadcast addition:") -println("\tData size: $data_size_fo_gpu") +println("\tData size: $(format_number_with_dots(data_size_fo_gpu)) bytes") +println("\tTime: $gpu_fo_time s") println("\tBandwidth: $gpu_fo_bandwidth GB/s\n")
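For reference, the STREAM-style bandwidth formula used in benchmarks_gpu.jl can be sanity-checked in isolation. The snippet below is a minimal standalone sketch that restates the formula with a hypothetical helper name (`bandwidth_check` is not part of the patch); like the benchmark, it assumes one addition reads `a` and `b` and writes `out`, i.e. three arrays of STREAM_SIZE elements of the given element type.

# Minimal sketch of the bandwidth computation (hypothetical helper, not in the patch)
function bandwidth_check(time_in_seconds::Float64, stream_size::Int, data_type::Type)
    data_size = 3 * stream_size * sizeof(data_type)  # bytes moved: read a, read b, write out
    return data_size / time_in_seconds / 1e9         # GB/s
end

# Worked example: 10_000_000 Float64 elements moved in 2 ms
# 3 * 10^7 * 8 bytes = 2.4e8 bytes; 2.4e8 / 0.002 / 1e9 = 120 GB/s
bandwidth_check(2e-3, 10_000_000, Float64)  # ≈ 120.0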