diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 71b10e2..a8bee98 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -17,7 +17,42 @@ jobs: - uses: julia-actions/setup-julia@v1 with: version: "1.8" - - uses: julia-actions/cache@v1 + - uses: actions/cache@v2 + name: Cache Julia packages + with: + path: ~/.julia + key: ${{ runner.os }}-julia-${{ hashFiles('**/Project.toml', '**/Manifest.toml') }} + restore-keys: | + ${{ runner.os }}-julia- + - uses: actions/cache@v2 + name: Cache Python packages + with: + path: ~/gt4py-venv + key: ${{ runner.os }}-python-${{ hashFiles('**/requirements-dev.txt') }} + restore-keys: | + ${{ runner.os }}-python- + - name: Set up Python environment + run: | + sudo apt-get update + sudo apt-get install python3-pip python3-venv + python3 -m venv ~/gt4py-venv + source ~/gt4py-venv/bin/activate + python3 -m pip install --upgrade pip + - name: Install GT4Py from specific branch + run: | + source ~/gt4py-venv/bin/activate + git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py + cd ~/gt4py + pip install -r requirements-dev.txt + pip install -e . + - name: Install and Configure PyCall + run: | + source ~/gt4py-venv/bin/activate + julia --project=. -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="python"; Pkg.build("PyCall");' + - name: Check PyCall Configuration + run: | + source ~/gt4py-venv/bin/activate + julia --project=. -e 'using PyCall; @show PyCall.python' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -27,20 +62,20 @@ jobs: env: JULIA_NUM_THREADS: 2 run: | - # Lightweight build step, as sometimes the runner runs out of memory: - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(;url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.build("AirspeedVelocity")' + julia --project=. 
-e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; using Pkg; Pkg.add(url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git"); Pkg.build("AirspeedVelocity")' - name: Add ~/.julia/bin to PATH run: | echo "$HOME/.julia/bin" >> $GITHUB_PATH - name: Run benchmarks run: | + source ~/gt4py-venv/bin/activate echo $PATH ls -l ~/.julia/bin mkdir results benchpkg ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --url=${{ github.event.repository.clone_url }} --bench-on="${{github.event.repository.default_branch}}" --output-dir=results/ --tune - name: Create plots from benchmarks run: | + source ~/gt4py-venv/bin/activate mkdir -p plots benchpkgplot ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --npart=10 --format=png --input-dir=results/ --output-dir=plots/ - name: Upload plot as artifact @@ -50,6 +85,7 @@ jobs: path: plots - name: Create markdown table from benchmarks run: | + source ~/gt4py-venv/bin/activate benchpkgtable ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --input-dir=results/ --ratio > table.md echo '### Benchmark Results' > body.md echo '' >> body.md @@ -75,4 +111,4 @@ jobs: # comment-id: ${{ steps.fcbenchmark.outputs.comment-id }} issue-number: ${{ github.event.pull_request.number }} body-path: body.md - edit-mode: replace \ No newline at end of file + edit-mode: replace diff --git a/.gitignore b/.gitignore index 92d93be..dee06b9 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,17 @@ docs/build/ .DS_Store Manifest.toml + +# Python Env +.venv +env_setup.sh +.python-version + +# Misc +**/.DS_Store +.vscode + +# Ignore benchmark (benchpkg) results +results_GridTools@* +plot_*.png +plot_*.pdf diff --git a/Project.toml b/Project.toml index ff7f05d..2ab63f0 100644 --- 
a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" diff --git a/advection/README.md b/advection/README.md index b838658..cfae700 100644 --- a/advection/README.md +++ b/advection/README.md @@ -1,6 +1,6 @@ -### README for Running `advection_miniapp.jl` +### README for Running `advection_setup.jl` using `run_simulation_loop.jl` -This README provides instructions on how to run the `advection_miniapp.jl` script for simulating advection using the Atlas library. The script allows for terminal visualization, which can be enabled as described below. +This README provides instructions on how to run the `run_simulation_loop.jl` script for simulating advection using the Atlas library. The script allows for terminal visualization, which can be enabled as described below. #### Prerequisites @@ -15,23 +15,23 @@ This README provides instructions on how to run the `advection_miniapp.jl` scrip ``` 2. **Enabling Visualization** (optional): - - The script has a `VISUALIZATION_FLAG` that can be set to enable or disable visualization on the terminal. Ensure that this flag is set to `true` in the `advection_miniapp.jl` script if you wish to enable visualization. - - Note: Other parameters such as the number of iterations can be changed in the `# Simulation Parameters` section of the script. + - The script has a `VISUALIZATION_FLAG` that can be set to enable or disable visualization on the terminal. Ensure that this flag is set to `true` in the `run_simulation_loop.jl` script if you wish to enable visualization. 
+ - Note: Other parameters such as the number of iterations can be changed in the `# Simulation Parameters` section of the `advection_setup.jl` script. #### Running the Simulation 1. **Running the Script**: - - Use the following command to run the `advection_miniapp.jl` script with Julia: + - Use the following command to run the `run_simulation_loop.jl` script with Julia: ```sh - julia --color=yes --project=$GRIDTOOLS_JL_PATH/GridTools.jl $GRIDTOOLS_JL_PATH/GridTools.jl/src/examples/advection/advection_miniapp.jl + julia --color=yes --project=$GRIDTOOLS_JL_PATH/GridTools.jl $GRIDTOOLS_JL_PATH/GridTools.jl/src/examples/advection/run_simulation_loop.jl ``` #### Example -Here is an example of how to set the `VISUALIZATION_FLAG` in the `advection_miniapp.jl` script and run the simulation: +Here is an example of how to set the `VISUALIZATION_FLAG` in the `run_simulation_loop.jl` script and run the simulation: 1. **Setting the Visualization Flag**: - - Open the `advection_miniapp.jl` script. + - Open the `run_simulation_loop.jl` script. - Set the `VISUALIZATION_FLAG` to `true`: ```julia const VISUALIZATION_FLAG = true @@ -42,7 +42,7 @@ Here is an example of how to set the `VISUALIZATION_FLAG` in the `advection_mini - Run the script with the following command: ```sh export GRIDTOOLS_JL_PATH=... - julia --color=yes --project=. $GRIDTOOLS_JL_PATH/src/examples/advection/advection_miniapp.jl + julia --color=yes --project=. $GRIDTOOLS_JL_PATH/src/examples/advection/run_simulation_loop.jl ``` -By following these steps, you should be able to run the `advection_miniapp.jl` script and visualize the advection simulation results on your terminal. +By following these steps, you should be able to run the `run_simulation_loop.jl` script and visualize the advection simulation results on your terminal. 
diff --git a/advection/advection.jl b/advection/advection.jl index 19f7741..159a2ef 100644 --- a/advection/advection.jl +++ b/advection/advection.jl @@ -6,11 +6,10 @@ level_indices::Field{Tuple{K_}, Int64}, num_level::Int64 )::Field{Tuple{Vertex_, K_}, Float64} - return where( - level_indices .== num_level - 1, + level_indices .== 0, lower, - where(slice(level_indices .== 0, 1:29), upper, interior) + where(slice(level_indices .== 29, 2:30), upper, interior) ) end @@ -149,7 +148,8 @@ end )::Field{Tuple{Vertex_, K_}, Float64} zrhin = (1.0 ./ vol) .* neighbor_sum( - -min.(0.0, flux(V2E)) .* max.(0.0, dual_face_orientation) - + # TODO: fix the 0-min workaround due to the binary/unary operation issue + (broadcast(0., (Vertex, V2EDim, K)) .- min.(0.0, flux(V2E))) .* max.(0.0, dual_face_orientation) - max.(0.0, flux(V2E)) .* min.(0.0, dual_face_orientation), axis = V2EDim, ) @@ -227,15 +227,6 @@ end dual_face_orientation::Field{Tuple{Vertex_, V2EDim_}, Float64}, dual_face_normal_weighted_x::Field{Tuple{Edge_}, Float64}, dual_face_normal_weighted_y::Field{Tuple{Edge_}, Float64}, - tmp_vertex_1::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_2::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_3::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_4::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_5::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_6::Field{Tuple{Vertex_, K_}, Float64}, - tmp_edge_1::Field{Tuple{Edge_, K_}, Float64}, - tmp_edge_2::Field{Tuple{Edge_, K_}, Float64}, - tmp_edge_3::Field{Tuple{Edge_, K_}, Float64}, ) tmp_edge_1 = advector_normal( diff --git a/advection/advection_miniapp.jl b/advection/advection_setup.jl similarity index 66% rename from advection/advection_miniapp.jl rename to advection/advection_setup.jl index b4230a9..3153416 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_setup.jl @@ -1,55 +1,26 @@ -# Advection Miniapp -# This script demonstrates an advection simulation using the Atlas library. 
+# Advection Setup +# This script demonstrates the setup of an advection simulation using the Atlas library. using Printf -using Debugger using Statistics -using Profile using GridTools - -const global VISUALIZATION_FLAG::Bool=false - -# Mesh Definitions -------------------------------------------------------------------------------------------- -# Define dimensions for the mesh -Cell_ = Dimension{:Cell_, HORIZONTAL} -Edge_ = Dimension{:Edge_, HORIZONTAL} -Vertex_ = Dimension{:Vertex_, HORIZONTAL} -K_ = Dimension{:K_, VERTICAL} -V2VDim_ = Dimension{:V2V_, LOCAL} -V2EDim_ = Dimension{:V2E_, LOCAL} -E2VDim_ = Dimension{:E2V_, LOCAL} - -# Instantiate dimension objects -Cell = Cell_() -K = K_() -Edge = Edge_() -Vertex = Vertex_() -V2VDim = V2VDim_() -V2EDim = V2EDim_() -E2VDim = E2VDim_() - -# Define field offsets to describe the relationships between different dimensions -V2V = FieldOffset("V2V", source = Vertex, target = (Vertex, V2VDim)) -E2V = FieldOffset("E2V", source = Vertex, target = (Edge, E2VDim)) -V2E = FieldOffset("V2E", source = Edge, target = (Vertex, V2EDim)) -Koff = FieldOffset("Koff", source = K, target = K) +using GridTools.ExampleMeshes.Unstructured +using GridTools.AtlasMeshes # Include additional necessary files for mesh, state container, metric calculations, and advection operations -include("../src/atlas/atlas_mesh.jl") include("state_container.jl") include("metric.jl") include("advection.jl") -include("visualization_utils.jl") # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation -grid = atlas.StructuredGrid("O50") +grid = atlas.StructuredGrid("O90") mesh = AtlasMesh(grid, num_level = 30) # Simulation Parameters --------------------------------------------------------------------------------------- δt = 1800.0 # time step in s niter = 50 -ε = 1.0e-8 +ϵ = 1.0e-8 # Calculate metric properties from the mesh metric = m_from_mesh(mesh) @@ 
-188,53 +159,3 @@ nabla_z( out = tmp_fields["tmp_vertex_2"], offset_provider = mesh.offset_provider ) - -if VISUALIZATION_FLAG - # Precompute the mapping between the unstructured domain to the structured one for ASCII art visualization - grid_size = 50 - mapping = precompute_mapping(mesh, xlim, ylim, grid_size) -end - -# Main Simulation Loop ---------------------------------------------------------------------------------------- -for i = 1:niter - # Perform the upwind advection scheme to update the scalar field (rho) - upwind_scheme( - state.rho, - δt, - mesh.vol, - metric.gac, - state.vel[1], - state.vel[2], - state.vel[3], - mesh.pole_edge_mask, - mesh.dual_face_orientation, - mesh.dual_face_normal_weighted_x, - mesh.dual_face_normal_weighted_y, - out = state_next.rho, - offset_provider = mesh.offset_provider - ) - - # Print the current timestep - println("Timestep $i") - - if VISUALIZATION_FLAG - # Print the current state as ASCII art every 5 timesteps - print_state_ascii(state, mesh, mapping, i, grid_size) - end - - # TODO: make a function out of this switch - # Swap the current and next state - temp = state - global state = state_next - global state_next = temp - - # Update the periodic boundary layers - update_periodic_layers(mesh, state.rho) -end - -# Output the final statistics for the scalar field (rho) and velocity fields -println( - "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" -) -println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") -println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") diff --git a/advection/run_simulation_loop.jl b/advection/run_simulation_loop.jl new file mode 100644 index 0000000..2c034a6 --- /dev/null +++ b/advection/run_simulation_loop.jl @@ -0,0 +1,62 @@ +# Run Advection Miniapp Simulation +# This script demonstrates an advection simulation using the Atlas library. 
+ +include("visualization_utils.jl") +include("advection_setup.jl") + +const global VISUALIZATION_FLAG::Bool=false +const global VERBOSE_FLAG::Bool=true + +if VISUALIZATION_FLAG + # Precompute the mapping between the unstructured domain to the structured one for ASCII art visualization + grid_size = 50 + mapping = precompute_mapping(mesh, xlim, ylim, grid_size) +end + +# Main Simulation Loop ---------------------------------------------------------------------------------------- +for i = 1:niter + # Perform the upwind advection scheme to update the scalar field (rho) + upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + + # Print the current timestep + if VERBOSE_FLAG + println("Timestep $i") + end + + if VISUALIZATION_FLAG + # Print the current state as ASCII art every 5 timesteps + print_state_ascii(state, mesh, mapping, i, grid_size) + end + + # TODO: make a function out of this switch + # Swap the current and next state + temp = state + global state = state_next + global state_next = temp + + # Update the periodic boundary layers + update_periodic_layers(mesh, state.rho) +end + +if VERBOSE_FLAG + # Output the final statistics for the scalar field (rho) and velocity fields + println( + "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" + ) + println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") + println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") +end diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..35fcc3b --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,162 @@ +# Benchmark Guide 🧭📈 + +## Installation + +To install the benchmark CLI, execute the following command: 
+ +```bash +julia -e 'using Pkg; Pkg.add("AirspeedVelocity"); Pkg.build("AirspeedVelocity")' +``` + +This installation will create three executables in the `~/.julia/bin` folder: `benchpkg`, `benchpkgplot`, and `benchpkgtable`. It is necessary to add them to your `$PATH` to use them from any terminal session. + +### Add to PATH Temporarily + +To temporarily add the path to your session: + +```bash +export PATH="$PATH:~/.julia/bin" +``` + +### Add to PATH Permanently + +To permanently add the executables to your path, append the following line to your `.zshrc` or `.bashrc` file: + +```bash +echo 'export PATH="$PATH:~/.julia/bin"' >> ~/.zshrc # For zsh users +echo 'export PATH="$PATH:~/.julia/bin"' >> ~/.bashrc # For bash users +``` + +## Running Benchmarks + +To run benchmarks, simply execute the following command in the shell: + +```bash +benchpkg +``` + +and it will: + +1. Figure out the package name (from Project.toml) +2. Figure out the default branch name to compare the dirty state of your repo against +3. Evaluate all the benchmarks in benchmarks/benchmark.jl (BenchmarkTools.jl format – i.e., const SUITE = BenchmarkGroup()) +4. Print the result in a nicely formatted markdown table + +You can use the `--filter` option to quickly check if the load time has worsened compared to the master branch: + +```bash +benchpkg --filter=time_to_load +``` + +The `benchpkg` was updated in June 2024 to automate the benchmark without specifying the parameters. +To specify additional conditions in `benchpkg` and to work with `benchpkgplot` consult the help command (`--help`). + +## Comparing Two or More Different Revisions (States) + +To compare two or more different states of your codebase, you can use revisions. In this context, a **revision** refers to a specific state of the repository, which can be identified by a commit hash or a tag. 
+ +### (Reminder) What is a Revision? + +A **revision** in Git is an identifier that refers to a specific state of the repository at a particular point in time. Revisions can be specified using: +- **Commit Hashes**: A unique SHA-1 identifier for each commit, e.g., `8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd`. +- **Tags**: Human-readable names assigned to specific commits, often used to mark release points (e.g., `v1.0.0`). + +### How to Add a Tag + +You can create a tag in Git by using the following command: + +```bash +git tag -a <tagname> -m "Tag message" +``` + +For example, to tag the current commit with `v1.0.0`, you would run: + +```bash +git tag -a v1.0.0 -m "Improvement using @threads instead of @simd in broadcasting" +``` + +To push the tag to the remote repository, use: + +```bash +git push origin <tagname> +``` + +For example: + +```bash +git push origin v1.0.0 +``` + +To see information about all tags, such as the commit they point to and the tag messages, use: + +```bash +git show-ref --tags && git tag -n | while IFS= read -r line; do echo "$line"; done +``` + +### Example: Using Commit Hashes to Compare Revisions + +Here is an example of how to use commit hashes to compare different revisions: + +```bash +benchpkg --rev=8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd,6fb48706f988613860c6c98beef32c32e900737b \ + --bench-on=8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd --exeflags="--threads=8" +``` + +In this example, `benchpkg` compares the two specified revisions, with the first hash being the baseline for comparison. + +### Example: Using Tags to Compare Revisions + +Here’s how you can use tags instead of commit hashes: + +1. **Create Tags**: + Suppose you want to tag the two commits: + + ```bash + git tag -a v1.0.0 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd -m "Tag message for v1.0.0" + git tag -a v1.1.0 6fb48706f988613860c6c98beef32c32e900737b -m "Tag message for v1.1.0" + ``` + +2. 
**Use Tags in `benchpkg`**: + Once the tags are set, you can use them in the comparison: + + ```bash + benchpkg --rev=v1.0.0,v1.1.0 --bench-on=v1.0.0 --exeflags="--threads=8" + ``` + +### How to Remove a Tag + +If you need to remove a tag from your repository, you can do so with the following commands: + +1. **Delete the tag locally**: + + ```bash + git tag -d <tagname> + ``` + + For example: + + ```bash + git tag -d v1.0.0 + ``` + +2. **Delete the tag from the remote repository**: + + ```bash + git push origin --delete <tagname> + ``` + + For example: + + ```bash + git push origin --delete v1.0.0 + ``` + +## Developer Notes + +1. The `benchpkg` tool compares different revisions, allowing you to specify the commits or tags you wish to compare. It is crucial to ensure that both commits include all necessary dependencies; otherwise, the dependencies might not be resolved. + +2. **AirSpeedVelocity**: Note that AirSpeedVelocity requires the benchmarking suite to be named `SUITE`. Any other names will not be recognized, which could lead to errors in your benchmarking process. diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 8255aca..87c404f 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -1,16 +1,385 @@ -using Pkg -path_to_package = joinpath(@__DIR__, "..") # Assuming the benchmarks.jl file is in the "benchmark" directory -push!(LOAD_PATH, path_to_package) using BenchmarkTools +using Statistics using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian +# Data size +const global STREAM_SIZE = 10_000_000 + +# Utils ------------------------------------------------------------------------------------------------------ + +# Useful for the benchmark of the field remapping operation +function create_large_connectivity(size::Int) + edge_to_cell_table = vcat([rand(1:size, (1, 2)) for _ in 1:size]...) + cell_to_edge_table = vcat([rand(1:size, (1, 3)) for _ in 1:size]...) 
+ + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. + +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. 
+""" +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth, data_size +end + +# Operations ------------------------------------------------------------------------------------------------- + +""" + single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + +Setup function to create a field and a similar output field for benchmarking operations that require a single input field. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the field to be generated. + +# Returns +- `a`: A randomly generated field of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. +""" +function single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, out +end + +""" + array_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function array_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{Array{Float64,1}, Array{Float64,1}, Int64} + a = rand(Float64, ARRAY_SIZE) + b = rand(Float64, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size +end + +""" + broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) + +Core operation for the Julia broadcast addition benchmark. + +# Arguments +- `a, b`: Two arrays to be added. 
+ +# Returns +- The result of element-wise addition of `a` and `b`. +""" +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64})::Array{Float64,1} + return a .+ b +end + +""" + broadcast_addition(a::Field, b::Field) + +Core operation for the broadcast addition of two Field benchmark. +Useful to asses and track possible overhead on fields. + +# Arguments +- `a, b`: Two field to be added. + +# Returns +- The result of element-wise addition of the data of the fields `a` and `b`. +""" +function broadcast_addition_fields(a::Field, b::Field)::Field + return a .+ b +end + +""" + fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the field operator broadcast addition benchmark. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out +end + +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the sine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. 
+ +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +function sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the cosine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +function cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the sine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +@field_operator function fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the cosine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +@field_operator function fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that performs remapping from cell-based data to edge-based data. 
+ +This operator utilizes a connectivity table (`E2C`) to map the values from cells to edges, implying a transformation from the cell-centered field to an edge-centered field based on predefined relationships in the connectivity table. + +# Arguments +- `a`: Input field containing Float64 data structured around cells. + +# Returns +- A new field where each element represents data remapped from cells to edges, structured as specified by the edge-to-cell connectivity. +""" +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) +end + +""" + fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that computes the sum of neighboring cell values for each edge. This function leverages the connectivity table (`E2C`), which defines the relationship between edges and cells, to sum the values of cells that are connected to each edge. + +The summation is performed across the dimension specified by `E2CDim`, ensuring that each edge aggregates values from its associated cells correctly. + +# Arguments +- `a`: Input field containing Float64 data, where each cell contains a numerical value. + +# Returns +- A new field where each edge holds the summed value of its neighboring cells, based on the edge-to-cell connectivity defined in `E2C`. 
+""" +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + +# Benchmarks ------------------------------------------------------------------------------------------------- + +# Create the benchmark SUITE SUITE = BenchmarkGroup() -SUITE["arith_broadcast"] = BenchmarkGroup() +# Define the main groups +SUITE["addition"] = BenchmarkGroup() + +# Julia broadcast addition benchmark +a, b, data_size = array_broadcast_addition_setup(STREAM_SIZE) +SUITE["addition"]["array_broadcast_addition"] = @benchmarkable broadcast_addition_array(a, b) setup=((a, b, data_size) = $array_broadcast_addition_setup($STREAM_SIZE); ) #a=$a; b=$b) + +# Field broadcast addition benchmark +a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) +SUITE["addition"]["fields_broadcast_addition"] = @benchmarkable broadcast_addition_fields($a, $b) + +# Field Operator broadcast addition benchmark +a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) +SUITE["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) + +# Sine without field operator benchmark +a, out = single_field_setup(STREAM_SIZE) +SUITE["trigonometry"]["sin"] = @benchmarkable sin_without_fo($a) + +# Field operator sine benchmark +a, out = single_field_setup(STREAM_SIZE) +SUITE["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embedded", out=$out) + +# Cosine without field operator benchmark +a, out = single_field_setup(STREAM_SIZE) +SUITE["trigonometry"]["cos"] = @benchmarkable cos_without_fo($a) + +# Field operator cosine benchmark +a, out = single_field_setup(STREAM_SIZE) +SUITE["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) + +# Benchmark the field remapping operation +offset_provider = create_large_connectivity(STREAM_SIZE) +a, out = single_field_setup(STREAM_SIZE) +SUITE["remapping"]["field_operator"] = + 
@benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out) + +# Benchmark the field neighbor sum operation +offset_provider = create_large_connectivity(STREAM_SIZE) +a, out = single_field_setup(STREAM_SIZE) +SUITE["neighbor_sum"]["field_operator"] = + @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out) + +# Run the benchmark SUITE +println("Running the benchmark SUITE...") +results = run(SUITE) + +# Process the results +array_results = results["addition"]["array_broadcast_addition"] +fields_results = results["addition"]["fields_broadcast_addition"] +fo_results = results["addition"]["field_op_broadcast_addition"] +sin_results = results["trigonometry"]["sin"] +fo_sin_results = results["trigonometry"]["field_op_sin"] +cos_results = results["trigonometry"]["cos"] +fo_cos_results = results["trigonometry"]["field_op_cos"] +remapping_results = results["remapping"]["field_operator"] +neighbor_sum_results = results["neighbor_sum"]["field_operator"] + +# Compute memory bandwidth +array_bandwidth, data_size_arr = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a +fields_bandwidth, data_size_fields = compute_memory_bandwidth_addition(fields_results, a, b, a) +fo_bandwidth, data_size_fo = compute_memory_bandwidth_addition(fo_results, a, b, out) + +sin_bandwidth = compute_memory_bandwidth_single(sin_results, a) +fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) +cos_bandwidth = compute_memory_bandwidth_single(cos_results, a) +fo_cos_bandwidth = compute_memory_bandwidth_single(fo_cos_results, a) + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +# Process and print the results along with the time taken for each +println("Array broadcast addition:") +println("\tData size: $data_size_arr") +println("\tBandwidth: $array_bandwidth GB/s") +println("\tTime 
taken: $(ns_to_ms(median(array_results.times))) ms\n") + +println("Fields data broadcast addition:") +println("\tData size: $data_size_fields") +println("\tBandwidth: $fields_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fields_results.times))) ms\n") + +println("Field Operator broadcast addition:") +println("\tData size: $data_size_fo") +println("\tBandwidth: $fo_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_results.times))) ms\n") + +println("Sine operation (no field operator):") +println("\tBandwidth: $sin_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(sin_results.times))) ms\n") + +println("Field Operator sine operation:") +println("\tBandwidth: $fo_sin_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_sin_results.times))) ms\n") + +println("Cosine operation (no field operator):") +println("\tBandwidth: $cos_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(cos_results.times))) ms\n") + +println("Field Operator cosine operation:") +println("\tBandwidth: $fo_cos_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_cos_results.times))) ms\n") -a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) -af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) -SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c -SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf +println("Field Operator Remapping:") +println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") -run(SUITE, verbose = true, seconds = 1) +println("Field Operator Neighbor Sum:") +println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") diff --git a/benchmark/benchmarks_advection.jl b/benchmark/benchmarks_advection.jl new file mode 100644 index 0000000..d0e5da3 --- /dev/null +++ b/benchmark/benchmarks_advection.jl @@ -0,0 +1,102 @@ +using BenchmarkTools +using Statistics +using GridTools + 
+include("../advection/advection_setup.jl") + +# Advection Benchmarks + +SUITE = BenchmarkGroup() +SUITE["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + # embedded backend + ) + +SUITE["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider, + backend = "py" + ) + +SUITE["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( + state.rho, + δt, + ϵ, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + +# TODO: disabled because the backend is not currently supporting it (the backend is too slow) +# SUITE["advection"]["mpdata_program_python_backend"] = @benchmarkable mpdata_program( +# state.rho, +# δt, +# ϵ, +# mesh.vol, +# metric.gac, +# state.vel[1], +# state.vel[2], +# state.vel[3], +# mesh.pole_edge_mask, +# mesh.dual_face_orientation, +# mesh.dual_face_normal_weighted_x, +# mesh.dual_face_normal_weighted_y, +# out = state_next.rho, +# offset_provider = mesh.offset_provider, +# backend = "py" +# ) + +# Run the benchmark suite +println("Running the advection suite...") +advection_results = run(SUITE) + +upwind_embedded_results = advection_results["advection"]["upwind_julia_embedded"] +upwind_python_backend_results = 
advection_results["advection"]["upwind_python_backend"] +mpdata_embedded_results = advection_results["advection"]["mpdata_program_julia_embedded"] +# mpdata_python_backend_results = advection_results["advection"]["mpdata_program_python_backend"] + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +println("Upwind scheme julia (embedded):") +println("\tTime taken: $(ns_to_ms(median(upwind_embedded_results.times))) ms\n") + +println("Upwind scheme julia (python backend):") +println("\tTime taken: $(ns_to_ms(median(upwind_python_backend_results.times))) ms\n") + +println("Mpdata program julia (embedded):") +println("\tTime taken: $(ns_to_ms(median(mpdata_embedded_results.times))) ms\n") + +# println("Mpdata program julia (python backend):") +# println("\tTime taken: $(ns_to_ms(median(mpdata_python_backend_results.times))) ms\n") diff --git a/benchmark/benchmarks_gpu.jl b/benchmark/benchmarks_gpu.jl new file mode 100644 index 0000000..fa6e507 --- /dev/null +++ b/benchmark/benchmarks_gpu.jl @@ -0,0 +1,143 @@ +using BenchmarkTools +using CUDA +using GridTools +using GridTools.ExampleMeshes.Unstructured + +# Data size +const STREAM_SIZE::Int64 = 10_000_000 + +""" + compute_memory_bandwidth_addition(time_in_seconds, a, b, out)::Tuple{Float64, Int64} + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `time_in_seconds`: The execution time in seconds. +- `STREAM_SIZE`: the size used for the arrays + +# Returns +- A tuple `(bandwidth, data_size)` where: + - `bandwidth`: The memory bandwidth in gigabytes per second (GB/s). + - `data_size`: The total size of the data processed in bytes. 
+""" +function compute_memory_bandwidth_addition(time_in_seconds::Float64, STREAM_SIZE::Int64, data_type::Type)::Tuple{Float64, Int64} + # Calculate the total size of data read and written in bytes + data_size = 3 * STREAM_SIZE * sizeof(data_type) # (a + b + out), each Float64 is 8 bytes + + # Calculate memory bandwidth in GB/s + bandwidth = data_size / time_in_seconds / 1e9 + + return bandwidth, data_size +end + +# Util for pretty print the results +function format_number_with_dots(n::Int) + return reverse(join(Iterators.partition(reverse(string(n)), 3), ".")) +end + +# GPU Setup Functions ----------------------------------------------------------------------------------------- + +""" + gpu_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the GPU broadcast addition benchmark using CuArray. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the GPU arrays to be generated. + +# Returns +- `a_gpu`, `b_gpu`, `out_gpu`: Three CuArray GPU arrays of size `ARRAY_SIZE`. +""" +function gpu_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{CuArray{Float64,1}, CuArray{Float64,1}, CuArray{Float64,1}} + randcuarr = () -> CuArray(rand(Float64, ARRAY_SIZE)) + a_gpu = randcuarr() + b_gpu = randcuarr() + out_gpu = randcuarr() + return a_gpu, b_gpu, out_gpu +end + +""" + gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the GPU field broadcast addition benchmark using CuArray. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of CuArray floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. 
+""" +function gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + randfieldcuarr = () -> Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) + a_gpu = randfieldcuarr() + b_gpu = randfieldcuarr() + out_gpu = randfieldcuarr() + return a_gpu, b_gpu, out_gpu +end + +# CuArray only +function arr_add_wrapper!(out::CuArray{Float64,1}, a::CuArray{Float64,1}, b::CuArray{Float64,1}) + CUDA.@sync begin + out = a .+ b + end +end + +# Fields only +function field_add_wrapper!(out::Field{Tuple{Cell_},Float64}, a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64}) + CUDA.@sync begin + out = a .+ b + end +end + +# Field operator +@field_operator function gpu_fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +function gpu_fo_addition_wrapper!(out::Field{Tuple{Cell_},Float64}, a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64}) + CUDA.@sync begin + gpu_fo_addition(a, b, backend="embedded", out=out) + end +end + +# Benchmarks with @belapsed + +# CuArray ----------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_broadcast_addition_setup(STREAM_SIZE) + +println("Benchmarking GPU array broadcast addition:") +gpu_array_time = @belapsed arr_add_wrapper!($out_gpu, $a_gpu, $b_gpu) + +# Compute memory bandwidth for GPU array benchmark +gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_time, STREAM_SIZE, eltype(a_gpu)) +println("GPU Array broadcast addition:") +println("\tData size: $(format_number_with_dots(data_size_arr_gpu)) bytes") +println("\tTime: $gpu_array_time s") +println("\tBandwidth: $gpu_array_bandwidth GB/s\n") + +# Fields ------------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) + +println("Benchmarking GPU 
fields broadcast addition:") +gpu_fields_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) + +# Compute memory bandwidth for GPU fields benchmark +gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_time, STREAM_SIZE, eltype(a_gpu.data)) +println("GPU Fields broadcast addition:") +println("\tData size: $(format_number_with_dots(data_size_fields_gpu)) bytes") +println("\tTime: $gpu_fields_time s") +println("\tBandwidth: $gpu_fields_bandwidth GB/s\n") + +# Field operator ------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) + +println("Benchmarking GPU field operator broadcast addition:") +gpu_fo_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) + +# Compute memory bandwidth for GPU field operator benchmark +gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_time, STREAM_SIZE, eltype(a_gpu.data)) +println("GPU Field Operator broadcast addition:") +println("\tData size: $(format_number_with_dots(data_size_fo_gpu)) bytes") +println("\tTime: $gpu_fo_time s") +println("\tBandwidth: $gpu_fo_bandwidth GB/s\n") diff --git a/benchmark/benchmarks_old.jl b/benchmark/benchmarks_old.jl new file mode 100644 index 0000000..0bb429f --- /dev/null +++ b/benchmark/benchmarks_old.jl @@ -0,0 +1,103 @@ +using Pkg +path_to_package = joinpath(@__DIR__, "..") # Assuming the benchmarks.jl file is in the "benchmark" directory +push!(LOAD_PATH, path_to_package) +using BenchmarkTools +using GridTools + +# Mesh definitions ------------------------------------------------------------------------------------------- +# const global Cell_ = Dimension{:Cell_, HORIZONTAL} +# const global K_ = Dimension{:K_, HORIZONTAL} +# const global Cell = Cell_() +# const global K = K_() +# const global Edge_ = Dimension{:Edge_, HORIZONTAL} +# const global Edge = Edge_() +# const global E2CDim_ = 
Dimension{:E2CDim_, LOCAL} +# const global E2CDim = E2CDim_() + + +# function setup_simple_connectivity()::Dict{String,Connectivity} +# edge_to_cell_table = [ +# [1 -1]; +# [3 -1]; +# [3 -1]; +# [4 -1]; +# [5 -1]; +# [6 -1]; +# [1 6]; +# [1 2]; +# [2 3]; +# [2 4]; +# [4 5]; +# [5 6] +# ] + +# cell_to_edge_table = [ +# [1 7 8]; +# [8 9 10]; +# [2 3 9]; +# [4 10 11]; +# [5 11 12]; +# [6 7 12] +# ] + +# E2C_offset_provider = Connectivity(edge_to_cell_table, Cell, Edge, 2) +# C2E_offset_provider = Connectivity(cell_to_edge_table, Edge, Cell, 3) + +# offset_provider = Dict{String,Connectivity}( +# "E2C" => E2C_offset_provider, +# "C2E" => C2E_offset_provider, +# "E2CDim" => E2C_offset_provider # TODO(lorenzovarese): this is required for the embedded backend (note: python already uses E2C) +# ) + +# return offset_provider +# end + +SUITE = BenchmarkGroup() + +# Legacy Suite with first tests +SUITE["arith_broadcast"] = BenchmarkGroup() + +a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) +af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) +SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c +SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf + +# SUITE["field_operator"] = BenchmarkGroup() + +# # Benchmark for field operator addition +# function benchmark_fo_addition() +# a = Field(Cell, collect(1.0:15.0)) +# b = Field(Cell, collect(-1.0:-1:-15.0)) +# out = Field(Cell, zeros(Float64, 15)) + +# @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} +# return a .+ b +# end + +# @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) #setup=( +# # a = Field(Cell, collect(1.0:15.0)); +# # b = Field(Cell, collect(-1.0:-1:-15.0)); +# # out_field = Field(Cell, zeros(Float64, 15)); +# # @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, 
b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} return a .+ b end; +# # ) +# end + +# SUITE["field_operator"]["addition"] = benchmark_fo_addition() + +# # Benchmark for neighbor sum +# function benchmark_fo_neighbor_sum() +# offset_provider = setup_simple_connectivity(); +# a = Field(Cell, collect(5.0:17.0) * 3); +# E2C = FieldOffset("E2C", source=Cell, target=(Edge, E2CDim)) +# out_field = Field(Edge, zeros(Float64, 12)) + +# @field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} +# return neighbor_sum(a(E2C), axis=E2CDim) +# end + +# @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, out=$out_field) +# end + +# SUITE["field_operator"]["neighbor_sum"] = benchmark_fo_neighbor_sum() + +run(SUITE, verbose = true, seconds = 1) diff --git a/benchmark/utils/autorun_benchmarks.sh b/benchmark/utils/autorun_benchmarks.sh new file mode 100755 index 0000000..58a0f0c --- /dev/null +++ b/benchmark/utils/autorun_benchmarks.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +# This script automates the process of benchmarking recent changes by tagging +# the last two commits and running benchmarks using the AirspeedVelocity package. +# It supports conditional execution based on user input to include specific benchmarks +# for advection and allows dynamic configuration of execution threads. +# +# Usage: +# ./autorun_benchmarks.sh [--advection] [--threads=NUM] +# --advection: Optional. If specified, runs advection-specific benchmarks. +# --threads=NUM: Optional. Specifies the number of threads to use. Default is 8. + +# Default number of threads +threads=8 + +# Function to display usage +usage() { + echo "Usage: $0 [--advection] [--threads=NUM]" + echo " --advection: Run the advection comparison with specific benchmark script." + echo " --threads=NUM: Specify the number of threads (default is 8)." 
+ exit 1 +} + +# Parse command-line arguments +for arg in "$@" +do + case $arg in + --advection) + advection=true + shift # Remove --advection from processing + ;; + --threads=*) + threads="${arg#*=}" + shift # Remove --threads=NUM from processing + ;; + *) + # Unknown option + usage + ;; + esac +done + +# Retrieve last two commit hashes +before_debug=$(git rev-parse HEAD~1) +after_debug=$(git rev-parse HEAD) + +# Tag the last two commits if they are not already tagged +git tag -f after_debug $after_debug +git tag -f before_debug $before_debug + +# Print the before and after tags with their messages +git tag -n | grep -E 'before_debug|after_debug' | while IFS= read -r line; do echo "$line"; done ; echo "" + +# Conditional command based on the --advection flag +if [ "$advection" == true ]; then + # Set the benchmark script for advection + benchmark_script="benchmark/benchmarks_advection.jl" + command="benchpkg --rev=$before_debug,$after_debug \ + -s $benchmark_script \ + --bench-on=$after_debug \ + --exeflags=\"--threads=$threads\"" +else + command="benchpkg --rev=$before_debug,$after_debug \ + --bench-on=$after_debug \ + --exeflags=\"--threads=$threads\"" +fi + +# Print and execute the command +echo "Executing command: $command" +eval $command diff --git a/benchmark/utils/setup_benchmark_interactive.jl b/benchmark/utils/setup_benchmark_interactive.jl new file mode 100644 index 0000000..7e2aad1 --- /dev/null +++ b/benchmark/utils/setup_benchmark_interactive.jl @@ -0,0 +1,299 @@ +# setup_benchmark_interactive.jl + +# This script is intended for interactive usage during development and benchmarking sessions. +# It sets up a Julia environment with necessary packages and predefined functions for running various benchmarks. +# This allows developers to interactively profile and debug performance issues in real-time. +# +# Usage Example: +# Start Julia with the appropriate project settings and thread configuration: +# $ julia --project=. 
--threads 8 +# +# Inside the Julia REPL, load the benchmark setup: +# julia> include("setup_benchmark_interactive.jl") +# This will load all necessary modules and display the current thread usage. +# +# To run and profile a specific operation, use: +# julia> a, out = single_field_setup(STREAM_SIZE) +# julia> @profile fo_sin(a, backend="embedded", out=out) +# This will profile the `fo_sin` operation and print profiling results. + +include("../../advection/advection_setup.jl") + +using BenchmarkTools +using Statistics +using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian +using Profile +using Base.Threads + +# Data size +const global STREAM_SIZE = 10_000_000 + +# Utils ------------------------------------------------------------------------------------------------------ + +# Useful for the benchmark of the field remapping operation +function create_large_connectivity(size::Int) + edge_to_cell_table = vcat([rand(1:size, (1, 2)) for _ in 1:size]...) + cell_to_edge_table = vcat([rand(1:size, (1, 3)) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. 
+ +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. +""" +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth, data_size +end + +# Operations ------------------------------------------------------------------------------------------------- + +""" + single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + +Setup function to create a field and a similar output field for benchmarking operations that require a single input field. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the field to be generated. 
+ +# Returns +- `a`: A randomly generated field of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. +""" +function single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, out +end + +""" + array_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function array_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{Array{Float64,1}, Array{Float64,1}, Int64} + a = rand(Float64, ARRAY_SIZE) + b = rand(Float64, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size +end + +""" + broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) + +Core operation for the Julia broadcast addition benchmark. + +# Arguments +- `a, b`: Two arrays to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64})::Array{Float64,1} + return a .+ b +end + +""" + broadcast_addition(a::Field, b::Field) + +Core operation for the broadcast addition of two Field benchmark. +Useful to asses and track possible overhead on fields. + +# Arguments +- `a, b`: Two field to be added. + +# Returns +- The result of element-wise addition of the data of the fields `a` and `b`. +""" +function broadcast_addition_fields(a::Field, b::Field)::Field + return a .+ b +end + +""" + fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the field operator broadcast addition benchmark. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. 
+ +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out +end + +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the sine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +function sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the cosine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +function cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the sine function element-wise to the data of a field. 
+ +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +@field_operator function fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the cosine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +@field_operator function fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that performs remapping from cell-based data to edge-based data. + +This operator utilizes a connectivity table (`E2C`) to map the values from cells to edges, implying a transformation from the cell-centered field to an edge-centered field based on predefined relationships in the connectivity table. + +# Arguments +- `a`: Input field containing Float64 data structured around cells. + +# Returns +- A new field where each element represents data remapped from cells to edges, structured as specified by the edge-to-cell connectivity. +""" +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) +end + +""" + fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that computes the sum of neighboring cell values for each edge. This function leverages the connectivity table (`E2C`), which defines the relationship between edges and cells, to sum the values of cells that are connected to each edge. 
+ +The summation is performed across the dimension specified by `E2CDim`, ensuring that each edge aggregates values from its associated cells correctly. + +# Arguments +- `a`: Input field containing Float64 data, where each cell contains a numerical value. + +# Returns +- A new field where each edge holds the summed value of its neighboring cells, based on the edge-to-cell connectivity defined in `E2C`. +""" +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + +# Start ------------------------------------------------------------------------------------------------------ +println("Current number of threads: ", Threads.nthreads()) +println("The environment is ready\n") +Profile.clear() diff --git a/notes/Benchmarks.jl b/notes/Benchmarks.jl index 5d390ec..b271d89 100644 --- a/notes/Benchmarks.jl +++ b/notes/Benchmarks.jl @@ -59,7 +59,7 @@ using Profile # Benchmark for Julia and Python implementations of advection ############################################################################################################## -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") println("Starting julia embedded benchmark") @@ -81,7 +81,7 @@ bench_julia_embedded = @benchmark upwind_scheme( println("Finished Julia embedded benchmark") -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") println("Starting julia python benchmark") diff --git a/src/ExampleMeshes.jl b/src/ExampleMeshes.jl index 96612cf..6d5d237 100644 --- a/src/ExampleMeshes.jl +++ b/src/ExampleMeshes.jl @@ -11,7 +11,7 @@ export Cell, K, Edge, Vertex, V2VDim, V2EDim, E2VDim, E2CDim, C2EDim export V2V, E2V, V2E, E2C, C2E, Koff const global Cell_ = Dimension{:Cell_, HORIZONTAL} -const global K_ = Dimension{:K_, HORIZONTAL} +const global K_ = Dimension{:K_, VERTICAL} const global Edge_ = Dimension{:Edge_, HORIZONTAL} const global Vertex_ = 
Dimension{:Vertex_, HORIZONTAL} const global V2VDim_ = Dimension{:V2VDim_, LOCAL} diff --git a/src/GridTools.jl b/src/GridTools.jl index 580be0a..865d40c 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -9,7 +9,7 @@ using Profile using Base: @propagate_inbounds using MacroTools using OffsetArrays: IdOffsetRange -using Debugger +using CUDA import Base.Broadcast: Extruded, Style, BroadcastStyle, ArrayStyle, Broadcasted @@ -157,6 +157,30 @@ julia> field = Field(Cell, ones(5)) julia> field(E2C) julia> field(E2C[1]) ``` + +GPU arrays are supported too. + +# Examples +```julia-repl +julia> using GridTools + +julia> using CUDA: CuArray + +julia> using GridTools.ExampleMeshes.Unstructured + + # Create a CuArray of data on the GPU + +julia> gpu_data = CuArray(reshape(collect(1.0:12.0), (3, 4))); + + # Create a Field passing data in the CuArray type + +julia> gpu_field = Field((Cell,K), gpu_data); + + # Check the type + +julia> Base.typeof(gpu_field.data) +CuArray{Float64, 2, CUDA.DeviceMemory} +``` """ struct Field{ B_Dim <: Tuple{Vararg{Dimension}}, @@ -475,6 +499,7 @@ Base.convert(t::Type{T}, F::Field) where {T <: Number} = inds::Vararg{Int, N} ) where {BD, T, N} new_inds = inds .- F.origin + # @assert Tuple(1 for i in 1:length(new_inds)) <= new_inds <= size(F.data) "Error: $new_inds, $(size(F.data)), $(F.origin)" return F.data[new_inds...] 
end @propagate_inbounds function Base.setindex!( @@ -488,8 +513,9 @@ end Base.showarg(io::IO, @nospecialize(F::Field), toplevel) = print(io, eltype(F), " Field with dimensions ", get_dim_name.(F.broadcast_dims)) function slice(F::Field, inds...)::Field + @assert all(typeof(x) <: UnitRange{Int64} for x in inds) # TODO: understand why the line below is filtering the UnitRange only dim_ind = findall(x -> typeof(x) <: UnitRange{Int64}, inds) - return Field(F.dims[dim_ind], view(F.data, inds...), F.broadcast_dims) + return Field(F.dims[dim_ind], view(F.data, inds...), F.broadcast_dims, origin=Dict(d=>ind[1]-1 for (d,ind) in zip(F.dims, inds))) end # Connectivity struct ------------------------------------------------------------ @@ -561,7 +587,6 @@ function (fo::FieldOp)( out = nothing, kwargs... ) - is_outermost_fo = isnothing(OFFSET_PROVIDER) if is_outermost_fo @assert !isnothing(out) "Must provide an out field." @@ -609,6 +634,20 @@ function backend_execution( end end +# It is not currently working in all edge cases +function check_gpu_data(args::Tuple)::Nothing + has_CuArray::Bool = false + for (i, arg) in enumerate(args) + if arg !== nothing && typeof(arg) <: AbstractArray && typeof(arg.data) <: CuArray + has_CuArray = true + end + + if has_CuArray + throw(ArgumentError("GPU Arrays (CuArray) are not supported by the Python backend. 
Error found in argument #$i: $(typeof(arg.data)).")) + end + end +end + function backend_execution( backend::Val{:py}, fo::FieldOp, @@ -624,6 +663,7 @@ function backend_execution( f = py_field_operator(fo) FIELD_OPERATORS[fo.name] = f end + # check_gpu_data(args) # TODO: throw an exception in case of gpu arrays passed to the python backend p_args, p_kwargs, p_out, p_offset_provider = py_args.((args, kwargs, out, GridTools.OFFSET_PROVIDER)) if is_outermost_fo @@ -705,7 +745,7 @@ macro module_vars() name => Core.eval(Base, name) for name in [:Int64, :Int32, :Float32, :Float64] ) - all_names = names(@__MODULE__) + all_names = names(@__MODULE__, all=true) used_modules = ccall(:jl_module_usings, Any, (Any,), @__MODULE__) for m in used_modules append!(all_names, names(m)) @@ -757,5 +797,6 @@ end generate_unique_name(name::Symbol, value::Integer = 0) = Symbol("$(name)ᐞ$(value)") include("ExampleMeshes.jl") +include("atlas/AtlasMeshes.jl") end diff --git a/src/atlas/atlas_mesh.jl b/src/atlas/AtlasMeshes.jl similarity index 96% rename from src/atlas/atlas_mesh.jl rename to src/atlas/AtlasMeshes.jl index be45be3..dbce49d 100644 --- a/src/atlas/atlas_mesh.jl +++ b/src/atlas/AtlasMeshes.jl @@ -1,9 +1,19 @@ # ENV["PYCALL_JL_RUNTIME_PYTHON"] = Sys.which("python3.10") # ENV["PYTHONBREAKPOINT"] = "pdb.set_trace" +module AtlasMeshes + +using GridTools +using GridTools.ExampleMeshes.Unstructured using PyCall -atlas = pyimport("atlas4py") +export AtlasMesh, atlas, update_periodic_layers, DIMENSION_TO_SIZE_ATTR + +const atlas = PyNULL() + +function __init__() + copy!(atlas, pyimport("atlas4py")) +end const rpi = 2.0 * asin(1.0) const _deg2rad = 2.0 * rpi / 360.0 @@ -260,7 +270,11 @@ struct AtlasMesh "V2V" => v2v, "V2E" => v2e, "E2V" => e2v, - "Koff" => K + "Koff" => K, + # TODO: cleanup + "V2VDim" => v2v, + "V2EDim" => v2e, + "E2VDim" => e2v, ) remote_indices = Dict{Dimension, Array}( @@ -357,3 +371,5 @@ function update_periodic_layers(mesh::AtlasMesh, field::Field) ) 
field[periodic_indices, :] .= field[remote_indices[periodic_indices], :] end + +end # AtlasMeshes module \ No newline at end of file diff --git a/src/embedded/builtins.jl b/src/embedded/builtins.jl index 6ddf639..bdb512b 100644 --- a/src/embedded/builtins.jl +++ b/src/embedded/builtins.jl @@ -40,17 +40,31 @@ function min_over(field_in::Field; axis::Dimension)::Field return reduction_master(field_in, axis, minimum) end +""" + reduction_master(field_in::Field, axis::Dimension, f::Function)::Field +Performs a reduction operation (`sum`, `minimum`, `maximum`, etc.) over a specific axis dimension. +This version supports both CPU and GPU fields. +""" function reduction_master(field_in::Field, axis::Dimension, f::Function) neutral_el = get_neutral(f, eltype(field_in)) dim = get_dim_ind(field_in.dims, axis) conn = OFFSET_PROVIDER[get_dim_name(axis)] - data = dropdims( - f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim), - dims = dim - ) - return Field((field_in.dims[1:dim-1]..., field_in.dims[dim+1:end]...), data) + + if isa(field_in.data, CuArray) + # GPU version using CUDA parallelization + reduced_data = CUDA.fill(neutral_el, size(field_in.data)) + CUDA.@sync reduced_data .= f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim) + reduced_data = dropdims(reduced_data, dims = dim) + else + # CPU version + reduced_data = dropdims( + f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim), + dims = dim + ) + end + return Field((field_in.dims[1:dim-1]..., field_in.dims[dim+1:end]...), reduced_data) end get_neutral(f::typeof(sum), type::DataType) = convert(type, 0) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 0b0ad16..bf6acbb 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -1,3 +1,6 @@ + +using Base.Threads: @threads + Base.BroadcastStyle(::Type{<:Field}) = Broadcast.ArrayStyle{Field}() # TODO(tehrengruber): Implement a range with an attached dimension instead 
of this single object @@ -66,7 +69,7 @@ function get_size_ifelse(mask::FieldShape, branch::FieldShape) out_size = [branch.axes...] ind_mask = findall(x -> x in branch.dims, mask.dims) ind_out = findall(x -> x in mask.dims, branch.dims) - + # TODO: this is not correct if the mask has an origin out_size[ind_out] .= mask.axes[ind_mask] return FieldShape(branch.dims, Tuple(out_size), branch.broadcast_dims) @@ -230,15 +233,42 @@ end # ----------------------------------------------------------------------------------------------------------------------------------------- +function is_gpu_compatible(bc::Broadcasted{ArrayStyle{Field}})::Bool + is_all_CuArray::Bool = false + has_CuArray::Bool = false + has_CPUArray::Bool = false + + for arg in bc.args + if typeof(arg) <: AbstractArray + # Check if the argument is a CuArray + if typeof(arg.data) <: CuArray + has_CuArray = true + is_all_CuArray = true + # Check if the argument is a CPU array + elseif typeof(arg.data) <: Vector + has_CPUArray = true + end + end + + # If both a CuArray and a CPU Array are present, raise an error + if has_CuArray && has_CPUArray + throw(ErrorException("Cannot have both CuArray and CPU arrays in the same args.")) + end + end + + return is_all_CuArray +end + # Creates uninitialized output object function Base.similar(bc::Broadcasted{ArrayStyle{Field}}, ::Type{ElType}) where {ElType} offsets = getproperty.(axes(bc), :start) .- 1 + is_cuarray::Bool = is_gpu_compatible(bc) Field( - bc.axes.dims, - similar(Array{ElType}, getproperty.(axes(bc), :stop) .- offsets), - bc.axes.broadcast_dims, - offsets - ) + bc.axes.dims, + similar(is_cuarray ? 
CuArray{ElType} : Array{ElType}, getproperty.(axes(bc), :stop) .- offsets), + bc.axes.broadcast_dims, + offsets + ) end # ----------------------------------------------------------------------------------------------------------------------------------------- @@ -249,17 +279,31 @@ end if axes(dest) == axes(bc) && bc.f === identity && bc.args isa Tuple{AbstractArray} # only a single input argument to broadcast! A = bc.args[1] if axes(dest) == axes(A) - return copyto!(dest, A) + if isa(A.data, CuArray) + return CUDA.copyto!(dest.data, A.data) # Use @GPUArrays copyto! + else + return copyto!(dest, A) + end end end - bc′ = Base.Broadcast.preprocess(shape(dest), bc) + if isa(dest.data, CuArray) + # Extract the function and the arguments from the broadcasted expression + f = bc.f + args = bc.args + + # Apply the function f element-wise to the arguments and store the result in dest.data + CUDA.map!(f, dest.data, map(arg -> arg.data, args)...) + else + bc′ = Base.Broadcast.preprocess(shape(dest), bc) - # Performance may vary depending on whether `@inbounds` is placed outside the - # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @simd for I in eachindex(dest) - dest[I] = bc′[I] + # Performance may vary depending on whether `@inbounds` is placed outside the + # for loop or not. (cf. 
https://github.com/JuliaLang/julia/issues/38086) + @inbounds @simd for I in eachindex(dest) + dest[I] = bc′[I] + end end + return dest end diff --git a/src/examples/example_gpu.jl b/src/examples/example_gpu.jl new file mode 100644 index 0000000..8954a70 --- /dev/null +++ b/src/examples/example_gpu.jl @@ -0,0 +1,39 @@ +using GridTools +using GridTools.ExampleMeshes.Unstructured +using CUDA +using Profile +using Debugger +using BenchmarkTools + +# Cpu + +a_cpu = Field(Cell, collect(1:2e7)) +b_cpu = Field(Cell, collect(1:2e7)) + +out_cpu = similar(a_cpu) + +out_cpu = a_cpu .+ b_cpu + +# Gpu + +a_gpu = Field(Cell, CuArray(1:2e7)) +b_gpu = Field(Cell, CuArray(1:2e7)) + +out_gpu = similar_field(a_gpu) + +out_gpu .= a_gpu .+ b_gpu + +function bench_cpu!(a_cpu, b_cpu, out_cpu) + out_cpu = a_cpu .+ b_cpu +end + +function bench_gpu!(a_gpu, b_gpu, out_gpu) + # Wrapping the execution in a CUDA.@sync block will make + # the CPU block until the queued GPU tasks are done, similar to how Base.@sync waits for distributed CPU tasks + CUDA.@sync begin + out_gpu = a_gpu .+ b_gpu + end +end + +@btime bench_cpu!($a_cpu, $b_cpu, $out_cpu) +@btime bench_gpu!($a_gpu, $b_gpu, $out_gpu) \ No newline at end of file diff --git a/src/gt2py/jast_to_foast.jl b/src/gt2py/jast_to_foast.jl index c843059..f0663c7 100644 --- a/src/gt2py/jast_to_foast.jl +++ b/src/gt2py/jast_to_foast.jl @@ -266,6 +266,7 @@ end function visit_(sym::Val{:call}, args::Array, outer_loc) if args[1] in bin_op + # TODO: check the case where a unary expression, that is at the same time binary operation is encountered: i.e. -x @assert length(args)==3 "Expected a binary operation. AST must be canonicalized using `canonicalize_arithmetic_ops` first." 
return foast.BinOp( op = visit(args[1]), diff --git a/test/embedded_test.jl b/test/embedded_test.jl index 770cfed..b61db94 100644 --- a/test/embedded_test.jl +++ b/test/embedded_test.jl @@ -135,8 +135,8 @@ end # Broadcast ------------------------- - @test typeof(broadcast(cell_values, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, HORIZONTAL}}, Float64, 1, Tuple{Dimension{:Cell_, HORIZONTAL}}, Vector{Float64}} - @test typeof(broadcast(5.0, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, HORIZONTAL}}, Float64, 0, Tuple{}, Array{Float64, 0}} + @test typeof(broadcast(cell_values, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, VERTICAL}}, Float64, 1, Tuple{Dimension{:Cell_, HORIZONTAL}}, Vector{Float64}} + @test typeof(broadcast(5.0, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, VERTICAL}}, Float64, 0, Tuple{}, Array{Float64, 0}} # Where ----------------------------------------- diff --git a/test/gpu_test.jl b/test/gpu_test.jl new file mode 100644 index 0000000..3f7fecb --- /dev/null +++ b/test/gpu_test.jl @@ -0,0 +1,44 @@ +using Test +using CUDA: CuArray +using GridTools +using GridTools.ExampleMeshes.Unstructured + +@testset "Testset Simple Broadcast Addition GPU" begin + a_gpu = Field(Cell, CuArray(1.0:15.0)) + b_gpu = Field(Cell, CuArray(-2.0:-1:-16.0)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a_gpu and b_gpu do not have the same size of data." + + out_gpu = similar_field(a_gpu) + out_gpu = a_gpu .+ b_gpu + + @test all(out_gpu.data .== -1) +end + +@testset "Testset Large Broadcast Addition GPU" begin + # Initialize two large GPU fields with CuArray + a_gpu = Field(Cell, CuArray(1:2e7)) + b_gpu = Field(Cell, CuArray(1:2e7)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a_gpu and b_gpu do not have the same size of data." 
+ + out_gpu = similar_field(a_gpu) + out_gpu .= a_gpu .+ b_gpu + + expected_result = CuArray(2:2:2e7*2) + + @test all(out_gpu.data .== expected_result) +end + +@testset "Testset Field Operator Addition GPU" begin + a_gpu = Field(Cell, CuArray(1.0:15.0)) + b_gpu = Field(Cell, CuArray(-2.0:-1:-16.0)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a and b do not have the same size of data." + + out_gpu = similar_field(a_gpu) + + @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b + end + + fo_addition(a_gpu, b_gpu, backend="embedded", out=out_gpu) + @test all(out_gpu.data .== -1) +end diff --git a/test/gt2py_fo_exec.jl b/test/gt2py_fo_exec.jl index ec0014f..187eb23 100644 --- a/test/gt2py_fo_exec.jl +++ b/test/gt2py_fo_exec.jl @@ -564,6 +564,23 @@ function test_lap_lap(offset_provider::Dict{String, Dimension}, backend::String, # TODO: add in the future the test for the border values end +""" + test_slice() + +This test checks the `slice` function, which should correctly extract a subset of data from a larger field and properly adjust the origin to reflect the new sliced field's starting point. + +# Expected Behavior +- The sliced data should match the expected subset from the original field. +- The origin of the sliced field should be adjusted correctly to match the new starting index of the sliced data. 
+""" +function test_slice() + a::Field = Field((IDim,), [1; 2; 3; 4; 5]) + sliced_a = slice(a, 2:4) + @test sliced_a.data == [2; 3; 4] + @test sliced_a.origin == (2-1,) + @test sliced_a.dims == (IDim,) +end + # Test Executions -------------------------------------------------------------------------------------------- function test_gt4py_fo_exec() @@ -638,6 +655,8 @@ function test_gt4py_fo_exec() # testwrapper(setup_cartesian_offset_provider, test_lap_lap, "embedded", simple_cartesian_field) testwrapper(setup_cartesian_offset_provider, test_lap_lap, "py", simple_cartesian_field) + + testwrapper(nothing, test_slice) end @testset "Testset GT2Py fo exec" test_gt4py_fo_exec()