diff --git a/.github/workflows/benchmark_pr.yml b/.github/workflows/benchmark_pr.yml index 71b10e2..a8bee98 100644 --- a/.github/workflows/benchmark_pr.yml +++ b/.github/workflows/benchmark_pr.yml @@ -17,7 +17,42 @@ jobs: - uses: julia-actions/setup-julia@v1 with: version: "1.8" - - uses: julia-actions/cache@v1 + - uses: actions/cache@v2 + name: Cache Julia packages + with: + path: ~/.julia + key: ${{ runner.os }}-julia-${{ hashFiles('**/Project.toml', '**/Manifest.toml') }} + restore-keys: | + ${{ runner.os }}-julia- + - uses: actions/cache@v2 + name: Cache Python packages + with: + path: ~/gt4py-venv + key: ${{ runner.os }}-python-${{ hashFiles('**/requirements-dev.txt') }} + restore-keys: | + ${{ runner.os }}-python- + - name: Set up Python environment + run: | + sudo apt-get update + sudo apt-get install python3-pip python3-venv + python3 -m venv ~/gt4py-venv + source ~/gt4py-venv/bin/activate + python3 -m pip install --upgrade pip + - name: Install GT4Py from specific branch + run: | + source ~/gt4py-venv/bin/activate + git clone --branch fix_python_interp_path_in_cmake https://github.com/tehrengruber/gt4py.git ~/gt4py + cd ~/gt4py + pip install -r requirements-dev.txt + pip install -e . + - name: Install and Configure PyCall + run: | + source ~/gt4py-venv/bin/activate + julia --project=. -e 'using Pkg; Pkg.add("PyCall"); ENV["PYTHON"]="python"; Pkg.build("PyCall");' + - name: Check PyCall Configuration + run: | + source ~/gt4py-venv/bin/activate + julia --project=. -e 'using PyCall; @show PyCall.python' - name: Extract Package Name from Project.toml id: extract-package-name run: | @@ -27,20 +62,20 @@ jobs: env: JULIA_NUM_THREADS: 2 run: | - # Lightweight build step, as sometimes the runner runs out of memory: - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.add(;url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git")' - julia -e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; import Pkg; Pkg.build("AirspeedVelocity")' + julia --project=. 
-e 'ENV["JULIA_PKG_PRECOMPILE_AUTO"]=0; using Pkg; Pkg.add(url="https://github.com/MilesCranmer/AirspeedVelocity.jl.git"); Pkg.build("AirspeedVelocity")' - name: Add ~/.julia/bin to PATH run: | echo "$HOME/.julia/bin" >> $GITHUB_PATH - name: Run benchmarks run: | + source ~/gt4py-venv/bin/activate echo $PATH ls -l ~/.julia/bin mkdir results benchpkg ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --url=${{ github.event.repository.clone_url }} --bench-on="${{github.event.repository.default_branch}}" --output-dir=results/ --tune - name: Create plots from benchmarks run: | + source ~/gt4py-venv/bin/activate mkdir -p plots benchpkgplot ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --npart=10 --format=png --input-dir=results/ --output-dir=plots/ - name: Upload plot as artifact @@ -50,6 +85,7 @@ jobs: path: plots - name: Create markdown table from benchmarks run: | + source ~/gt4py-venv/bin/activate benchpkgtable ${{ steps.extract-package-name.outputs.package_name }} --rev="${{github.event.repository.default_branch}},${{github.event.pull_request.head.sha}}" --input-dir=results/ --ratio > table.md echo '### Benchmark Results' > body.md echo '' >> body.md @@ -75,4 +111,4 @@ jobs: # comment-id: ${{ steps.fcbenchmark.outputs.comment-id }} issue-number: ${{ github.event.pull_request.number }} body-path: body.md - edit-mode: replace \ No newline at end of file + edit-mode: replace diff --git a/.gitignore b/.gitignore index 92d93be..dee06b9 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,17 @@ docs/build/ .DS_Store Manifest.toml + +# Python Env +.venv +env_setup.sh +.python-version + +# Misc +**/.DS_Store +.vscode + +# Ignore benchmark (benchpkg) results +results_GridTools@* +plot_*.png +plot_*.pdf diff --git a/Project.toml b/Project.toml index ff7f05d..2ab63f0 100644 --- 
a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" diff --git a/advection/README.md b/advection/README.md index b838658..cfae700 100644 --- a/advection/README.md +++ b/advection/README.md @@ -1,6 +1,6 @@ -### README for Running `advection_miniapp.jl` +### README for Running `advection_setup.jl` using `run_simulation_loop.jl` -This README provides instructions on how to run the `advection_miniapp.jl` script for simulating advection using the Atlas library. The script allows for terminal visualization, which can be enabled as described below. +This README provides instructions on how to run the `run_simulation_loop.jl` script for simulating advection using the Atlas library. The script allows for terminal visualization, which can be enabled as described below. #### Prerequisites @@ -15,23 +15,23 @@ This README provides instructions on how to run the `advection_miniapp.jl` scrip ``` 2. **Enabling Visualization** (optional): - - The script has a `VISUALIZATION_FLAG` that can be set to enable or disable visualization on the terminal. Ensure that this flag is set to `true` in the `advection_miniapp.jl` script if you wish to enable visualization. - - Note: Other parameters such as the number of iterations can be changed in the `# Simulation Parameters` section of the script. + - The script has a `VISUALIZATION_FLAG` that can be set to enable or disable visualization on the terminal. Ensure that this flag is set to `true` in the `run_simulation_loop.jl` script if you wish to enable visualization. 
+ - Note: Other parameters such as the number of iterations can be changed in the `# Simulation Parameters` section of the `advection_setup.jl` script. #### Running the Simulation 1. **Running the Script**: - - Use the following command to run the `advection_miniapp.jl` script with Julia: + - Use the following command to run the `run_simulation_loop.jl` script with Julia: ```sh - julia --color=yes --project=$GRIDTOOLS_JL_PATH/GridTools.jl $GRIDTOOLS_JL_PATH/GridTools.jl/src/examples/advection/advection_miniapp.jl + julia --color=yes --project=$GRIDTOOLS_JL_PATH/GridTools.jl $GRIDTOOLS_JL_PATH/GridTools.jl/src/examples/advection/run_simulation_loop.jl ``` #### Example -Here is an example of how to set the `VISUALIZATION_FLAG` in the `advection_miniapp.jl` script and run the simulation: +Here is an example of how to set the `VISUALIZATION_FLAG` in the `run_simulation_loop.jl` script and run the simulation: 1. **Setting the Visualization Flag**: - - Open the `advection_miniapp.jl` script. + - Open the `run_simulation_loop.jl` script. - Set the `VISUALIZATION_FLAG` to `true`: ```julia const VISUALIZATION_FLAG = true @@ -42,7 +42,7 @@ Here is an example of how to set the `VISUALIZATION_FLAG` in the `advection_mini - Run the script with the following command: ```sh export GRIDTOOLS_JL_PATH=... - julia --color=yes --project=. $GRIDTOOLS_JL_PATH/src/examples/advection/advection_miniapp.jl + julia --color=yes --project=. $GRIDTOOLS_JL_PATH/src/examples/advection/run_simulation_loop.jl ``` -By following these steps, you should be able to run the `advection_miniapp.jl` script and visualize the advection simulation results on your terminal. +By following these steps, you should be able to run the `run_simulation_loop.jl` script and visualize the advection simulation results on your terminal. 
diff --git a/advection/advection.jl b/advection/advection.jl index 19f7741..159a2ef 100644 --- a/advection/advection.jl +++ b/advection/advection.jl @@ -6,11 +6,10 @@ level_indices::Field{Tuple{K_}, Int64}, num_level::Int64 )::Field{Tuple{Vertex_, K_}, Float64} - return where( - level_indices .== num_level - 1, + level_indices .== 0, lower, - where(slice(level_indices .== 0, 1:29), upper, interior) + where(slice(level_indices .== 29, 2:30), upper, interior) ) end @@ -149,7 +148,8 @@ end )::Field{Tuple{Vertex_, K_}, Float64} zrhin = (1.0 ./ vol) .* neighbor_sum( - -min.(0.0, flux(V2E)) .* max.(0.0, dual_face_orientation) - + # TODO: fix the 0-min workaround due to the binary/unary operation issue + (broadcast(0., (Vertex, V2EDim, K)) .- min.(0.0, flux(V2E))) .* max.(0.0, dual_face_orientation) - max.(0.0, flux(V2E)) .* min.(0.0, dual_face_orientation), axis = V2EDim, ) @@ -227,15 +227,6 @@ end dual_face_orientation::Field{Tuple{Vertex_, V2EDim_}, Float64}, dual_face_normal_weighted_x::Field{Tuple{Edge_}, Float64}, dual_face_normal_weighted_y::Field{Tuple{Edge_}, Float64}, - tmp_vertex_1::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_2::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_3::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_4::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_5::Field{Tuple{Vertex_, K_}, Float64}, - tmp_vertex_6::Field{Tuple{Vertex_, K_}, Float64}, - tmp_edge_1::Field{Tuple{Edge_, K_}, Float64}, - tmp_edge_2::Field{Tuple{Edge_, K_}, Float64}, - tmp_edge_3::Field{Tuple{Edge_, K_}, Float64}, ) tmp_edge_1 = advector_normal( diff --git a/advection/advection_miniapp.jl b/advection/advection_setup.jl similarity index 66% rename from advection/advection_miniapp.jl rename to advection/advection_setup.jl index b4230a9..3153416 100644 --- a/advection/advection_miniapp.jl +++ b/advection/advection_setup.jl @@ -1,55 +1,26 @@ -# Advection Miniapp -# This script demonstrates an advection simulation using the Atlas library. 
+# Advection Setup +# This script demonstrates the setup of an advection simulation using the Atlas library. using Printf -using Debugger using Statistics -using Profile using GridTools - -const global VISUALIZATION_FLAG::Bool=false - -# Mesh Definitions -------------------------------------------------------------------------------------------- -# Define dimensions for the mesh -Cell_ = Dimension{:Cell_, HORIZONTAL} -Edge_ = Dimension{:Edge_, HORIZONTAL} -Vertex_ = Dimension{:Vertex_, HORIZONTAL} -K_ = Dimension{:K_, VERTICAL} -V2VDim_ = Dimension{:V2V_, LOCAL} -V2EDim_ = Dimension{:V2E_, LOCAL} -E2VDim_ = Dimension{:E2V_, LOCAL} - -# Instantiate dimension objects -Cell = Cell_() -K = K_() -Edge = Edge_() -Vertex = Vertex_() -V2VDim = V2VDim_() -V2EDim = V2EDim_() -E2VDim = E2VDim_() - -# Define field offsets to describe the relationships between different dimensions -V2V = FieldOffset("V2V", source = Vertex, target = (Vertex, V2VDim)) -E2V = FieldOffset("E2V", source = Vertex, target = (Edge, E2VDim)) -V2E = FieldOffset("V2E", source = Edge, target = (Vertex, V2EDim)) -Koff = FieldOffset("Koff", source = K, target = K) +using GridTools.ExampleMeshes.Unstructured +using GridTools.AtlasMeshes # Include additional necessary files for mesh, state container, metric calculations, and advection operations -include("../src/atlas/atlas_mesh.jl") include("state_container.jl") include("metric.jl") include("advection.jl") -include("visualization_utils.jl") # Grid and Mesh Initialization -------------------------------------------------------------------------------- # Create a structured grid and mesh for the simulation -grid = atlas.StructuredGrid("O50") +grid = atlas.StructuredGrid("O90") mesh = AtlasMesh(grid, num_level = 30) # Simulation Parameters --------------------------------------------------------------------------------------- δt = 1800.0 # time step in s niter = 50 -ε = 1.0e-8 +ϵ = 1.0e-8 # Calculate metric properties from the mesh metric = m_from_mesh(mesh) @@ 
-188,53 +159,3 @@ nabla_z( out = tmp_fields["tmp_vertex_2"], offset_provider = mesh.offset_provider ) - -if VISUALIZATION_FLAG - # Precompute the mapping between the unstructured domain to the structured one for ASCII art visualization - grid_size = 50 - mapping = precompute_mapping(mesh, xlim, ylim, grid_size) -end - -# Main Simulation Loop ---------------------------------------------------------------------------------------- -for i = 1:niter - # Perform the upwind advection scheme to update the scalar field (rho) - upwind_scheme( - state.rho, - δt, - mesh.vol, - metric.gac, - state.vel[1], - state.vel[2], - state.vel[3], - mesh.pole_edge_mask, - mesh.dual_face_orientation, - mesh.dual_face_normal_weighted_x, - mesh.dual_face_normal_weighted_y, - out = state_next.rho, - offset_provider = mesh.offset_provider - ) - - # Print the current timestep - println("Timestep $i") - - if VISUALIZATION_FLAG - # Print the current state as ASCII art every 5 timesteps - print_state_ascii(state, mesh, mapping, i, grid_size) - end - - # TODO: make a function out of this switch - # Swap the current and next state - temp = state - global state = state_next - global state_next = temp - - # Update the periodic boundary layers - update_periodic_layers(mesh, state.rho) -end - -# Output the final statistics for the scalar field (rho) and velocity fields -println( - "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" -) -println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") -println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") diff --git a/advection/run_simulation_loop.jl b/advection/run_simulation_loop.jl new file mode 100644 index 0000000..2c034a6 --- /dev/null +++ b/advection/run_simulation_loop.jl @@ -0,0 +1,62 @@ +# Run Advection Miniapp Simulation +# This script demonstrates an advection simulation using the Atlas library. 
+ +include("visualization_utils.jl") +include("advection_setup.jl") + +const global VISUALIZATION_FLAG::Bool=false +const global VERBOSE_FLAG::Bool=true + +if VISUALIZATION_FLAG + # Precompute the mapping between the unstructured domain to the structured one for ASCII art visualization + grid_size = 50 + mapping = precompute_mapping(mesh, xlim, ylim, grid_size) +end + +# Main Simulation Loop ---------------------------------------------------------------------------------------- +for i = 1:niter + # Perform the upwind advection scheme to update the scalar field (rho) + upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + + # Print the current timestep + if VERBOSE_FLAG + println("Timestep $i") + end + + if VISUALIZATION_FLAG + # Print the current state as ASCII art every 5 timesteps + print_state_ascii(state, mesh, mapping, i, grid_size) + end + + # TODO: make a function out of this switch + # Swap the current and next state + temp = state + global state = state_next + global state_next = temp + + # Update the periodic boundary layers + update_periodic_layers(mesh, state.rho) +end + +if VERBOSE_FLAG + # Output the final statistics for the scalar field (rho) and velocity fields + println( + "min max sum of final rho = $(minimum(state.rho.data)) , $(maximum(state.rho.data)) , $(sum(state.rho.data))" + ) + println("Final Vel0 sum after $niter iterations: $(sum(state.vel[1].data))") + println("Final Vel1 sum after $niter iterations: $(sum(state.vel[2].data))") +end diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..35fcc3b --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,162 @@ +# Benchmark Guide 🧭📈 + +## Installation + +To install the benchmark CLI, execute the following command: 
+ +```bash +julia -e 'using Pkg; Pkg.add("AirspeedVelocity"); Pkg.build("AirspeedVelocity")' +``` + +This installation will create three executables in the `~/.julia/bin` folder: `benchpkg`, `benchpkgplot`, and `benchpkgtable`. It is necessary to add them to your `$PATH` to use them from any terminal session. + +### Add to PATH Temporarily + +To temporarily add the path to your session: + +```bash +export PATH="$PATH:~/.julia/bin" +``` + +### Add to PATH Permanently + +To permanently add the executables to your path, append the following line to your `.zshrc` or `.bashrc` file: + +```bash +echo 'export PATH="$PATH:~/.julia/bin"' >> ~/.zshrc # For zsh users +echo 'export PATH="$PATH:~/.julia/bin"' >> ~/.bashrc # For bash users +``` + +## Running Benchmarks + +To run benchmarks, simply execute the following command in the shell: + +```bash +benchpkg +``` + +and it will: + +1. Figure out the package name (from Project.toml) +2. Figure out the default branch name to compare the dirty state of your repo against +3. Evaluate all the benchmarks in benchmarks/benchmark.jl (BenchmarkTools.jl format – i.e., const SUITE = BenchmarkGroup()) +4. Print the result in a nicely formatted markdown table + +You can use the `--filter` option to quickly check if the load time has worsened compared to the master branch: + +```bash +benchpkg --filter=time_to_load +``` + +The `benchpkg` was updated in June 2024 to automate the benchmark without specifying the parameters. +To specify additional conditions in `benchpkg` and to work with `benchpkgplot` consult the help command (`--help`). + +## Comparing Two or More Different Revisions (States) + +To compare two or more different states of your codebase, you can use revisions. In this context, a **revision** refers to a specific state of the repository, which can be identified by a commit hash or a tag. 
+ +### (Reminder) What is a Revision? + +A **revision** in Git is an identifier that refers to a specific state of the repository at a particular point in time. Revisions can be specified using: +- **Commit Hashes**: A unique SHA-1 identifier for each commit, e.g., `8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd`. +- **Tags**: Human-readable names assigned to specific commits, often used to mark release points (e.g., `v1.0.0`). + +### How to Add a Tag + +You can create a tag in Git by using the following command: + +```bash +git tag -a <tagname> -m "Tag message" +``` + +For example, to tag the current commit with `v1.0.0`, you would run: + +```bash +git tag -a v1.0.0 -m "Improvement using @threads instead of @simd in broadcasting" +``` + +To push the tag to the remote repository, use: + +```bash +git push origin <tagname> +``` + +For example: + +```bash +git push origin v1.0.0 +``` + +To see information about all tags, such as the commit they point to and the tag messages, use: + +```bash +git show-ref --tags && git tag -n | while IFS= read -r line; do echo "$line"; done +``` + +### Example: Using Commit Hashes to Compare Revisions + +Here is an example of how to use commit hashes to compare different revisions: + +```bash +benchpkg --rev=8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd,6fb48706f988613860c6c98beef32c32e900737b \ + --bench-on=8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd --exeflags="--threads=8" +``` + +In this example, `benchpkg` compares the two specified revisions, with the first hash being the baseline for comparison. + +### Example: Using Tags to Compare Revisions + +Here’s how you can use tags instead of commit hashes: + +1. **Create Tags**: + Suppose you want to tag the two commits: + + ```bash + git tag -a v1.0.0 8b8a68f5b54f8fbb863f73c08f5c7fd0d3812ccd -m "Tag message for v1.0.0" + git tag -a v1.1.0 6fb48706f988613860c6c98beef32c32e900737b -m "Tag message for v1.1.0" + ``` + +2. 
**Use Tags in `benchpkg`**: + Once the tags are set, you can use them in the comparison: + + ```bash + benchpkg --rev=v1.0.0,v1.1.0 --bench-on=v1.0.0 --exeflags="--threads=8" + ``` + +### How to Remove a Tag + +If you need to remove a tag from your repository, you can do so with the following commands: + +1. **Delete the tag locally**: + + ```bash + git tag -d <tagname> + ``` + + For example: + + ```bash + git tag -d v1.0.0 + ``` + +2. **Delete the tag from the remote repository**: + + ```bash + git push origin --delete <tagname> + ``` + + For example: + + ```bash + git push origin --delete v1.0.0 + ``` + +## Developer Notes + +1. The `benchpkg` tool compares different revisions, allowing you to specify the commits or tags you wish to compare. It is crucial to ensure that both commits include all necessary dependencies; otherwise, the dependencies might not be resolved. + +2. **AirSpeedVelocity**: Note that AirSpeedVelocity requires the benchmarking suite to be named `SUITE`. Any other names will not be recognized, which could lead to errors in your benchmarking process. diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 8255aca..87c404f 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -1,16 +1,385 @@ -using Pkg -path_to_package = joinpath(@__DIR__, "..") # Assuming the benchmarks.jl file is in the "benchmark" directory -push!(LOAD_PATH, path_to_package) using BenchmarkTools +using Statistics using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian +# Data size +const global STREAM_SIZE = 10_000_000 + +# Utils ------------------------------------------------------------------------------------------------------ + +# Useful for the benchmark of the field remapping operation +function create_large_connectivity(size::Int) + edge_to_cell_table = vcat([rand(1:size, (1, 2)) for _ in 1:size]...) + cell_to_edge_table = vcat([rand(1:size, (1, 3)) for _ in 1:size]...) 
+ + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. + +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. 
+""" +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth, data_size +end + +# Operations ------------------------------------------------------------------------------------------------- + +""" + single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + +Setup function to create a field and a similar output field for benchmarking operations that require a single input field. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the field to be generated. + +# Returns +- `a`: A randomly generated field of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. +""" +function single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, out +end + +""" + array_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function array_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{Array{Float64,1}, Array{Float64,1}, Int64} + a = rand(Float64, ARRAY_SIZE) + b = rand(Float64, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size +end + +""" + broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) + +Core operation for the Julia broadcast addition benchmark. + +# Arguments +- `a, b`: Two arrays to be added. 
+ +# Returns +- The result of element-wise addition of `a` and `b`. +""" +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64})::Array{Float64,1} + return a .+ b +end + +""" + broadcast_addition(a::Field, b::Field) + +Core operation for the broadcast addition of two Field benchmark. +Useful to asses and track possible overhead on fields. + +# Arguments +- `a, b`: Two field to be added. + +# Returns +- The result of element-wise addition of the data of the fields `a` and `b`. +""" +function broadcast_addition_fields(a::Field, b::Field)::Field + return a .+ b +end + +""" + fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the field operator broadcast addition benchmark. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out +end + +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the sine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. 
+ +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +function sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the cosine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +function cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the sine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +@field_operator function fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the cosine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +@field_operator function fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that performs remapping from cell-based data to edge-based data. 
+ +This operator utilizes a connectivity table (`E2C`) to map the values from cells to edges, implying a transformation from the cell-centered field to an edge-centered field based on predefined relationships in the connectivity table. + +# Arguments +- `a`: Input field containing Float64 data structured around cells. + +# Returns +- A new field where each element represents data remapped from cells to edges, structured as specified by the edge-to-cell connectivity. +""" +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) +end + +""" + fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that computes the sum of neighboring cell values for each edge. This function leverages the connectivity table (`E2C`), which defines the relationship between edges and cells, to sum the values of cells that are connected to each edge. + +The summation is performed across the dimension specified by `E2CDim`, ensuring that each edge aggregates values from its associated cells correctly. + +# Arguments +- `a`: Input field containing Float64 data, where each cell contains a numerical value. + +# Returns +- A new field where each edge holds the summed value of its neighboring cells, based on the edge-to-cell connectivity defined in `E2C`. 
+""" +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + +# Benchmarks ------------------------------------------------------------------------------------------------- + +# Create the benchmark SUITE SUITE = BenchmarkGroup() -SUITE["arith_broadcast"] = BenchmarkGroup() +# Define the main groups +SUITE["addition"] = BenchmarkGroup() + +# Julia broadcast addition benchmark +a, b, data_size = array_broadcast_addition_setup(STREAM_SIZE) +SUITE["addition"]["array_broadcast_addition"] = @benchmarkable broadcast_addition_array(a, b) setup=((a, b, data_size) = $array_broadcast_addition_setup($STREAM_SIZE); ) #a=$a; b=$b) + +# Field broadcast addition benchmark +a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) +SUITE["addition"]["fields_broadcast_addition"] = @benchmarkable broadcast_addition_fields($a, $b) + +# Field Operator broadcast addition benchmark +a, b, out = fields_broadcast_addition_setup(STREAM_SIZE) +SUITE["addition"]["field_op_broadcast_addition"] = @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) + +# Sine without field operator benchmark +a, out = single_field_setup(STREAM_SIZE) +SUITE["trigonometry"]["sin"] = @benchmarkable sin_without_fo($a) + +# Field operator sine benchmark +a, out = single_field_setup(STREAM_SIZE) +SUITE["trigonometry"]["field_op_sin"] = @benchmarkable $fo_sin($a, backend="embedded", out=$out) + +# Cosine without field operator benchmark +a, out = single_field_setup(STREAM_SIZE) +SUITE["trigonometry"]["cos"] = @benchmarkable cos_without_fo($a) + +# Field operator cosine benchmark +a, out = single_field_setup(STREAM_SIZE) +SUITE["trigonometry"]["field_op_cos"] = @benchmarkable $fo_cos($a, backend="embedded", out=$out) + +# Benchmark the field remapping operation +offset_provider = create_large_connectivity(STREAM_SIZE) +a, out = single_field_setup(STREAM_SIZE) +SUITE["remapping"]["field_operator"] = + 
@benchmarkable $fo_remapping($a, offset_provider=$offset_provider, backend="embedded", out=$out) + +# Benchmark the field neighbor sum operation +offset_provider = create_large_connectivity(STREAM_SIZE) +a, out = single_field_setup(STREAM_SIZE) +SUITE["neighbor_sum"]["field_operator"] = + @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, backend="embedded", out=$out) + +# Run the benchmark SUITE +println("Running the benchmark SUITE...") +results = run(SUITE) + +# Process the results +array_results = results["addition"]["array_broadcast_addition"] +fields_results = results["addition"]["fields_broadcast_addition"] +fo_results = results["addition"]["field_op_broadcast_addition"] +sin_results = results["trigonometry"]["sin"] +fo_sin_results = results["trigonometry"]["field_op_sin"] +cos_results = results["trigonometry"]["cos"] +fo_cos_results = results["trigonometry"]["field_op_cos"] +remapping_results = results["remapping"]["field_operator"] +neighbor_sum_results = results["neighbor_sum"]["field_operator"] + +# Compute memory bandwidth +array_bandwidth, data_size_arr = compute_memory_bandwidth_addition(array_results, a, b, a) # Out is a temporary array with size equal to the size of a +fields_bandwidth, data_size_fields = compute_memory_bandwidth_addition(fields_results, a, b, a) +fo_bandwidth, data_size_fo = compute_memory_bandwidth_addition(fo_results, a, b, out) + +sin_bandwidth = compute_memory_bandwidth_single(sin_results, a) +fo_sin_bandwidth = compute_memory_bandwidth_single(fo_sin_results, a) +cos_bandwidth = compute_memory_bandwidth_single(cos_results, a) +fo_cos_bandwidth = compute_memory_bandwidth_single(fo_cos_results, a) + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +# Process and print the results along with the time taken for each +println("Array broadcast addition:") +println("\tData size: $data_size_arr") +println("\tBandwidth: $array_bandwidth GB/s") +println("\tTime 
taken: $(ns_to_ms(median(array_results.times))) ms\n") + +println("Fields data broadcast addition:") +println("\tData size: $data_size_fields") +println("\tBandwidth: $fields_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fields_results.times))) ms\n") + +println("Field Operator broadcast addition:") +println("\tData size: $data_size_fo") +println("\tBandwidth: $fo_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_results.times))) ms\n") + +println("Sine operation (no field operator):") +println("\tBandwidth: $sin_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(sin_results.times))) ms\n") + +println("Field Operator sine operation:") +println("\tBandwidth: $fo_sin_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_sin_results.times))) ms\n") + +println("Cosine operation (no field operator):") +println("\tBandwidth: $cos_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(cos_results.times))) ms\n") + +println("Field Operator cosine operation:") +println("\tBandwidth: $fo_cos_bandwidth GB/s") +println("\tTime taken: $(ns_to_ms(median(fo_cos_results.times))) ms\n") -a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) -af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) -SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c -SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf +println("Field Operator Remapping:") +println("\tTime taken: $(ns_to_ms(median(remapping_results.times))) ms\n") -run(SUITE, verbose = true, seconds = 1) +println("Field Operator Neighbor Sum:") +println("\tTime taken: $(ns_to_ms(median(neighbor_sum_results.times))) ms\n") diff --git a/benchmark/benchmarks_advection.jl b/benchmark/benchmarks_advection.jl new file mode 100644 index 0000000..d0e5da3 --- /dev/null +++ b/benchmark/benchmarks_advection.jl @@ -0,0 +1,102 @@ +using BenchmarkTools +using Statistics +using GridTools + 
+include("../advection/advection_setup.jl") + +# Advection Benchmarks + +SUITE = BenchmarkGroup() +SUITE["advection"]["upwind_julia_embedded"] = @benchmarkable upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + # embedded backend + ) + +SUITE["advection"]["upwind_python_backend"] = @benchmarkable upwind_scheme( + state.rho, + δt, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider, + backend = "py" + ) + +SUITE["advection"]["mpdata_program_julia_embedded"] = @benchmarkable mpdata_program( + state.rho, + δt, + ϵ, + mesh.vol, + metric.gac, + state.vel[1], + state.vel[2], + state.vel[3], + mesh.pole_edge_mask, + mesh.dual_face_orientation, + mesh.dual_face_normal_weighted_x, + mesh.dual_face_normal_weighted_y, + out = state_next.rho, + offset_provider = mesh.offset_provider + ) + +# TODO: disabled because the backend is not currently supporting it (the backend is too slow) +# SUITE["advection"]["mpdata_program_python_backend"] = @benchmarkable mpdata_program( +# state.rho, +# δt, +# ϵ, +# mesh.vol, +# metric.gac, +# state.vel[1], +# state.vel[2], +# state.vel[3], +# mesh.pole_edge_mask, +# mesh.dual_face_orientation, +# mesh.dual_face_normal_weighted_x, +# mesh.dual_face_normal_weighted_y, +# out = state_next.rho, +# offset_provider = mesh.offset_provider, +# backend = "py" +# ) + +# Run the benchmark suite +println("Running the advection suite...") +advection_results = run(SUITE) + +upwind_embedded_results = advection_results["advection"]["upwind_julia_embedded"] +upwind_python_backend_results = 
advection_results["advection"]["upwind_python_backend"] +mpdata_embedded_results = advection_results["advection"]["mpdata_program_julia_embedded"] +# mpdata_python_backend_results = advection_results["advection"]["mpdata_program_python_backend"] + +# Function to convert nanoseconds to milliseconds for clearer output +ns_to_ms(time_ns) = time_ns / 1e6 + +println("Upwind scheme julia (embedded):") +println("\tTime taken: $(ns_to_ms(median(upwind_embedded_results.times))) ms\n") + +println("Upwind scheme julia (python backend):") +println("\tTime taken: $(ns_to_ms(median(upwind_python_backend_results.times))) ms\n") + +println("Mpdata program julia (embedded):") +println("\tTime taken: $(ns_to_ms(median(mpdata_embedded_results.times))) ms\n") + +# println("Mpdata program julia (python backend):") +# println("\tTime taken: $(ns_to_ms(median(mpdata_python_backend_results.times))) ms\n") diff --git a/benchmark/benchmarks_gpu.jl b/benchmark/benchmarks_gpu.jl new file mode 100644 index 0000000..fa6e507 --- /dev/null +++ b/benchmark/benchmarks_gpu.jl @@ -0,0 +1,143 @@ +using BenchmarkTools +using CUDA +using GridTools +using GridTools.ExampleMeshes.Unstructured + +# Data size +const STREAM_SIZE::Int64 = 10_000_000 + +""" + compute_memory_bandwidth_addition(time_in_seconds, a, b, out)::Tuple{Float64, Int64} + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `time_in_seconds`: The execution time in seconds. +- `STREAM_SIZE`: the size used for the arrays + +# Returns +- A tuple `(bandwidth, data_size)` where: + - `bandwidth`: The memory bandwidth in gigabytes per second (GB/s). + - `data_size`: The total size of the data processed in bytes. 
+""" +function compute_memory_bandwidth_addition(time_in_seconds::Float64, STREAM_SIZE::Int64, data_type::Type)::Tuple{Float64, Int64} + # Calculate the total size of data read and written in bytes + data_size = 3 * STREAM_SIZE * sizeof(data_type) # (a + b + out), each Float64 is 8 bytes + + # Calculate memory bandwidth in GB/s + bandwidth = data_size / time_in_seconds / 1e9 + + return bandwidth, data_size +end + +# Util for pretty print the results +function format_number_with_dots(n::Int) + return reverse(join(Iterators.partition(reverse(string(n)), 3), ".")) +end + +# GPU Setup Functions ----------------------------------------------------------------------------------------- + +""" + gpu_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the GPU broadcast addition benchmark using CuArray. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the GPU arrays to be generated. + +# Returns +- `a_gpu`, `b_gpu`, `out_gpu`: Three CuArray GPU arrays of size `ARRAY_SIZE`. +""" +function gpu_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{CuArray{Float64,1}, CuArray{Float64,1}, CuArray{Float64,1}} + randcuarr = () -> CuArray(rand(Float64, ARRAY_SIZE)) + a_gpu = randcuarr() + b_gpu = randcuarr() + out_gpu = randcuarr() + return a_gpu, b_gpu, out_gpu +end + +""" + gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the GPU field broadcast addition benchmark using CuArray. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. + +# Returns +- `a, b`: Two randomly generated fields of CuArray floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. 
+""" +function gpu_fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + randfieldcuarr = () -> Field(Cell, CuArray(rand(Float64, FIELD_DATA_SIZE))) + a_gpu = randfieldcuarr() + b_gpu = randfieldcuarr() + out_gpu = randfieldcuarr() + return a_gpu, b_gpu, out_gpu +end + +# CuArray only +function arr_add_wrapper!(out::CuArray{Float64,1}, a::CuArray{Float64,1}, b::CuArray{Float64,1}) + CUDA.@sync begin + out = a .+ b + end +end + +# Fields only +function field_add_wrapper!(out::Field{Tuple{Cell_},Float64}, a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64}) + CUDA.@sync begin + out = a .+ b + end +end + +# Field operator +@field_operator function gpu_fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +function gpu_fo_addition_wrapper!(out::Field{Tuple{Cell_},Float64}, a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64}) + CUDA.@sync begin + gpu_fo_addition(a, b, backend="embedded", out=out) + end +end + +# Benchmarks with @belapsed + +# CuArray ----------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_broadcast_addition_setup(STREAM_SIZE) + +println("Benchmarking GPU array broadcast addition:") +gpu_array_time = @belapsed arr_add_wrapper!($out_gpu, $a_gpu, $b_gpu) + +# Compute memory bandwidth for GPU array benchmark +gpu_array_bandwidth, data_size_arr_gpu = compute_memory_bandwidth_addition(gpu_array_time, STREAM_SIZE, eltype(a_gpu)) +println("GPU Array broadcast addition:") +println("\tData size: $(format_number_with_dots(data_size_arr_gpu)) bytes") +println("\tTime: $gpu_array_time s") +println("\tBandwidth: $gpu_array_bandwidth GB/s\n") + +# Fields ------------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) + +println("Benchmarking GPU 
fields broadcast addition:") +gpu_fields_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) + +# Compute memory bandwidth for GPU fields benchmark +gpu_fields_bandwidth, data_size_fields_gpu = compute_memory_bandwidth_addition(gpu_fields_time, STREAM_SIZE, eltype(a_gpu.data)) +println("GPU Fields broadcast addition:") +println("\tData size: $(format_number_with_dots(data_size_fields_gpu)) bytes") +println("\tTime: $gpu_fields_time s") +println("\tBandwidth: $gpu_fields_bandwidth GB/s\n") + +# Field operator ------------------------------------------------------------------------------------------------------- +a_gpu, b_gpu, out_gpu = gpu_fields_broadcast_addition_setup(STREAM_SIZE) + +println("Benchmarking GPU field operator broadcast addition:") +gpu_fo_time = @belapsed field_add_wrapper!($out_gpu, $a_gpu, $b_gpu) + +# Compute memory bandwidth for GPU field operator benchmark +gpu_fo_bandwidth, data_size_fo_gpu = compute_memory_bandwidth_addition(gpu_fo_time, STREAM_SIZE, eltype(a_gpu.data)) +println("GPU Field Operator broadcast addition:") +println("\tData size: $(format_number_with_dots(data_size_fo_gpu)) bytes") +println("\tTime: $gpu_fo_time s") +println("\tBandwidth: $gpu_fo_bandwidth GB/s\n") diff --git a/benchmark/benchmarks_old.jl b/benchmark/benchmarks_old.jl new file mode 100644 index 0000000..0bb429f --- /dev/null +++ b/benchmark/benchmarks_old.jl @@ -0,0 +1,103 @@ +using Pkg +path_to_package = joinpath(@__DIR__, "..") # Assuming the benchmarks.jl file is in the "benchmark" directory +push!(LOAD_PATH, path_to_package) +using BenchmarkTools +using GridTools + +# Mesh definitions ------------------------------------------------------------------------------------------- +# const global Cell_ = Dimension{:Cell_, HORIZONTAL} +# const global K_ = Dimension{:K_, HORIZONTAL} +# const global Cell = Cell_() +# const global K = K_() +# const global Edge_ = Dimension{:Edge_, HORIZONTAL} +# const global Edge = Edge_() +# const global E2CDim_ = 
Dimension{:E2CDim_, LOCAL} +# const global E2CDim = E2CDim_() + + +# function setup_simple_connectivity()::Dict{String,Connectivity} +# edge_to_cell_table = [ +# [1 -1]; +# [3 -1]; +# [3 -1]; +# [4 -1]; +# [5 -1]; +# [6 -1]; +# [1 6]; +# [1 2]; +# [2 3]; +# [2 4]; +# [4 5]; +# [5 6] +# ] + +# cell_to_edge_table = [ +# [1 7 8]; +# [8 9 10]; +# [2 3 9]; +# [4 10 11]; +# [5 11 12]; +# [6 7 12] +# ] + +# E2C_offset_provider = Connectivity(edge_to_cell_table, Cell, Edge, 2) +# C2E_offset_provider = Connectivity(cell_to_edge_table, Edge, Cell, 3) + +# offset_provider = Dict{String,Connectivity}( +# "E2C" => E2C_offset_provider, +# "C2E" => C2E_offset_provider, +# "E2CDim" => E2C_offset_provider # TODO(lorenzovarese): this is required for the embedded backend (note: python already uses E2C) +# ) + +# return offset_provider +# end + +SUITE = BenchmarkGroup() + +# Legacy Suite with first tests +SUITE["arith_broadcast"] = BenchmarkGroup() + +a = rand(1000, 1000); b = rand(1000,1000); c = rand(1000,1000) +af = Field((Cell, K), rand(1000, 1000)); bf = Field((Cell, K), rand(1000, 1000)); cf = Field((Cell, K), rand(1000, 1000)) +SUITE["arith_broadcast"]["arrays"] = @benchmarkable a .+ b .- c +SUITE["arith_broadcast"]["fields"] = @benchmarkable af .+ bf .- cf + +# SUITE["field_operator"] = BenchmarkGroup() + +# # Benchmark for field operator addition +# function benchmark_fo_addition() +# a = Field(Cell, collect(1.0:15.0)) +# b = Field(Cell, collect(-1.0:-1:-15.0)) +# out = Field(Cell, zeros(Float64, 15)) + +# @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} +# return a .+ b +# end + +# @benchmarkable $fo_addition($a, $b, backend="embedded", out=$out) #setup=( +# # a = Field(Cell, collect(1.0:15.0)); +# # b = Field(Cell, collect(-1.0:-1:-15.0)); +# # out_field = Field(Cell, zeros(Float64, 15)); +# # @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, 
b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} return a .+ b end; +# # ) +# end + +# SUITE["field_operator"]["addition"] = benchmark_fo_addition() + +# # Benchmark for neighbor sum +# function benchmark_fo_neighbor_sum() +# offset_provider = setup_simple_connectivity(); +# a = Field(Cell, collect(5.0:17.0) * 3); +# E2C = FieldOffset("E2C", source=Cell, target=(Edge, E2CDim)) +# out_field = Field(Edge, zeros(Float64, 12)) + +# @field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} +# return neighbor_sum(a(E2C), axis=E2CDim) +# end + +# @benchmarkable $fo_neighbor_sum($a, offset_provider=$offset_provider, out=$out_field) +# end + +# SUITE["field_operator"]["neighbor_sum"] = benchmark_fo_neighbor_sum() + +run(SUITE, verbose = true, seconds = 1) diff --git a/benchmark/utils/autorun_benchmarks.sh b/benchmark/utils/autorun_benchmarks.sh new file mode 100755 index 0000000..58a0f0c --- /dev/null +++ b/benchmark/utils/autorun_benchmarks.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +# This script automates the process of benchmarking recent changes by tagging +# the last two commits and running benchmarks using the AirspeedVelocity package. +# It supports conditional execution based on user input to include specific benchmarks +# for advection and allows dynamic configuration of execution threads. +# +# Usage: +# ./autorun_benchmarks.sh [--advection] [--threads=NUM] +# --advection: Optional. If specified, runs advection-specific benchmarks. +# --threads=NUM: Optional. Specifies the number of threads to use. Default is 8. + +# Default number of threads +threads=8 + +# Function to display usage +usage() { + echo "Usage: $0 [--advection] [--threads=NUM]" + echo " --advection: Run the advection comparison with specific benchmark script." + echo " --threads=NUM: Specify the number of threads (default is 8)." 
+ exit 1 +} + +# Parse command-line arguments +for arg in "$@" +do + case $arg in + --advection) + advection=true + shift # Remove --advection from processing + ;; + --threads=*) + threads="${arg#*=}" + shift # Remove --threads=NUM from processing + ;; + *) + # Unknown option + usage + ;; + esac +done + +# Retrieve last two commit hashes +before_debug=$(git rev-parse HEAD~1) +after_debug=$(git rev-parse HEAD) + +# Tag the last two commits if they are not already tagged +git tag -f after_debug $after_debug +git tag -f before_debug $before_debug + +# Print the before and after tags with their messages +git tag -n | grep -E 'before_debug|after_debug' | while IFS= read -r line; do echo "$line"; done ; echo "" + +# Conditional command based on the --advection flag +if [ "$advection" == true ]; then + # Set the benchmark script for advection + benchmark_script="benchmark/benchmarks_advection.jl" + command="benchpkg --rev=$before_debug,$after_debug \ + -s $benchmark_script \ + --bench-on=$after_debug \ + --exeflags=\"--threads=$threads\"" +else + command="benchpkg --rev=$before_debug,$after_debug \ + --bench-on=$after_debug \ + --exeflags=\"--threads=$threads\"" +fi + +# Print and execute the command +echo "Executing command: $command" +eval $command diff --git a/benchmark/utils/setup_benchmark_interactive.jl b/benchmark/utils/setup_benchmark_interactive.jl new file mode 100644 index 0000000..7e2aad1 --- /dev/null +++ b/benchmark/utils/setup_benchmark_interactive.jl @@ -0,0 +1,299 @@ +# setup_benchmark_interactive.jl + +# This script is intended for interactive usage during development and benchmarking sessions. +# It sets up a Julia environment with necessary packages and predefined functions for running various benchmarks. +# This allows developers to interactively profile and debug performance issues in real-time. +# +# Usage Example: +# Start Julia with the appropriate project settings and thread configuration: +# $ julia --project=. 
--threads 8 +# +# Inside the Julia REPL, load the benchmark setup: +# julia> include("setup_benchmark_interactive.jl") +# This will load all necessary modules and display the current thread usage. +# +# To run and profile a specific operation, use: +# julia> a, out = single_field_setup(STREAM_SIZE) +# julia> @profile fo_sin(a, backend="embedded", out=out) +# This will profile the `fo_sin` operation and print profiling results. + +include("../../advection/advection_setup.jl") + +using BenchmarkTools +using Statistics +using GridTools +using GridTools.ExampleMeshes.Unstructured +using GridTools.ExampleMeshes.Cartesian +using Profile +using Base.Threads + +# Data size +const global STREAM_SIZE = 10_000_000 + +# Utils ------------------------------------------------------------------------------------------------------ + +# Useful for the benchmark of the field remapping operation +function create_large_connectivity(size::Int) + edge_to_cell_table = vcat([rand(1:size, (1, 2)) for _ in 1:size]...) + cell_to_edge_table = vcat([rand(1:size, (1, 3)) for _ in 1:size]...) + + E2C = Connectivity(edge_to_cell_table, Cell, Edge, 2) + C2E = Connectivity(cell_to_edge_table, Edge, Cell, 3) + + Dict( + "E2C" => E2C, + "C2E" => C2E, + "E2CDim" => E2C # TODO: remove it + ) +end + +""" + compute_memory_bandwidth_single(results, a, out)::Float64 + +Calculates the memory bandwidth for operations that involve a single input and output field based on benchmark results. + +This function measures how efficiently data is transferred to and from memory during the execution of a benchmarked operation. + +# Arguments +- `results`: The benchmark results object containing timing and other performance data. +- `a`: The input field used in the benchmark. +- `out`: The output field produced by the benchmark. 
+ +# Returns +- `bandwidth`: The computed memory bandwidth in gigabytes per second (GB/s), which represents the rate at which data is read from and written to the system memory during the operation. + +# Calculation Details +- `data_size`: Sum of the sizes of the input and output data in bytes. +- `time_in_seconds`: The median execution time of the benchmark, converted from nanoseconds to seconds. +- `bandwidth`: Calculated as the total data transferred divided by the time taken, expressed in GB/s. +""" +function compute_memory_bandwidth_single(results, a, out=a)::Float64 + data_size = sizeof(a.data) + sizeof(out.data) # Read from a and write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth +end + +""" + compute_memory_bandwidth_addition(results, a, b, out) + +Function to compute the memory bandwidth for the addition benchmarks. + +# Arguments +- `results`: Benchmark results. +- `a, b`: The input arrays/fields used in the benchmark. +- `out`: The output array/field of the benchmark. + +# Returns +- The computed memory bandwidth in GB/s. +""" +function compute_memory_bandwidth_addition(results, a, b, out)::Tuple{Float64, Int64} + @assert sizeof(a.data) == sizeof(b.data) == sizeof(out.data) + data_size = sizeof(a.data) + sizeof(b.data) + sizeof(out.data) # Read a and b, write to out + time_in_seconds = median(results.times) / 1e9 # Convert ns to s + bandwidth = data_size / time_in_seconds / 1e9 # GB/s + return bandwidth, data_size +end + +# Operations ------------------------------------------------------------------------------------------------- + +""" + single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + +Setup function to create a field and a similar output field for benchmarking operations that require a single input field. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the field to be generated. 
+ +# Returns +- `a`: A randomly generated field of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`, used for storing operation results. +""" +function single_field_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, out +end + +""" + array_broadcast_addition_setup(ARRAY_SIZE::Int64) + +Setup function for the Julia broadcast addition benchmark. + +# Arguments +- `ARRAY_SIZE::Int64`: The size of the arrays to be generated. + +# Returns +- `a, b`: Two randomly generated arrays of integers of size `ARRAY_SIZE`. +- `data_size`: The total size of the data processed. +""" +function array_broadcast_addition_setup(ARRAY_SIZE::Int64)::Tuple{Array{Float64,1}, Array{Float64,1}, Int64} + a = rand(Float64, ARRAY_SIZE) + b = rand(Float64, ARRAY_SIZE) + data_size = sizeof(a) + sizeof(b) # Total bytes processed + return a, b, data_size +end + +""" + broadcast_addition_array(a::Array{Float64}, b::Array{Float64}) + +Core operation for the Julia broadcast addition benchmark. + +# Arguments +- `a, b`: Two arrays to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +function broadcast_addition_array(a::Array{Float64}, b::Array{Float64})::Array{Float64,1} + return a .+ b +end + +""" + broadcast_addition(a::Field, b::Field) + +Core operation for the broadcast addition of two Field benchmark. +Useful to asses and track possible overhead on fields. + +# Arguments +- `a, b`: Two field to be added. + +# Returns +- The result of element-wise addition of the data of the fields `a` and `b`. +""" +function broadcast_addition_fields(a::Field, b::Field)::Field + return a .+ b +end + +""" + fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64) + +Setup function for the field operator broadcast addition benchmark. + +# Arguments +- `FIELD_DATA_SIZE::Int64`: The size of the fields to be generated. 
+ +# Returns +- `a, b`: Two randomly generated fields of floats of size `FIELD_DATA_SIZE`. +- `out`: An output field similar to `a`. +""" +function fields_broadcast_addition_setup(FIELD_DATA_SIZE::Int64)::Tuple{Field, Field, Field} + a = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + b = Field(Cell, rand(Float64, FIELD_DATA_SIZE)) + out = GridTools.similar_field(a) + return a, b, out +end + +""" + fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Core operation for the field operator broadcast addition benchmark. + +# Arguments +- `a, b`: Two fields to be added. + +# Returns +- The result of element-wise addition of `a` and `b`. +""" +@field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b +end + +""" + sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the sine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +function sin_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Applies the cosine function element-wise to the data of a field without using a field operator. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +function cos_without_fo(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the sine function element-wise to the data of a field. 
+ +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the sine of the corresponding element in the input field `a`. +""" +@field_operator function fo_sin(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return sin.(a) +end + +""" + fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + +Field operator that applies the cosine function element-wise to the data of a field. + +# Arguments +- `a`: Input field containing Float64 data. + +# Returns +- A new field where each element is the cosine of the corresponding element in the input field `a`. +""" +@field_operator function fo_cos(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return cos.(a) +end + +""" + fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that performs remapping from cell-based data to edge-based data. + +This operator utilizes a connectivity table (`E2C`) to map the values from cells to edges, implying a transformation from the cell-centered field to an edge-centered field based on predefined relationships in the connectivity table. + +# Arguments +- `a`: Input field containing Float64 data structured around cells. + +# Returns +- A new field where each element represents data remapped from cells to edges, structured as specified by the edge-to-cell connectivity. +""" +@field_operator function fo_remapping(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return a(E2C[1]) +end + +""" + fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + +Field operator that computes the sum of neighboring cell values for each edge. This function leverages the connectivity table (`E2C`), which defines the relationship between edges and cells, to sum the values of cells that are connected to each edge. 
+ +The summation is performed across the dimension specified by `E2CDim`, ensuring that each edge aggregates values from its associated cells correctly. + +# Arguments +- `a`: Input field containing Float64 data, where each cell contains a numerical value. + +# Returns +- A new field where each edge holds the summed value of its neighboring cells, based on the edge-to-cell connectivity defined in `E2C`. +""" +@field_operator function fo_neighbor_sum(a::Field{Tuple{Cell_},Float64})::Field{Tuple{Edge_},Float64} + return neighbor_sum(a(E2C), axis=E2CDim) +end + +# Start ------------------------------------------------------------------------------------------------------ +println("Current number of threads: ", Threads.nthreads()) +println("The environment is ready\n") +Profile.clear() diff --git a/notes/Benchmarks.jl b/notes/Benchmarks.jl index 5d390ec..b271d89 100644 --- a/notes/Benchmarks.jl +++ b/notes/Benchmarks.jl @@ -59,7 +59,7 @@ using Profile # Benchmark for Julia and Python implementations of advection ############################################################################################################## -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") println("Starting julia embedded benchmark") @@ -81,7 +81,7 @@ bench_julia_embedded = @benchmark upwind_scheme( println("Finished Julia embedded benchmark") -include("../advection/advection_miniapp.jl") +include("../advection/advection_setup.jl") println("Starting julia python benchmark") diff --git a/src/ExampleMeshes.jl b/src/ExampleMeshes.jl index 96612cf..6d5d237 100644 --- a/src/ExampleMeshes.jl +++ b/src/ExampleMeshes.jl @@ -11,7 +11,7 @@ export Cell, K, Edge, Vertex, V2VDim, V2EDim, E2VDim, E2CDim, C2EDim export V2V, E2V, V2E, E2C, C2E, Koff const global Cell_ = Dimension{:Cell_, HORIZONTAL} -const global K_ = Dimension{:K_, HORIZONTAL} +const global K_ = Dimension{:K_, VERTICAL} const global Edge_ = Dimension{:Edge_, HORIZONTAL} const global Vertex_ = 
Dimension{:Vertex_, HORIZONTAL} const global V2VDim_ = Dimension{:V2VDim_, LOCAL} diff --git a/src/GridTools.jl b/src/GridTools.jl index 580be0a..865d40c 100644 --- a/src/GridTools.jl +++ b/src/GridTools.jl @@ -9,7 +9,7 @@ using Profile using Base: @propagate_inbounds using MacroTools using OffsetArrays: IdOffsetRange -using Debugger +using CUDA import Base.Broadcast: Extruded, Style, BroadcastStyle, ArrayStyle, Broadcasted @@ -157,6 +157,30 @@ julia> field = Field(Cell, ones(5)) julia> field(E2C) julia> field(E2C[1]) ``` + +GPU arrays are supported too. + +# Examples +```julia-repl +julia> using GridTools + +julia> using CUDA: CuArray + +julia> using GridTools.ExampleMeshes.Unstructured + + # Create a CuArray of data on the GPU + +julia> gpu_data = CuArray(reshape(collect(1.0:12.0), (3, 4))); + + # Create a Field passing data in the CuArray type + +julia> gpu_field = Field((Cell,K), gpu_data); + + # Check the type + +julia> Base.typeof(gpu_field.data) +CuArray{Float64, 2, CUDA.DeviceMemory} +``` """ struct Field{ B_Dim <: Tuple{Vararg{Dimension}}, @@ -475,6 +499,7 @@ Base.convert(t::Type{T}, F::Field) where {T <: Number} = inds::Vararg{Int, N} ) where {BD, T, N} new_inds = inds .- F.origin + # @assert Tuple(1 for i in 1:length(new_inds)) <= new_inds <= size(F.data) "Error: $new_inds, $(size(F.data)), $(F.origin)" return F.data[new_inds...] 
end @propagate_inbounds function Base.setindex!( @@ -488,8 +513,9 @@ end Base.showarg(io::IO, @nospecialize(F::Field), toplevel) = print(io, eltype(F), " Field with dimensions ", get_dim_name.(F.broadcast_dims)) function slice(F::Field, inds...)::Field + @assert all(typeof(x) <: UnitRange{Int64} for x in inds) # TODO: understand why the line below is filtering the UnitRange only dim_ind = findall(x -> typeof(x) <: UnitRange{Int64}, inds) - return Field(F.dims[dim_ind], view(F.data, inds...), F.broadcast_dims) + return Field(F.dims[dim_ind], view(F.data, inds...), F.broadcast_dims, origin=Dict(d=>ind[1]-1 for (d,ind) in zip(F.dims, inds))) end # Connectivity struct ------------------------------------------------------------ @@ -561,7 +587,6 @@ function (fo::FieldOp)( out = nothing, kwargs... ) - is_outermost_fo = isnothing(OFFSET_PROVIDER) if is_outermost_fo @assert !isnothing(out) "Must provide an out field." @@ -609,6 +634,20 @@ function backend_execution( end end +# It is not currently working in all edge cases +function check_gpu_data(args::Tuple)::Nothing + has_CuArray::Bool = false + for (i, arg) in enumerate(args) + if arg !== nothing && typeof(arg) <: AbstractArray && typeof(arg.data) <: CuArray + has_CuArray = true + end + + if has_CuArray + throw(ArgumentError("GPU Arrays (CuArray) are not supported by the Python backend. 
Error found in argument #$i: $(typeof(arg.data)).")) + end + end +end + function backend_execution( backend::Val{:py}, fo::FieldOp, @@ -624,6 +663,7 @@ function backend_execution( f = py_field_operator(fo) FIELD_OPERATORS[fo.name] = f end + # check_gpu_data(args) # TODO: throw an exception in case of gpu arrays passed to the python backend p_args, p_kwargs, p_out, p_offset_provider = py_args.((args, kwargs, out, GridTools.OFFSET_PROVIDER)) if is_outermost_fo @@ -705,7 +745,7 @@ macro module_vars() name => Core.eval(Base, name) for name in [:Int64, :Int32, :Float32, :Float64] ) - all_names = names(@__MODULE__) + all_names = names(@__MODULE__, all=true) used_modules = ccall(:jl_module_usings, Any, (Any,), @__MODULE__) for m in used_modules append!(all_names, names(m)) @@ -757,5 +797,6 @@ end generate_unique_name(name::Symbol, value::Integer = 0) = Symbol("$(name)ᐞ$(value)") include("ExampleMeshes.jl") +include("atlas/AtlasMeshes.jl") end diff --git a/src/atlas/atlas_mesh.jl b/src/atlas/AtlasMeshes.jl similarity index 96% rename from src/atlas/atlas_mesh.jl rename to src/atlas/AtlasMeshes.jl index be45be3..dbce49d 100644 --- a/src/atlas/atlas_mesh.jl +++ b/src/atlas/AtlasMeshes.jl @@ -1,9 +1,19 @@ # ENV["PYCALL_JL_RUNTIME_PYTHON"] = Sys.which("python3.10") # ENV["PYTHONBREAKPOINT"] = "pdb.set_trace" +module AtlasMeshes + +using GridTools +using GridTools.ExampleMeshes.Unstructured using PyCall -atlas = pyimport("atlas4py") +export AtlasMesh, atlas, update_periodic_layers, DIMENSION_TO_SIZE_ATTR + +const atlas = PyNULL() + +function __init__() + copy!(atlas, pyimport("atlas4py")) +end const rpi = 2.0 * asin(1.0) const _deg2rad = 2.0 * rpi / 360.0 @@ -260,7 +270,11 @@ struct AtlasMesh "V2V" => v2v, "V2E" => v2e, "E2V" => e2v, - "Koff" => K + "Koff" => K, + # TODO: cleanup + "V2VDim" => v2v, + "V2EDim" => v2e, + "E2VDim" => e2v, ) remote_indices = Dict{Dimension, Array}( @@ -357,3 +371,5 @@ function update_periodic_layers(mesh::AtlasMesh, field::Field) ) 
field[periodic_indices, :] .= field[remote_indices[periodic_indices], :] end + +end # AtlasMeshes module \ No newline at end of file diff --git a/src/embedded/builtins.jl b/src/embedded/builtins.jl index 6ddf639..bdb512b 100644 --- a/src/embedded/builtins.jl +++ b/src/embedded/builtins.jl @@ -40,17 +40,31 @@ function min_over(field_in::Field; axis::Dimension)::Field return reduction_master(field_in, axis, minimum) end +""" + reduction_master(field_in::Field, axis::Dimension, f::Function)::Field +Performs a reduction operation (`sum`, `minimum`, `maximum`, etc.) over a specific axis dimension. +This version supports both CPU and GPU fields. +""" function reduction_master(field_in::Field, axis::Dimension, f::Function) neutral_el = get_neutral(f, eltype(field_in)) dim = get_dim_ind(field_in.dims, axis) conn = OFFSET_PROVIDER[get_dim_name(axis)] - data = dropdims( - f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim), - dims = dim - ) - return Field((field_in.dims[1:dim-1]..., field_in.dims[dim+1:end]...), data) + + if isa(field_in.data, CuArray) + # GPU version using CUDA parallelization + reduced_data = CUDA.fill(neutral_el, size(field_in.data)) + CUDA.@sync reduced_data .= f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim) + reduced_data = dropdims(reduced_data, dims = dim) + else + # CPU version + reduced_data = dropdims( + f(ifelse.(conn.data .!= -1, field_in.data, neutral_el), dims = dim), + dims = dim + ) + end + return Field((field_in.dims[1:dim-1]..., field_in.dims[dim+1:end]...), reduced_data) end get_neutral(f::typeof(sum), type::DataType) = convert(type, 0) diff --git a/src/embedded/cust_broadcast.jl b/src/embedded/cust_broadcast.jl index 0b0ad16..bf6acbb 100644 --- a/src/embedded/cust_broadcast.jl +++ b/src/embedded/cust_broadcast.jl @@ -1,3 +1,6 @@ + +using Base.Threads: @threads + Base.BroadcastStyle(::Type{<:Field}) = Broadcast.ArrayStyle{Field}() # TODO(tehrengruber): Implement a range with an attached dimension instead 
of this single object @@ -66,7 +69,7 @@ function get_size_ifelse(mask::FieldShape, branch::FieldShape) out_size = [branch.axes...] ind_mask = findall(x -> x in branch.dims, mask.dims) ind_out = findall(x -> x in mask.dims, branch.dims) - + # TODO: this is not correct if the mask has an origin out_size[ind_out] .= mask.axes[ind_mask] return FieldShape(branch.dims, Tuple(out_size), branch.broadcast_dims) @@ -230,15 +233,42 @@ end # ----------------------------------------------------------------------------------------------------------------------------------------- +function is_gpu_compatible(bc::Broadcasted{ArrayStyle{Field}})::Bool + is_all_CuArray::Bool = false + has_CuArray::Bool = false + has_CPUArray::Bool = false + + for arg in bc.args + if typeof(arg) <: AbstractArray + # Check if the argument is a CuArray + if typeof(arg.data) <: CuArray + has_CuArray = true + is_all_CuArray = true + # Check if the argument is a CPU array + elseif typeof(arg.data) <: Vector + has_CPUArray = true + end + end + + # If both a CuArray and a CPU Array are present, raise an error + if has_CuArray && has_CPUArray + throw(ErrorException("Cannot have both CuArray and CPU arrays in the same args.")) + end + end + + return is_all_CuArray +end + # Creates uninitialized output object function Base.similar(bc::Broadcasted{ArrayStyle{Field}}, ::Type{ElType}) where {ElType} offsets = getproperty.(axes(bc), :start) .- 1 + is_cuarray::Bool = is_gpu_compatible(bc) Field( - bc.axes.dims, - similar(Array{ElType}, getproperty.(axes(bc), :stop) .- offsets), - bc.axes.broadcast_dims, - offsets - ) + bc.axes.dims, + similar(is_cuarray ? 
CuArray{ElType} : Array{ElType}, getproperty.(axes(bc), :stop) .- offsets), + bc.axes.broadcast_dims, + offsets + ) end # ----------------------------------------------------------------------------------------------------------------------------------------- @@ -249,17 +279,31 @@ end if axes(dest) == axes(bc) && bc.f === identity && bc.args isa Tuple{AbstractArray} # only a single input argument to broadcast! A = bc.args[1] if axes(dest) == axes(A) - return copyto!(dest, A) + if isa(A.data, CuArray) + return CUDA.copyto!(dest.data, A.data) # Use @GPUArrays copyto! + else + return copyto!(dest, A) + end end end - bc′ = Base.Broadcast.preprocess(shape(dest), bc) + if isa(dest.data, CuArray) + # Extract the function and the arguments from the broadcasted expression + f = bc.f + args = bc.args + + # Apply the function f element-wise to the arguments and store the result in dest.data + CUDA.map!(f, dest.data, map(arg -> arg.data, args)...) + else + bc′ = Base.Broadcast.preprocess(shape(dest), bc) - # Performance may vary depending on whether `@inbounds` is placed outside the - # for loop or not. (cf. https://github.com/JuliaLang/julia/issues/38086) - @inbounds @simd for I in eachindex(dest) - dest[I] = bc′[I] + # Performance may vary depending on whether `@inbounds` is placed outside the + # for loop or not. (cf. 
https://github.com/JuliaLang/julia/issues/38086) + @inbounds @simd for I in eachindex(dest) + dest[I] = bc′[I] + end end + return dest end diff --git a/src/examples/example_gpu.jl b/src/examples/example_gpu.jl new file mode 100644 index 0000000..8954a70 --- /dev/null +++ b/src/examples/example_gpu.jl @@ -0,0 +1,39 @@ +using GridTools +using GridTools.ExampleMeshes.Unstructured +using CUDA +using Profile +using Debugger +using BenchmarkTools + +# Cpu + +a_cpu = Field(Cell, collect(1:2e7)) +b_cpu = Field(Cell, collect(1:2e7)) + +out_cpu = similar(a_cpu) + +out_cpu = a_cpu .+ b_cpu + +# Gpu + +a_gpu = Field(Cell, CuArray(1:2e7)) +b_gpu = Field(Cell, CuArray(1:2e7)) + +out_gpu = similar_field(a_gpu) + +out_gpu .= a_gpu .+ b_gpu + +function bench_cpu!(a_cpu, b_cpu, out_cpu) + out_cpu = a_cpu .+ b_cpu +end + +function bench_gpu!(a_gpu, b_gpu, out_gpu) + # Wrapping the execution in a CUDA.@sync block will make + # the CPU block until the queued GPU tasks are done, similar to how Base.@sync waits for distributed CPU tasks + CUDA.@sync begin + out_gpu = a_gpu .+ b_gpu + end +end + +@btime bench_cpu!($a_cpu, $b_cpu, $out_cpu) +@btime bench_gpu!($a_gpu, $b_gpu, $out_gpu) \ No newline at end of file diff --git a/src/gt2py/jast_to_foast.jl b/src/gt2py/jast_to_foast.jl index c843059..f0663c7 100644 --- a/src/gt2py/jast_to_foast.jl +++ b/src/gt2py/jast_to_foast.jl @@ -266,6 +266,7 @@ end function visit_(sym::Val{:call}, args::Array, outer_loc) if args[1] in bin_op + # TODO: check the case where a unary expression, that is at the same time binary operation is encountered: i.e. -x @assert length(args)==3 "Expected a binary operation. AST must be canonicalized using `canonicalize_arithmetic_ops` first." 
return foast.BinOp( op = visit(args[1]), diff --git a/test/embedded_test.jl b/test/embedded_test.jl index 770cfed..b61db94 100644 --- a/test/embedded_test.jl +++ b/test/embedded_test.jl @@ -135,8 +135,8 @@ end # Broadcast ------------------------- - @test typeof(broadcast(cell_values, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, HORIZONTAL}}, Float64, 1, Tuple{Dimension{:Cell_, HORIZONTAL}}, Vector{Float64}} - @test typeof(broadcast(5.0, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, HORIZONTAL}}, Float64, 0, Tuple{}, Array{Float64, 0}} + @test typeof(broadcast(cell_values, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, VERTICAL}}, Float64, 1, Tuple{Dimension{:Cell_, HORIZONTAL}}, Vector{Float64}} + @test typeof(broadcast(5.0, (Cell, K))) == Field{Tuple{Dimension{:Cell_, HORIZONTAL}, Dimension{:K_, VERTICAL}}, Float64, 0, Tuple{}, Array{Float64, 0}} # Where ----------------------------------------- diff --git a/test/gpu_test.jl b/test/gpu_test.jl new file mode 100644 index 0000000..3f7fecb --- /dev/null +++ b/test/gpu_test.jl @@ -0,0 +1,44 @@ +using Test +using CUDA: CuArray +using GridTools +using GridTools.ExampleMeshes.Unstructured + +@testset "Testset Simple Broadcast Addition GPU" begin + a_gpu = Field(Cell, CuArray(1.0:15.0)) + b_gpu = Field(Cell, CuArray(-2.0:-1:-16.0)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a_gpu and b_gpu do not have the same size of data." + + out_gpu = similar_field(a_gpu) + out_gpu = a_gpu .+ b_gpu + + @test all(out_gpu.data .== -1) +end + +@testset "Testset Large Broadcast Addition GPU" begin + # Initialize two large GPU fields with CuArray + a_gpu = Field(Cell, CuArray(1:2e7)) + b_gpu = Field(Cell, CuArray(1:2e7)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a_gpu and b_gpu do not have the same size of data." 
+ + out_gpu = similar_field(a_gpu) + out_gpu .= a_gpu .+ b_gpu + + expected_result = CuArray(2:2:2e7*2) + + @test all(out_gpu.data .== expected_result) +end + +@testset "Testset Field Operator Addition GPU" begin + a_gpu = Field(Cell, CuArray(1.0:15.0)) + b_gpu = Field(Cell, CuArray(-2.0:-1:-16.0)) + @assert size(a_gpu.data) == size(b_gpu.data) "Fields a and b do not have the same size of data." + + out_gpu = similar_field(a_gpu) + + @field_operator function fo_addition(a::Field{Tuple{Cell_},Float64}, b::Field{Tuple{Cell_},Float64})::Field{Tuple{Cell_},Float64} + return a .+ b + end + + fo_addition(a_gpu, b_gpu, backend="embedded", out=out_gpu) + @test all(out_gpu.data .== -1) +end diff --git a/test/gt2py_fo_exec.jl b/test/gt2py_fo_exec.jl index ec0014f..187eb23 100644 --- a/test/gt2py_fo_exec.jl +++ b/test/gt2py_fo_exec.jl @@ -564,6 +564,23 @@ function test_lap_lap(offset_provider::Dict{String, Dimension}, backend::String, # TODO: add in the future the test for the border values end +""" + test_slice() + +This test checks the `slice` function, which should correctly extract a subset of data from a larger field and properly adjust the origin to reflect the new sliced field's starting point. + +# Expected Behavior +- The sliced data should match the expected subset from the original field. +- The origin of the sliced field should be adjusted correctly to match the new starting index of the sliced data. 
+""" +function test_slice() + a::Field = Field((IDim,), [1; 2; 3; 4; 5]) + sliced_a = slice(a, 2:4) + @test sliced_a.data == [2; 3; 4] + @test sliced_a.origin == (2-1,) + @test sliced_a.dims == (IDim,) +end + # Test Executions -------------------------------------------------------------------------------------------- function test_gt4py_fo_exec() @@ -638,6 +655,8 @@ function test_gt4py_fo_exec() # testwrapper(setup_cartesian_offset_provider, test_lap_lap, "embedded", simple_cartesian_field) testwrapper(setup_cartesian_offset_provider, test_lap_lap, "py", simple_cartesian_field) + + testwrapper(nothing, test_slice) end @testset "Testset GT2Py fo exec" test_gt4py_fo_exec()