Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MPI and GPU tests on buildkite #28

Merged
merged 1 commit into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .buildkite/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Dependencies of the Julia environment stored in `.buildkite/`.
# The Buildkite pipeline selects this environment with `--project=.buildkite`,
# so every package the CI steps load must be listed here.
[deps]
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ClimaComms = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
ClimaCore = "d414da3d-4745-48bb-8d80-42e94e092884"
ClimaDiagnostics = "1ecacbb8-0713-4841-9a07-eb5aa8a2d53f"
ClimaTimeSteppers = "595c0a79-7f3d-439a-bc5a-b232dc3bde79"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
ProfileCanvas = "efd6af41-a80b-495e-886c-e51b0c7d77a3"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
49 changes: 49 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Buildkite pipeline: instantiate the `.buildkite` Julia environment once, then
# run the test suite on CPU, on GPU, and with MPI.

# Default agent settings; individual steps extend them with their own `agents:`.
agents:
  queue: new-central
  slurm_mem: 8G
  modules: climacommon/2024_04_30

env:
  # Append the `.buildkite` directory to the existing load path.
  JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite"
  # Depot shared by all steps of this pipeline.
  JULIA_DEPOT_PATH: "${BUILDKITE_BUILD_PATH}/${BUILDKITE_PIPELINE_SLUG}/depot/default"
  SLURM_KILL_BAD_EXIT: 1

steps:
  - label: "init :computer:"
    key: "init_cpu_env"
    command:
      # `$$` defers expansion of the variable to the shell that runs the step.
      - "echo $$JULIA_DEPOT_PATH"

      - echo "--- Instantiate project"
      - "julia --project=.buildkite -e 'using Pkg; Pkg.develop(; path = \".\")'"
      - "julia --project=.buildkite -e 'using Pkg; Pkg.instantiate(;verbose=true)'"
      - "julia --project=.buildkite -e 'using Pkg; Pkg.precompile()'"
      - "julia --project=.buildkite -e 'using Pkg; Pkg.status()'"
    agents:
      slurm_cpus_per_task: 8
      slurm_gpus: 1

  # Do not start any test step before the environment is ready.
  - wait

  - label: "Run tests on CPU"
    key: "cpu_tests"
    command:
      - "julia --color=yes --project=.buildkite test/runtests.jl"

  - label: "Run tests on GPU"
    key: "gpu_tests"
    command:
      - "julia --color=yes --project=.buildkite test/runtests.jl"
    env:
      CLIMACOMMS_DEVICE: "CUDA"
    agents:
      slurm_gpus: 1

  - label: "Run tests with MPI"
    key: "mpi_tests"
    command:
      - "srun julia --color=yes --project=.buildkite test/integration_test.jl"
    env:
      CLIMACOMMS_CONTEXT: "MPI"
    agents:
      slurm_ntasks: 2
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
version:
# - '1.9'
- '1.10'
- '~1.11.0-0'
# - '~1.11.0-0'
timeout-minutes: 30
steps:
- name: Checkout
Expand Down
2 changes: 1 addition & 1 deletion test/TestTools.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ function SphericalShellSpace(;
nelements = 10,
zelem = 10,
npolynomial = 4,
context = ClimaComms.SingletonCommsContext(),
context = ClimaComms.context(),
FT = Float64,
)
vertdomain = ClimaCore.Domains.IntervalDomain(
Expand Down
73 changes: 46 additions & 27 deletions test/integration_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ import NCDatasets

import ClimaDiagnostics

import ClimaComms
# `@import_required_backends` is only invoked when ClimaComms is at least 0.6
# (presumably the first release that provides the macro — TODO confirm).
@static if pkgversion(ClimaComms) >= v"0.6"
ClimaComms.@import_required_backends
end

# Communication context shared by every test in this file: it is passed to
# `setup_integrator` and used for the `bcast`/`iamroot` calls in the testsets.
# NOTE(review): which context is returned presumably depends on environment
# variables such as CLIMACOMMS_CONTEXT / CLIMACOMMS_DEVICE — confirm.
const context = ClimaComms.context()
ClimaComms.init(context)

include("TestTools.jl")

"""
Expand All @@ -15,11 +23,11 @@ Set up a full test problem
Increasing `more_compute_diagnostics` adds more copies of a compute diagnostic with no output.
Useful to stress allocations.
"""
function setup_integrator(output_dir; more_compute_diagnostics = 0)
function setup_integrator(output_dir; context, more_compute_diagnostics = 0)
t0 = 0.0
tf = 10.0
dt = 1.0
space = SphericalShellSpace()
space = SphericalShellSpace(; context)
args, kwargs = create_problem(space; t0, tf, dt)

@info "Writing output to $output_dir"
Expand Down Expand Up @@ -95,44 +103,55 @@ end

@testset "A full problem" begin
    mktempdir() do output_dir
        # Each process creates its own temporary directory; broadcasting the
        # path makes all of them agree on a single output directory.
        output_dir = ClimaComms.bcast(context, output_dir)

        # `context` is a required keyword of `setup_integrator`.
        integrator = setup_integrator(output_dir; context)

        SciMLBase.solve!(integrator)

        # Only the root process inspects the NetCDF files.
        if ClimaComms.iamroot(context)
            NCDatasets.NCDataset(joinpath(output_dir, "YO_1it_inst.nc")) do nc
                @test nc["YO"].attrib["short_name"] == "YO"
                @test nc["YO"].attrib["long_name"] == "YO YO, Instantaneous"
                @test size(nc["YO"]) == (11, 10, 5, 3)
            end

            NCDatasets.NCDataset(
                joinpath(output_dir, "YO_2it_average.nc"),
            ) do nc
                @test nc["YO"].attrib["short_name"] == "YO"
                @test nc["YO"].attrib["long_name"] ==
                      "YO YO, average within every 2 iterations"
                @test size(nc["YO"]) == (5, 10, 5, 3)
            end

            NCDatasets.NCDataset(joinpath(output_dir, "YO_3s_inst.nc")) do nc
                @test nc["YO"].attrib["short_name"] == "YO"
                @test nc["YO"].attrib["long_name"] == "YO YO, Instantaneous"
                @test size(nc["YO"]) == (4, 10, 5, 3)
            end
        end
    end
end

@testset "Performance" begin
    mktempdir() do output_dir
        # Make every process use the same output directory.
        output_dir = ClimaComms.bcast(context, output_dir)

        # Flame: every process runs the profiled solve, but only the root
        # process fetches the samples and writes the HTML report.
        integrator = setup_integrator(output_dir; context)
        Profile.@profile SciMLBase.solve!(integrator)
        if ClimaComms.iamroot(context)
            results = Profile.fetch()
            ProfileCanvas.html_file("flame.html", results)
        end

        # Allocations: same scheme, using the allocation profiler.
        integrator = setup_integrator(output_dir; context)
        Profile.Allocs.@profile SciMLBase.solve!(integrator)
        if ClimaComms.iamroot(context)
            results = Profile.Allocs.fetch()
            allocs = ProfileCanvas.view_allocs(results)
            ProfileCanvas.html_file("allocs.html", allocs)
        end
    end
end
5 changes: 1 addition & 4 deletions test/writers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ include("TestTools.jl")

# The temporary directory where we write the files cannot be in /tmp: it has to
# be on disk, so it is created inside the current working directory. A unique
# name from `mktempdir` replaces the fixed "netcdf_writer_performance_test"
# directory, which would otherwise be created and left behind unused.
output_dir = mktempdir(".")

@testset "DictWriter" begin
writer = Writers.DictWriter()
Expand Down Expand Up @@ -191,5 +190,3 @@ end
show(stdout, MIME"text/plain"(), timing_ncdataset)
println()
end

Base.rm(output_dir, force = true, recursive = true)
Loading