Skip to content

Commit

Permalink
Add CI driver, slim down examples driver
Browse files Browse the repository at this point in the history
  • Loading branch information
nefrathenrici committed Feb 1, 2025
1 parent b6f9bae commit b487e54
Show file tree
Hide file tree
Showing 9 changed files with 315 additions and 331 deletions.
201 changes: 201 additions & 0 deletions .buildkite/ci_driver.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# When Julia 1.10+ is used interactively, stacktraces contain reduced type information to make them shorter.
# On the other hand, the full type information is printed when julia is not run interactively.
# Given that ClimaCore objects are heavily parametrized, non-abbreviated stacktraces are hard to read,
# so we force abbreviated stacktraces even in non-interactive runs.
# (See also Base.type_limited_string_from_context())
# NOTE(review): `Ref(false)` does not mean "disabled". As far as Base's implementation goes, it is
# the presence of a `Ref{Bool}` under this key that switches abbreviation on; the flag itself only
# records whether something was truncated. Confirm against the Julia version used in CI.
redirect_stderr(IOContext(stderr, :stacktrace_types_limited => Ref(false)))
import PrecompileCI
import ClimaComms
ClimaComms.@import_required_backends
import ClimaAtmos as CA
import Random
# Seed the default RNG with a fixed value before anything else runs.
Random.seed!(1234)

# Build the simulation and run it.
# `config` is only constructed from the command line when no such variable
# exists yet (presumably so another script can define it and then include
# this file -- confirm against callers).
if !@isdefined(config)
    (; config_file, job_id) = CA.commandline_kwargs()
    config = CA.AtmosConfig(config_file; job_id = job_id)
end
simulation = CA.get_simulation(config)
integrator = simulation.integrator
sol_res = CA.solve_atmos!(simulation)

# Keep the integrator cache and two of its fields available as globals.
p = integrator.p
atmos = p.atmos
params = p.params

import ClimaCore
import ClimaCore: Topologies, Quadratures, Spaces
import ClimaComms
using SciMLBase
using PrettyTables
using JLD2
using NCDatasets
using ClimaTimeSteppers
using Test
import Tar
import Base.Filesystem: rm
include(joinpath(pkgdir(CA), "post_processing", "ci_plots.jl"))

# Job id used further down to select the plots and the comparison directory:
# the configured "reference_job_id" when one is given, otherwise this
# simulation's own job id.
ref_job_id = config.parsed_args["reference_job_id"]
reference_job_id = if isnothing(ref_job_id)
    simulation.job_id
else
    ref_job_id
end

# Fail loudly when the solve reported a crash.
sol_res.ret_code == :simulation_crashed && error(
    "The ClimaAtmos simulation has crashed. See the stack trace for details.",
)

# Simulation did not crash
sol = sol_res.sol
walltime = sol_res.walltime

# The final-time check only runs while `integrator.tstops` is non-empty.
# NOTE(review): the original comment says a graceful exit does not reach
# t_end; presumably a graceful exit leaves `tstops` empty, which is why the
# assertion is skipped in that case -- confirm.
isempty(integrator.tstops) || @assert(last(sol.t) == simulation.t_end)
CA.verify_callbacks(sol.t)

# Scaling check: for distributed runs, the root process stores the process
# count, its number of columns and the walltime in a JLD2 file.
if CA.is_distributed(config.comms_ctx)
    comms_ctx = config.comms_ctx
    nprocs = ClimaComms.nprocs(comms_ctx)
    output_dir = simulation.output_dir
    if ClimaComms.iamroot(comms_ctx)
        # Column count = (local horizontal elements) * Nq * Nq, with every
        # quantity read from the center space of the first saved state.
        Y = sol.u[1]
        center_space = axes(Y.c)
        horz_space = Spaces.horizontal_space(center_space)
        horz_topology = Spaces.topology(horz_space)
        quadrature_style = Spaces.quadrature_style(horz_space)
        Nq = Quadratures.degrees_of_freedom(quadrature_style)
        nlocalelems = Topologies.nlocalelems(horz_topology)
        ncols_per_process = nlocalelems * Nq * Nq

        scaling_file = joinpath(
            output_dir,
            "scaling_data_$(nprocs)_processes.jld2",
        )
        @info(
            "Writing scaling data",
            "walltime (seconds)" = walltime,
            scaling_file
        )
        JLD2.jldsave(scaling_file; nprocs, ncols_per_process, walltime)
    end
end

# Check if selected output has changed from the previous recorded output (bit-wise comparison)
#
# The reproducibility scripts are located through `pkgdir(CA)`, exactly like the
# other includes in this driver. A path relative to `@__DIR__` with two ".."
# components only worked while the driver lived two directories below the
# repository root; from `.buildkite/` it resolves outside the repository.
include(
    joinpath(
        pkgdir(CA),
        "reproducibility_tests",
        "reproducibility_test_job_ids.jl",
    ),
)
if config.parsed_args["reproducibility_test"]
    # Test results against main branch
    include(
        joinpath(
            pkgdir(CA),
            "reproducibility_tests",
            "reproducibility_tools.jl",
        ),
    )
    # Export the final state of this run so it can be compared later.
    export_reproducibility_results(
        sol.u[end],
        config.comms_ctx;
        job_id = simulation.job_id,
        computed_dir = simulation.output_dir,
    )
end

# Log the expected and the measured callback call counts that ClimaAtmos
# reports for this integrator, so both numbers appear side by side in the CI log.
@info "Callback verification, n_expected_calls: $(CA.n_expected_calls(integrator))"
@info "Callback verification, n_measured_calls: $(CA.n_measured_calls(integrator))"

# Write diagnostics that are in DictWriter to text files
CA.write_diagnostics_as_txt(simulation)

# Conservation checks
# When "check_conservation" is enabled, the relative net changes returned by
# `CA.check_conservation` must be zero up to 100 machine epsilons of the
# floating-point type of the density field.
if config.parsed_args["check_conservation"]
    FT = Spaces.undertype(axes(sol.u[end].c.ρ))
    @info "Checking conservation"
    (; energy_conservation, mass_conservation, water_conservation) =
        CA.check_conservation(sol)

    @info " Net energy change / total energy: $energy_conservation"
    @info " Net mass change / total mass: $mass_conservation"
    @info " Net water change / total water: $water_conservation"

    # `≈` with an explicit `atol`: comparing against exactly 0 makes the
    # default relative tolerance meaningless, so only the absolute one counts.
    @test energy_conservation ≈ 0 atol = 100 * eps(FT)
    @test mass_conservation ≈ 0 atol = 100 * eps(FT)
    @test water_conservation ≈ 0 atol = 100 * eps(FT)
end

# Visualize the solution, then pack the NetCDF and HDF5 output into tarballs.
# Everything in this block runs on the root process only.
if ClimaComms.iamroot(config.comms_ctx)
    include(
        joinpath(
            pkgdir(CA),
            "reproducibility_tests",
            "reproducibility_utils.jl",
        ),
    )
    @info "Plotting"
    paths = latest_comparable_dirs() # __build__ path (not job path)
    if isempty(paths)
        # Nothing to compare against: plot this run on its own.
        make_plots(Val(Symbol(reference_job_id)), simulation.output_dir)
    else
        main_job_path = joinpath(first(paths), reference_job_id)
        nc_dir = joinpath(main_job_path, "nc_files")
        if ispath(nc_dir)
            @info "nc_dir exists"
        else
            mkpath(nc_dir)
            @info "Comparing against $(readdir(nc_dir))"
        end
        # Try to extract nc files from tarball:
        if isempty(readdir(nc_dir))
            nc_tarball = joinpath(main_job_path, "nc_files.tar")
            if isfile(nc_tarball)
                Tar.extract(nc_tarball, nc_dir)
            else
                @warn "No nc_files found"
            end
        else
            @info "Files already extracted"
        end

        # Plot against the reference files only when some are available.
        paths = if isempty(readdir(nc_dir))
            simulation.output_dir
        else
            [simulation.output_dir, nc_dir]
        end
        make_plots(Val(Symbol(reference_job_id)), paths)
    end
    @info "Plotting done"

    # Return the link target (joined onto the link's parent directory) when
    # `path` is a symlink, and `path` itself otherwise.
    symlink_to_fullpath(path) =
        islink(path) ? joinpath(dirname(path), readlink(path)) : path

    @info "Creating tarballs"
    # These NC files are used by our reproducibility tests,
    # and need to be found later when comparing against the
    # main branch. If "nc_files.tar" is renamed, then please
    # search for "nc_files.tar" globally and rename it in the
    # reproducibility test folder.
    Tar.create(
        f -> endswith(f, ".nc"),
        symlink_to_fullpath(simulation.output_dir),
        joinpath(simulation.output_dir, "nc_files.tar"),
    )
    Tar.create(
        f -> endswith(f, r"\.(hdf5|h5)"),
        symlink_to_fullpath(simulation.output_dir),
        joinpath(simulation.output_dir, "hdf5_files.tar"),
    )

    # Delete only what was archived above. The patterns are anchored on the
    # extension dot so that a file whose name merely ends in "nc", "hdf5" or
    # "h5" is neither archived by accident nor removed without an archive.
    foreach(readdir(simulation.output_dir)) do f
        endswith(f, r"\.(nc|hdf5|h5)") &&
            rm(joinpath(simulation.output_dir, f))
    end
    @info "Tarballs created"
end
4 changes: 2 additions & 2 deletions .buildkite/comparison/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ for nprocs in ${process_counts[@]}; do
job_id="comparison_sphere_held_suarez_${res}_res_rhoe_$nprocs"

if [[ "$res" == "low" ]]; then
command="julia --color=yes --project=examples examples/hybrid/driver.jl --job_id $job_id --forcing held_suarez --FLOAT_TYPE $FT --tracer_upwinding none --t_end 10days --dt 400secs --z_elem 10 --h_elem 4"
command="julia --color=yes --project=examples .buildkite/ci_driver.jl --job_id $job_id --forcing held_suarez --FLOAT_TYPE $FT --tracer_upwinding none --t_end 10days --dt 400secs --z_elem 10 --h_elem 4"
else
command="julia --color=yes --project=examples examples/hybrid/driver.jl --job_id $job_id --forcing held_suarez --FLOAT_TYPE $FT --tracer_upwinding none --t_end 1days --dt 50secs --z_elem 45 --h_elem 24"
command="julia --color=yes --project=examples .buildkite/ci_driver.jl --job_id $job_id --forcing held_suarez --FLOAT_TYPE $FT --tracer_upwinding none --t_end 1days --dt 50secs --z_elem 45 --h_elem 24"
fi

if [[ "$profiling" == "enable" ]]; then
Expand Down
26 changes: 13 additions & 13 deletions .buildkite/gpu_pipeline/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ steps:
- mkdir -p target_gpu_implicit_baroclinic_wave
- >
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=target_gpu_implicit_baroclinic_wave/output_active/report
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml
--job_id target_gpu_implicit_baroclinic_wave
artifact_paths: "target_gpu_implicit_baroclinic_wave/output_active/*"
Expand All @@ -63,7 +63,7 @@ steps:
- mkdir -p gpu_hs_rhoe_equil_55km_nz63_0M
- >
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_hs_rhoe_equil_55km_nz63_0M/output_active/report
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equil_0M.yml
--job_id gpu_hs_rhoe_equil_55km_nz63_0M
artifact_paths: "gpu_hs_rhoe_equil_55km_nz63_0M/output_active/*"
Expand All @@ -81,7 +81,7 @@ steps:
- >
srun --cpu-bind=threads --cpus-per-task=4
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_hs_rhoe_equil_55km_nz63_0M_4process/output_active/report-%q{PMI_RANK}
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equil_0M.yml
--job_id gpu_hs_rhoe_equil_55km_nz63_0M_4process
artifact_paths: "gpu_hs_rhoe_equil_55km_nz63_0M_4process/output_active/*"
Expand All @@ -101,7 +101,7 @@ steps:
- >
srun --cpu-bind=threads --cpus-per-task=4
nsys profile --delay 100 --trace=osrt,nvtx,cuda,mpi,ucx --output=target_gpu_implicit_baroclinic_wave_4process/output_active/report-%q{PMI_RANK}
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml
--job_id target_gpu_implicit_baroclinic_wave_4process
artifact_paths: "target_gpu_implicit_baroclinic_wave_4process/output_active/*"
Expand All @@ -122,7 +122,7 @@ steps:
- mkdir -p gpu_aquaplanet_dyamond_diag_1process
- >
srun --cpu-bind=threads --cpus-per-task=4
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond_diag_1process/output_active/report julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond_diag_1process/output_active/report julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_diag_1process.yml
--job_id gpu_aquaplanet_dyamond_diag_1process
artifact_paths: "gpu_aquaplanet_dyamond_diag_1process/output_active/*"
Expand All @@ -143,7 +143,7 @@ steps:
- >
srun --cpu-bind=threads --cpus-per-task=4
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond_ss_1process/output_active/report
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_ss.yml
--job_id gpu_aquaplanet_dyamond_ss_1process
artifact_paths: "gpu_aquaplanet_dyamond_ss_1process/output_active/*"
Expand All @@ -163,7 +163,7 @@ steps:
- mkdir -p gpu_aquaplanet_dyamond_ss_2process
- >
srun --cpu-bind=threads --cpus-per-task=4
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_ss.yml
--job_id gpu_aquaplanet_dyamond_ss_2process
artifact_paths: "gpu_aquaplanet_dyamond_ss_2process/output_active/*"
Expand All @@ -183,7 +183,7 @@ steps:
- mkdir -p gpu_aquaplanet_dyamond_ss_4process
- >
srun --cpu-bind=threads --cpus-per-task=4
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_ss.yml
--job_id gpu_aquaplanet_dyamond_ss_4process
artifact_paths: "gpu_aquaplanet_dyamond_ss_4process/output_active/*"
Expand Down Expand Up @@ -221,7 +221,7 @@ steps:
- mkdir -p gpu_aquaplanet_dyamond_ws_1process
- >
srun --cpu-bind=threads --cpus-per-task=4
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_ws_1process.yml
--job_id gpu_aquaplanet_dyamond_ws_1process
artifact_paths: "gpu_aquaplanet_dyamond_ws_1process/output_active/*"
Expand All @@ -241,7 +241,7 @@ steps:
- mkdir -p gpu_aquaplanet_dyamond_ws_2process
- >
srun --cpu-bind=threads --cpus-per-task=4
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_ws_2process.yml
--job_id gpu_aquaplanet_dyamond_ws_2process
artifact_paths: "gpu_aquaplanet_dyamond_ws_2process/output_active/*"
Expand All @@ -261,7 +261,7 @@ steps:
- mkdir -p gpu_aquaplanet_dyamond_ws_4process
- >
srun --cpu-bind=threads --cpus-per-task=4
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_ws_4process.yml
--job_id gpu_aquaplanet_dyamond_ws_4process
artifact_paths: "gpu_aquaplanet_dyamond_ws_4process/output_active/*"
Expand Down Expand Up @@ -302,7 +302,7 @@ steps:
- mkdir -p gpu_aquaplanet_diagedmf
- >
nsys profile --delay 200 --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_diagedmf/output_active/report
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${MODEL_CONFIG_PATH}aquaplanet_diagedmf.yml
--job_id gpu_aquaplanet_diagedmf
artifact_paths: "gpu_aquaplanet_diagedmf/output_active/*"
Expand Down Expand Up @@ -336,7 +336,7 @@ steps:
- mkdir -p gpu_aquaplanet_progedmf
- >
nsys profile --delay 100 --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_progedmf/output_active/report
julia --threads=3 --color=yes --project=.buildkite examples/hybrid/driver.jl
julia --threads=3 --color=yes --project=.buildkite .buildkite/ci_driver.jl
--config_file ${MODEL_CONFIG_PATH}aquaplanet_progedmf.yml
--job_id gpu_aquaplanet_progedmf
artifact_paths: "gpu_aquaplanet_progedmf/output_active/*"
Expand Down
Loading

0 comments on commit b487e54

Please sign in to comment.