This repository has been archived by the owner on Mar 1, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 78
Assemble checkpoints
kpamnany edited this page Sep 2, 2020
·
1 revision
# assemble_checkpoints.jl
#
# Script to assemble ClimateMachine checkpoint files from multiple ranks
# into a single checkpoint file from which a single rank can restart.
#
# This is a temporary hack until ClimateMachine itself unifies checkpoint
# files across multiple ranks.
using JLD2
using Printf
# Directory (relative to the current working directory) that receives the
# assembled checkpoint file; `assemble` creates it with `mkpath` if needed.
const out_dir = "assembled"
"""
    get_checkpoint_filename(checkpoint_dir, exp_name, rank, num)

Return the path of the checkpoint file for experiment `exp_name`, MPI rank
`rank` and checkpoint number `num` inside `checkpoint_dir`.

The file name has the form `<exp_name>_checkpoint_mpirank<rank>_num<num>.jld2`,
with `rank` and `num` zero-padded to at least four digits.
"""
function get_checkpoint_filename(checkpoint_dir, exp_name, rank, num)
    # Build the bare file name first, then attach the directory.
    fname = @sprintf("%s_checkpoint_mpirank%04d_num%04d.jld2", exp_name, rank, num)
    return joinpath(checkpoint_dir, fname)
end
"""
    assemble(checkpoint_dir, exp_name, nranks, num)

Combine the checkpoint files written by ranks `0:(nranks - 1)` for checkpoint
number `num` of experiment `exp_name` into a single rank-0 checkpoint file
inside `out_dir`.

Every input file is read for the variables `h_Q`, `h_aux` and `t`. The `h_Q`
and `h_aux` arrays are concatenated along their third dimension in rank order;
`t` is taken from rank 0 and a warning is logged for any rank whose `t` differs.

# Arguments
- `checkpoint_dir`: directory containing the per-rank checkpoint files.
- `exp_name`: experiment name used as the checkpoint file-name prefix.
- `nranks`: number of ranks whose files are merged; must be at least 1.
- `num`: checkpoint number.

# Throws
- `ArgumentError` if `nranks < 1`, or if the assembled file would overwrite
  the rank-0 input file (i.e. `checkpoint_dir` resolves to `out_dir`).
"""
function assemble(checkpoint_dir, exp_name, nranks, num)
    nranks >= 1 ||
        throw(ArgumentError("nranks must be at least 1, got $nranks"))

    file = get_checkpoint_filename(checkpoint_dir, exp_name, 0, num)
    outfile = get_checkpoint_filename(out_dir, exp_name, 0, num)
    # The assembled file reuses the rank-0 file name, so reading the inputs
    # from the output directory would destroy the rank-0 input on write.
    if abspath(file) == abspath(outfile)
        throw(ArgumentError(
            "checkpoint_dir must not be the output directory \"$out_dir\": " *
            "the assembled file would overwrite $file",
        ))
    end

    @load file h_Q h_aux t
    println("Extracted checkpoint data for rank 0")
    fullQ = h_Q
    fullaux = h_aux
    fullt = t
    for r in 1:(nranks - 1)
        file = get_checkpoint_filename(checkpoint_dir, exp_name, r, num)
        @load file h_Q h_aux t
        println("Extracted checkpoint data for rank $r from $file")
        # Only rank 0's `t` is kept, so flag any rank whose `t` disagrees.
        if !isequal(t, fullt)
            @warn "Checkpoint time differs from rank 0; keeping rank 0's value" rank = r t rank0_t = fullt
        end
        fullQ = cat(fullQ, h_Q; dims = 3)
        fullaux = cat(fullaux, h_aux; dims = 3)
    end

    # `@save` stores each variable under its own name, so rebind the assembled
    # data to the names used in the input files before writing.
    h_Q = fullQ
    h_aux = fullaux
    t = fullt

    mkpath(out_dir)
    print("Writing to $outfile... ")
    @save outfile h_Q h_aux t
    println("done.")
end
# Script entry point: expects exactly four command-line arguments
# (<checkpoint_dir> <exp_name> <num_ranks> <checkpoint_number>).
if length(ARGS) != 4
    # Report misuse on stderr and exit with a non-zero status so that calling
    # shells and job scripts can detect the failure.
    println(
        stderr,
        """
        Usage:
        assemble_checkpoints.jl <checkpoint_dir> <exp_name> <num_ranks> <checkpoint_number>""",
    )
    exit(1)
end
# `parse(Int, ...)` throws if the rank count or checkpoint number is not an integer.
assemble(ARGS[1], ARGS[2], parse(Int, ARGS[3]), parse(Int, ARGS[4]))