Skip to content
This repository has been archived by the owner on Mar 1, 2023. It is now read-only.

Assemble checkpoints

kpamnany edited this page Sep 2, 2020 · 1 revision
# assemble_checkpoints.jl
#
# Script to assemble ClimateMachine checkpoint files from multiple ranks
# into a single checkpoint file from which one rank can restart.
#
# This is a temporary workaround until ClimateMachine itself unifies
# checkpoint files across multiple ranks.

using JLD2
using Printf

# Directory that receives the assembled checkpoint file. It is a relative
# path, so it is resolved against the current working directory; `assemble`
# creates it with `mkpath` before writing.
const out_dir = "assembled"

"""
    get_checkpoint_filename(checkpoint_dir, exp_name, rank, num)

Return the path of a checkpoint file inside `checkpoint_dir`.

The file name has the form `<exp_name>_checkpoint_mpirank<rank>_num<num>.jld2`,
where `rank` and `num` are each formatted as integers zero-padded to at least
four digits.
"""
function get_checkpoint_filename(checkpoint_dir, exp_name, rank, num)
    fname = @sprintf("%s_checkpoint_mpirank%04d_num%04d.jld2", exp_name, rank, num)
    return joinpath(checkpoint_dir, fname)
end

"""
    assemble(checkpoint_dir, exp_name, nranks, num)

Merge the per-rank checkpoint files for checkpoint number `num` of experiment
`exp_name` into a single rank-0 checkpoint file written under `out_dir`.

For rank 0 and then ranks `1:(nranks - 1)`, the variables `h_Q`, `h_aux` and
`t` are loaded from the file named by `get_checkpoint_filename` in
`checkpoint_dir`. The `h_Q` arrays, and likewise the `h_aux` arrays, are
concatenated along dimension 3 in rank order. The saved `t` is the one read
from rank 0; a warning is logged for any other rank whose `t` differs, since
that indicates the files do not belong to the same checkpoint.

Any error raised while loading an input file (for example, a missing file)
propagates to the caller.
"""
function assemble(checkpoint_dir, exp_name, nranks, num)
    file = get_checkpoint_filename(checkpoint_dir, exp_name, 0, num)
    @load file h_Q h_aux t
    println("Extracted checkpoint data for rank 0")

    # Gather the per-rank pieces and concatenate once after the loop, rather
    # than re-copying the ever-growing array on every iteration. `Any[...]`
    # avoids imposing rank 0's array type on the other ranks' data.
    Q_parts = Any[h_Q]
    aux_parts = Any[h_aux]
    fullt = t
    for r in 1:(nranks - 1)
        file = get_checkpoint_filename(checkpoint_dir, exp_name, r, num)
        @load file h_Q h_aux t
        println("Extracted checkpoint data for rank $r from $file")
        # Only rank 0's time is saved; flag files that disagree instead of
        # silently discarding their time. `isequal` treats NaN as equal to NaN.
        if !isequal(t, fullt)
            @warn "Checkpoint time for rank $r ($t) differs from rank 0 ($fullt); using rank 0's time"
        end
        push!(Q_parts, h_Q)
        push!(aux_parts, h_aux)
    end

    # With a single rank there is nothing to concatenate; save the data as loaded.
    h_Q = length(Q_parts) == 1 ? Q_parts[1] : cat(Q_parts...; dims = 3)
    h_aux = length(aux_parts) == 1 ? aux_parts[1] : cat(aux_parts...; dims = 3)
    t = fullt

    mkpath(out_dir)
    file = get_checkpoint_filename(out_dir, exp_name, 0, num)
    print("Writing to $file... ")
    @save file h_Q h_aux t
    println("done.")
end

# Command-line entry point: expects exactly four arguments (checkpoint
# directory, experiment name, number of ranks, checkpoint number).
if length(ARGS) != 4
    # Usage goes to stderr so it is not mistaken for regular output.
    println(
        stderr,
        """
        Usage:
            assemble_checkpoints.jl <checkpoint_dir> <exp_name> <num_ranks> <checkpoint_number>""",
    )
    # Exit with a non-zero status so calling shells and batch schedulers can
    # detect the invocation error (a bare `exit()` would report success).
    exit(1)
end

# `parse(Int, ...)` throws an `ArgumentError` if either numeric argument is
# not a valid integer.
assemble(ARGS[1], ARGS[2], parse(Int, ARGS[3]), parse(Int, ARGS[4]))