Skip to content

Commit

Permalink
Merge branch '2-resilience-api-update' into 'master'
Browse files (browse the repository at this point in the history)
Resolve "resilience api update"

Closes Mantevo#2

See merge request kokkos-resilience/minimd!2
  • Loading branch information
nmm0 committed Aug 6, 2019
2 parents 05185b0 + 108a340 commit e9a3c40
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 41 deletions.
12 changes: 2 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ target_compile_definitions(minimd PRIVATE PREC_TIMER)

# Dependencies
if (MINIMD_USE_MPISTUB)
find_library( mpi_stub_ NAMES libmpi_stubs.a libmpi_stubs mpi_stubs HINTS ${PROJECT_SOURCE_DIR}/kokkos/MPI-Stubs )
find_library( mpi_stub_ NAMES libmpi_stubs.a libmpi_stubs mpi_stubs HINTS ${PROJECT_SOURCE_DIR}/kokkos/MPI-Stubs )
#add_library(mpi_stubs)
#target_link_libraries(minimd PRIVATE mpi_stubs)
target_link_libraries(minimd PRIVATE ${mpi_stub_})
Expand Down Expand Up @@ -54,9 +54,7 @@ if(MINIMD_AUTOMATIC_CHECKPOINT OR MINIMD_MANUAL_CHECKPOINT OR MINIMD_RESILIENT_E
endif()
endif()

TARGET_LINK_KOKKOS(minimd PRIVATE)

target_link_libraries(minimd PRIVATE "-L${resilience_LINK_DIRECTORIES}")
target_link_kokkos(minimd PRIVATE)

# VeloC config
add_custom_command(TARGET minimd PRE_BUILD
Expand All @@ -73,9 +71,3 @@ add_custom_command(TARGET minimd PRE_BUILD
add_subdirectory(kokkos)


##get_cmake_property(_variableNames VARIABLES)
##list (SORT _variableNames)
##foreach (_variableName ${_variableNames})
## message(STATUS "${_variableName}=${${_variableName}}")
##endforeach()

24 changes: 12 additions & 12 deletions kokkos/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
target_sources(minimd PRIVATE
${PROJECT_SOURCE_DIR}/kokkos/atom.cpp
${PROJECT_SOURCE_DIR}/kokkos/comm.cpp
${PROJECT_SOURCE_DIR}/kokkos/force_eam.cpp
${PROJECT_SOURCE_DIR}/kokkos/force_lj.cpp
${PROJECT_SOURCE_DIR}/kokkos/input.cpp
${PROJECT_SOURCE_DIR}/kokkos/integrate.cpp
${PROJECT_SOURCE_DIR}/kokkos/ljs.cpp
${PROJECT_SOURCE_DIR}/kokkos/neighbor.cpp
${PROJECT_SOURCE_DIR}/kokkos/output.cpp
${PROJECT_SOURCE_DIR}/kokkos/setup.cpp
${PROJECT_SOURCE_DIR}/kokkos/thermo.cpp
${PROJECT_SOURCE_DIR}/kokkos/timer.cpp
${CMAKE_CURRENT_LIST_DIR}/atom.cpp
${CMAKE_CURRENT_LIST_DIR}/comm.cpp
${CMAKE_CURRENT_LIST_DIR}/force_eam.cpp
${CMAKE_CURRENT_LIST_DIR}/force_lj.cpp
${CMAKE_CURRENT_LIST_DIR}/input.cpp
${CMAKE_CURRENT_LIST_DIR}/integrate.cpp
${CMAKE_CURRENT_LIST_DIR}/ljs.cpp
${CMAKE_CURRENT_LIST_DIR}/neighbor.cpp
${CMAKE_CURRENT_LIST_DIR}/output.cpp
${CMAKE_CURRENT_LIST_DIR}/setup.cpp
${CMAKE_CURRENT_LIST_DIR}/thermo.cpp
${CMAKE_CURRENT_LIST_DIR}/timer.cpp
)
59 changes: 43 additions & 16 deletions kokkos/integrate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,27 @@
#include "math.h"
#include <cstdlib>

#ifdef KOKKOS_ENABLE_HDF5
#define CHECKPOINT_FILESPACE Kokkos::Experimental::HDF5Space
#else
#define CHECKPOINT_FILESPACE Kokkos::Experimental::StdFileSpace
#ifdef MINIMD_RESILIENCE
#include <resilience/Resilience.hpp>
#endif

#ifdef MINIMD_RESILIENCE
#include <Kokkos_Resilience.hpp>
#ifdef KOKKOS_ENABLE_MANUAL_CHECKPOINT
#include <mpi.h>
#ifdef KOKKOS_ENABLE_HDF5
#define CHECKPOINT_FILESPACE KokkosResilience::HDF5Space
#ifdef KOKKOS_ENABLE_HDF5_PARALLEL
bool serial_io = false;
#else
bool serial_io = true;
#endif
#else
bool serial_io = true;
#define CHECKPOINT_FILESPACE KokkosResilience::StdFileSpace
#endif
#endif

#ifdef KOKKOS_ENABLE_RESILIENT_EXECUTION
#define DEVICE_EXECUTION_SPACE Kokkos::ResCuda
#define DEVICE_EXECUTION_SPACE KokkosResilience::ResCuda
#else
#ifdef KOKKOS_ENABLE_CUDA
#define DEVICE_EXECUTION_SPACE Kokkos::Cuda
Expand Down Expand Up @@ -91,7 +100,8 @@ void Integrate::operator() (TagFinalIntegrate, const int& i) const {
}

void Integrate::run(Atom &atom, Force* force, Neighbor &neighbor,
Comm &comm, Thermo &thermo, Timer &timer, const int restart_)
Comm &comm, Thermo &thermo, Timer &timer,
const int restart_, std::string root_path)
{
int i, n;

Expand All @@ -107,20 +117,27 @@ void Integrate::run(Atom &atom, Force* force, Neighbor &neighbor,
int nStart = 0;

#ifdef KOKKOS_ENABLE_MANUAL_CHECKPOINT
Kokkos::Experimental::StdFileSpace sfs;
CHECKPOINT_FILESPACE sfs;
if (comm.me == 0) printf("manual checkpoint using cp mirror: %s \n", sfs.name());
auto x_cp = Kokkos::create_chkpt_mirror( sfs, atom.x );
auto v_cp = Kokkos::create_chkpt_mirror( sfs, atom.v );
auto f_cp = Kokkos::create_chkpt_mirror( sfs, atom.f );
nStart = restart_;

// Load from restart ...
if (nStart > 0) {
if (comm.nprocs > 1)
Kokkos::Experimental::DirectoryManager<CHECKPOINT_FILESPACE>::set_checkpoint_directory(comm.me == 0 ? true : false, "./data", (int)((nStart / 10) * 10));
std::string cp_path = root_path;
cp_path+=(std::string)"data";
if ( comm.nprocs > 1 &&
!std::is_same<CHECKPOINT_FILESPACE, KokkosResilience::StdFileSpace>::value &&
!serial_io )
KokkosResilience::DirectoryManager<CHECKPOINT_FILESPACE>::
set_checkpoint_directory(comm.me == 0 ? true : false, cp_path.c_str(), (int)((nStart / 10) * 10));
else
Kokkos::Experimental::DirectoryManager<CHECKPOINT_FILESPACE>::set_checkpoint_directory(comm.me == 0 ? true : false, "./data", (int)((nStart / 10) * 10), comm.me);
KokkosResilience::DirectoryManager<CHECKPOINT_FILESPACE>::
set_checkpoint_directory( true , cp_path.c_str(), (int)((nStart / 10) * 10), comm.me);
// need to resize the views to match the checkpoint files ...
Kokkos::Experimental::StdFileSpace::restore_all_views();
CHECKPOINT_FILESPACE::restore_all_views();
}
#endif

Expand Down Expand Up @@ -241,12 +258,22 @@ void Integrate::run(Atom &atom, Force* force, Neighbor &neighbor,
if(thermo.nstat) thermo.compute(n + 1, atom, neighbor, force, timer, comm);
#ifdef KOKKOS_ENABLE_MANUAL_CHECKPOINT
if ( n % 10 == 0 ) {
std::string cp_path = root_path;
cp_path+=(std::string)"data";
Kokkos::fence();
if (comm.nprocs > 1)
Kokkos::Experimental::DirectoryManager<CHECKPOINT_FILESPACE>::set_checkpoint_directory(comm.me == 0 ? true : false, "./data", n);
if ( comm.nprocs > 1 &&
!std::is_same<CHECKPOINT_FILESPACE, KokkosResilience::StdFileSpace>::value &&
!serial_io )
KokkosResilience::DirectoryManager<CHECKPOINT_FILESPACE>::
set_checkpoint_directory(comm.me == 0 ? true : false, cp_path.c_str(), n);
else
Kokkos::Experimental::DirectoryManager<CHECKPOINT_FILESPACE>::set_checkpoint_directory(comm.me == 0 ? true : false, "./data", n, comm.me);
KokkosResilience::DirectoryManager<CHECKPOINT_FILESPACE>::
set_checkpoint_directory( true , cp_path.c_str(), n, comm.me);
CHECKPOINT_FILESPACE::checkpoint_views();
MPI_Barrier( MPI_COMM_WORLD );
//if (comm.me == 0) printf("checkpoint complete: %d \n", n);
} else {
//if (comm.me == 0) printf("compute only iteration: %d \n", n);
}
#endif
}
Expand Down
2 changes: 1 addition & 1 deletion kokkos/integrate.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,5 @@ class Integrate
void finalIntegrate();
KOKKOS_INLINE_FUNCTION
void operator() (TagFinalIntegrate, const int& i) const;
void run(Atom &, Force*, Neighbor &, Comm &, Thermo &, Timer &, const int);
void run(Atom &, Force*, Neighbor &, Comm &, Thermo &, Timer &, const int, std::string);
};
7 changes: 6 additions & 1 deletion kokkos/ljs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,17 @@ int main(int argc, char** argv)
int ntypes = 8;
int team_neigh = 0;
int restart = 0;
std::string root_path = "./";

for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-i") == 0) || (strcmp(argv[i], "--input_file") == 0)) {
input_file = argv[++i];
continue;
}
else if((strcmp(argv[i], "-cp") == 0) || (strcmp(argv[i], "--checkpoint_path") == 0)) {
root_path = argv[++i];
continue;
}
}

MPI_Init(&argc, &argv);
Expand Down Expand Up @@ -573,7 +578,7 @@ int main(int argc, char** argv)
}

timer.barrier_start(TIME_TOTAL);
integrate.run(atom, force, neighbor, comm, thermo, timer, restart);
integrate.run(atom, force, neighbor, comm, thermo, timer, restart, root_path);
timer.barrier_stop(TIME_TOTAL);

int natoms;
Expand Down
2 changes: 1 addition & 1 deletion kokkos/ljs.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ struct In {
};

#ifdef KOKKOS_ENABLE_AUTOMATIC_CHECKPOINT
#include <Kokkos_Resilience.hpp>
#include <resilience/Resilience.hpp>
#include <memory>
#ifdef KOKKOS_ENABLE_VELOC
extern std::unique_ptr< KokkosResilience::Context< KokkosResilience::VeloCCheckpointBackend > > resilience_context;
Expand Down

0 comments on commit e9a3c40

Please sign in to comment.