Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Debug/ams def libs #60

Draft
wants to merge 23 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
847c9cc
Completely turnoff torch
koparasy Mar 8, 2024
96bf52d
Modifications on cmakes to align our configurations
koparasy Mar 8, 2024
bb2988b
Fixes to correctly find static hdf5
koparasy Mar 12, 2024
3a0fe69
Merge branch 'develop' into debug/ams_def_libs
rblake-llnl Mar 19, 2024
a3422bc
Checking in a version that links shared.
rblake-llnl Mar 20, 2024
765f6f5
Updating chunk to 256KB.
rblake-llnl Mar 20, 2024
53a3990
We need to separate public/private to link shared/static combos
rblake-llnl Mar 20, 2024
e1a2c39
Bugfix, this should have been interface.
rblake-llnl Mar 21, 2024
b71e967
Checking in CMake changes to get pytorch working.
rblake-llnl Mar 22, 2024
d783ae4
Fix bug of DeltaUQ when running on CUDA. Data are not copied correctl…
koparasy Mar 22, 2024
1f4b81a
Merge remote-tracking branch 'origin/uq/tests' into debug/ams_def_libs
rblake-llnl Mar 22, 2024
b83e364
Adding cuda device synchronizes frequently and peeking at cuda errors
koparasy Mar 14, 2024
8c51165
Fix Load Balancer to not call init on constructor and thus avoid Broa…
koparasy Mar 25, 2024
bd9ecb4
Merge remote-tracking branch 'origin/bugfix/load_balance' into debug/…
rblake-llnl Mar 25, 2024
67dbe8c
Guard destructor
koparasy Mar 25, 2024
1b2f6f2
Merge remote-tracking branch 'origin/bugfix/load_balance' into debug/…
rblake-llnl Mar 25, 2024
5580eea
Rename DBTypes to prefix with 'DB'
koparasy Mar 25, 2024
0acb0a3
Merge remote-tracking branch 'origin/features/rename_db_types' into d…
rblake-llnl Mar 25, 2024
8e8dc41
Add extensive prints of the device memory allocator
koparasy Mar 26, 2024
56ac1b2
Merge remote-tracking branch 'origin/debug/torch_memory_info' into de…
rblake-llnl Mar 26, 2024
292bc7a
Changes necessary to compile on CPU.
rblake-llnl Mar 26, 2024
4b6b362
Rest of the DDDB changes.
rblake-llnl Mar 26, 2024
2c0d1b7
Fixed CPU build fixes from torch_memory_info
rblake-llnl Mar 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 149 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,16 @@ option(WITH_TORCH_DEBUG "Compute RMSE of Surrogate Model and Physics Module"
# User-facing build switches. Every option() below keeps its original name,
# help text and default.
option(WITH_TESTS "Compile tests" OFF)
option(WITH_REDIS "Use REDIS as a database back end" OFF)
option(WITH_HDF5 "Use HDF5 as a database back end" OFF)
option(HDF5_USE_STATIC_LIBRARIES "Use static HDF5." OFF)
# Optional zlib library that is appended to the link line together with HDF5.
# The cache type must be one of BOOL, FILEPATH, PATH, STRING or INTERNAL;
# FILEPATH is the one describing a single file on disk ("FILETYPE" is not a
# CMake cache type).
set(HDF5_WITH_ZLIB "" CACHE FILEPATH "Use the following zlib for HDF5")
option(WITH_RMQ "Use RabbitMQ as a database back end (require a reachable and running RabbitMQ server service)" OFF)
option(WITH_AMS_DEBUG "Enable verbose messages" OFF)
option(WITH_PERFFLOWASPECT "Use PerfFlowAspect for Profiling" OFF)
option(WITH_WORKFLOW "Install python drivers used by the outer workflow" OFF)
option(WITH_AMS_LIB "Install C++ library to support scientific applications" ON)
option(WITH_ADIAK "Use Adiak for recording metadata" OFF)
option(BUILD_SHARED_LIBS "Build using shared libraries" ON)
option(EXCLUDE_STATIC_LIBS "Exclude static libs from the linking line" OFF)

if (WITH_MPI)
# SET(CMAKE_CXX_COMPILER "${MPI_CXX_COMPILER}" CACHE FILEPATH "CXX compiler overridden with MPI C++ wrapper")
Expand All @@ -69,6 +72,7 @@ if (WITH_CUDA)
if (BUILD_SHARED_LIBS)
set(CUDA_RUNTIME_LIBRARY "Shared")
else()
set(HDF5_USE_STATIC_LIBRARIES ON)
set(CUDA_RUNTIME_LIBRARY "Static")
endif()

Expand All @@ -91,7 +95,7 @@ if (WITH_CALIPER)
endif()

if (WITH_AMS_DEBUG)
list(APPEND AMS_APP_DEFINES "-DLIBAMS_VERBOSE")
list(APPEND AMS_APP_DEFINES "-DAMS_DEBUG")
endif()

# ------------------------------------------------------------------------------
Expand Down Expand Up @@ -132,14 +136,17 @@ endif() # WITH_REDIS

if (WITH_HDF5)
if (HDF5_USE_STATIC_LIBRARIES)
find_package(HDF5 NAMES hdf5 COMPONENTS C static NO_DEFAULT_PATH PATHS ${AMS_HDF5_DIR} ${AMS_HDF5_DIR}/share/cmake)
list(APPEND AMS_APP_LIBRARIES ${HDF5_C_STATIC_LIBRARY})
message(STATUS "HDF5 Static Library : ${HDF5_C_STATIC_LIBRARY}")
else()
find_package(HDF5 NAMES hdf5 COMPONENTS C shared NO_DEFAULT_PATH PATHS ${AMS_HDF5_DIR} ${AMS_HDF5_DIR}/share/cmake)
list(APPEND AMS_APP_LIBRARIES ${HDF5_C_SHARED_LIBRARY})
message(STATUS "HDF5 Shared Library : ${HDF5_C_SHARED_LIBRARY}")
endif()
find_package(HDF5 NAMES hdf5 COMPONENTS C static NO_DEFAULT_PATH PATHS ${AMS_HDF5_DIR} ${AMS_HDF5_DIR}/share/cmake)
list(APPEND AMS_APP_LIBRARIES ${HDF5_C_STATIC_LIBRARY})
message(STATUS "HDF5 Static Library : ${HDF5_C_STATIC_LIBRARY}")
else()
find_package(HDF5 NAMES hdf5 COMPONENTS C shared NO_DEFAULT_PATH PATHS ${AMS_HDF5_DIR} ${AMS_HDF5_DIR}/share/cmake)
list(APPEND AMS_APP_LIBRARIES ${HDF5_C_SHARED_LIBRARY})
message(STATUS "HDF5 Shared Library : ${HDF5_C_SHARED_LIBRARY}")
endif()
if (NOT HDF5_WITH_ZLIB STREQUAL "")
list(APPEND AMS_APP_LIBRARIES ${HDF5_WITH_ZLIB})
endif()
list(APPEND AMS_APP_INCLUDES ${HDF5_INCLUDE_DIR})
list(APPEND AMS_APP_DEFINES "-D__ENABLE_HDF5__")
message(STATUS "HDF5 Include directories: ${HDF5_INCLUDE_DIR}")
Expand Down Expand Up @@ -191,16 +198,50 @@ if (WITH_TORCH)
find_package(Torch REQUIRED)
# This is annoying, torch populates all my cuda flags
# and resets them
set(CMAKE_CUDA_FLAGS "")
# set(CMAKE_CUDA_FLAGS "")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't remember why I did this in the first place. But did you maybe investigate the implications of commenting this out?

set(CMAKE_CUDA_ARCHITECTURES ON)

list(APPEND AMS_APP_INCLUDES "${TORCH_INCLUDE_DIRS}")
list(APPEND AMS_APP_LIBRARIES "${TORCH_LIBRARIES}")

#get_target_property(torch_interface_system_includes
# torch INTERFACE_SYSTEM_INCLUDE_DIRECTORIES)
#if ( torch_interface_system_includes )
# list(APPEND AMS_APP_INCLUDES ${torch_interface_system_includes})
#endif()
#
#get_target_property(torch_interface_includes
# torch INTERFACE_INCLUDE_DIRECTORIES)
#if ( torch_interface_includes )
# list(APPEND AMS_APP_INCLUDES ${torch_interface_includes})
#endif()
#
#get_target_property(torch_interface_defines
# torch INTERFACE_COMPILE_DEFINITIONS)
#if ( torch_interface_defines )
# list(APPEND AMS_APP_DEFINES ${torch_interface_defines})
#endif()
#
#get_target_property(torch_interface_compile_options
# torch INTERFACE_COMPILE_OPTIONS)
#if ( torch_interface_compile_options )
# list(APPEND AMS_APP_DEFINES ${torch_interface_compile_options})
#endif()
#
#get_target_property(_interface_link_directories
# ${arg_FROM} INTERFACE_LINK_DIRECTORIES)
#if ( _interface_link_directories )
# target_link_directories( ${arg_TO} ${_scope} ${_interface_link_directories})
#endif()
#
#get_target_property(torch_interface_link_libraries
# torch INTERFACE_LINK_LIBRARIES)
#if ( torch_interface_link_libraries )
# list(APPEND AMS_APP_LIBRARIES "${torch_interface_link_libraries}")
#endif()

list(APPEND AMS_TORCH_LIBRARY torch)
list(APPEND AMS_APP_DEFINES "-D__ENABLE_TORCH__")
set(BLA_VENDER OpenBLAS)
find_package(BLAS REQUIRED)
list(APPEND AMS_APP_LIBRARIES "${BLAS_LIBRARIES}")
#set(BLA_VENDOR OpenBLAS)
#find_package(BLAS REQUIRED)
#list(APPEND AMS_APP_LIBRARIES "${BLAS_LIBRARIES}")
endif()

# ------------------------------------------------------------------------------
Expand Down Expand Up @@ -253,6 +294,98 @@ if (WITH_PERFFLOWASPECT)
endif()


# inherit_target_nostatic(TO <target> FROM <target> [OBJECT <value>])
#
# Copies usage requirements of FROM onto TO as INTERFACE properties:
#   - system include directories
#   - include directories
#   - compile definitions
#   - link options (only when running CMake >= 3.13)
#   - compile options
#
# Link directories and link libraries are intentionally NOT copied, so TO
# carries the compile-time requirements of FROM without adding FROM's link
# items. OBJECT is still accepted so existing call sites keep parsing, but it
# currently has no effect because it only ever gated the link-item
# propagation that is disabled.
#
# Being a macro, the helper variables (arg_*, _scope, _interface_*) are set in
# the caller's scope.
macro(inherit_target_nostatic)
  set(options)
  set(singleValueArgs TO FROM OBJECT)
  set(multiValueArgs)

  # Parse the arguments
  cmake_parse_arguments(arg "${options}" "${singleValueArgs}"
                        "${multiValueArgs}" ${ARGN})

  # Both endpoints are mandatory; report the error under this macro's name.
  if (NOT DEFINED arg_TO)
    message(FATAL_ERROR "Must provide a TO argument to the 'inherit_target_nostatic' macro")
  endif()

  if (NOT DEFINED arg_FROM)
    message(FATAL_ERROR "Must provide a FROM argument to the 'inherit_target_nostatic' macro")
  endif()

  set(_scope INTERFACE)

  # get_target_property() yields "<var>-NOTFOUND" for an unset property, which
  # evaluates to false in the if() checks below.
  get_target_property(_interface_system_includes
                      ${arg_FROM} INTERFACE_SYSTEM_INCLUDE_DIRECTORIES)
  if (_interface_system_includes)
    target_include_directories(${arg_TO} SYSTEM ${_scope} ${_interface_system_includes})
  endif()

  get_target_property(_interface_includes
                      ${arg_FROM} INTERFACE_INCLUDE_DIRECTORIES)
  if (_interface_includes)
    target_include_directories(${arg_TO} ${_scope} ${_interface_includes})
  endif()

  get_target_property(_interface_defines
                      ${arg_FROM} INTERFACE_COMPILE_DEFINITIONS)
  if (_interface_defines)
    target_compile_definitions(${arg_TO} ${_scope} ${_interface_defines})
  endif()

  # target_link_options() only exists from CMake 3.13 onwards.
  if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.13.0")
    get_target_property(_interface_link_options
                        ${arg_FROM} INTERFACE_LINK_OPTIONS)
    if (_interface_link_options)
      target_link_options(${arg_TO} ${_scope} ${_interface_link_options})
    endif()
  endif()

  get_target_property(_interface_compile_options
                      ${arg_FROM} INTERFACE_COMPILE_OPTIONS)
  if (_interface_compile_options)
    target_compile_options(${arg_TO} ${_scope} ${_interface_compile_options})
  endif()

  # INTERFACE_LINK_DIRECTORIES and INTERFACE_LINK_LIBRARIES are deliberately
  # not inherited (see the header comment).
endmacro(inherit_target_nostatic)


# When EXCLUDE_STATIC_LIBS is ON, split AMS_APP_LIBRARIES into two lists:
#   NONSTATIC_AMS_APP_INTERFACE_LIBRARIES - every entry, unchanged.
#   NONSTATIC_AMS_APP_PRIVATE_LIBRARIES   - the same entries with static
#       libraries filtered out: a STATIC_LIBRARY target is replaced by a
#       "<name>::nostatic" INTERFACE IMPORTED target that only carries its
#       usage requirements, and a plain path ending in ".a" is dropped.
if (EXCLUDE_STATIC_LIBS)
  set(NONSTATIC_AMS_APP_INTERFACE_LIBRARIES "")
  set(NONSTATIC_AMS_APP_PRIVATE_LIBRARIES "")
  foreach (THIS_LIB ${AMS_APP_LIBRARIES})
    list(APPEND NONSTATIC_AMS_APP_INTERFACE_LIBRARIES ${THIS_LIB})
    if (TARGET ${THIS_LIB})
      get_target_property(target_type ${THIS_LIB} TYPE)
      if (NOT target_type STREQUAL STATIC_LIBRARY)
        list(APPEND NONSTATIC_AMS_APP_PRIVATE_LIBRARIES ${THIS_LIB})
      else()
        # The same library may be listed more than once; creating the proxy
        # target a second time would be a hard configure error.
        if (NOT TARGET "${THIS_LIB}::nostatic")
          add_library("${THIS_LIB}::nostatic" INTERFACE IMPORTED)
          inherit_target_nostatic(TO "${THIS_LIB}::nostatic" FROM ${THIS_LIB})
        endif()
        list(APPEND NONSTATIC_AMS_APP_PRIVATE_LIBRARIES "${THIS_LIB}::nostatic")
      endif()
    elseif (NOT "${THIS_LIB}" MATCHES "\\.a$")
      # Match on the trailing ".a" rather than get_filename_component(... EXT):
      # EXT returns the longest extension, so "libfoo-1.2.a" would yield
      # ".2.a" and the archive would slip through the filter.
      list(APPEND NONSTATIC_AMS_APP_PRIVATE_LIBRARIES ${THIS_LIB})
    endif()
  endforeach()
endif()

Comment on lines +297 to +388
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add an ams_macros.cmake file under the cmake directory and move these macros there? Not for now. Just asking.

add_subdirectory(src)

# ------------------------------------------------------------------------------
Expand Down
8 changes: 4 additions & 4 deletions examples/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,13 @@ int run(const char *device_name,
CALIPER(CALI_MARK_BEGIN("Setup");)

const bool use_device = std::strcmp(device_name, "cpu") != 0;
AMSDBType dbType = AMSDBType::None;
AMSDBType dbType = AMSDBType::DDDBNone;
if (std::strcmp(db_type, "csv") == 0) {
dbType = AMSDBType::CSV;
dbType = AMSDBType::DBCSV;
} else if (std::strcmp(db_type, "hdf5") == 0) {
dbType = AMSDBType::HDF5;
dbType = AMSDBType::DBHDF5;
} else if (std::strcmp(db_type, "rmq") == 0) {
dbType = AMSDBType::RMQ;
dbType = AMSDBType::DBRMQ;
}

AMSUQPolicy uq_policy;
Expand Down
10 changes: 8 additions & 2 deletions src/AMSlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,13 @@ target_include_directories(AMS PUBLIC
$<INSTALL_INTERFACE:include/>)
target_include_directories(AMS PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_directories(AMS PUBLIC ${AMS_APP_LIB_DIRS})
target_link_libraries(AMS PUBLIC ${AMS_APP_LIBRARIES} stdc++fs)
# Intel runtime library linked into AMS. The default is the site-specific path
# this build has been using; define AMS_INTEL_RUNTIME_LIBRARY before this
# point (e.g. -DAMS_INTEL_RUNTIME_LIBRARY=<path>) to build elsewhere.
if (NOT DEFINED AMS_INTEL_RUNTIME_LIBRARY)
  set(AMS_INTEL_RUNTIME_LIBRARY
      /usr/tce/packages/intel/intel-2022.1.0/compiler/2022.1.0/linux/compiler/lib/intel64_lin/libintlc.so)
endif()

# System/runtime libraries appended after the application libraries on every
# link line below; kept in one place so the three uses cannot drift apart.
set(AMS_RUNTIME_LINK_LIBRARIES stdc++fs stdc++ ${AMS_INTEL_RUNTIME_LIBRARY} m)

if (EXCLUDE_STATIC_LIBS)
  # AMS itself links only the non-static subset (plus torch), while consumers
  # of AMS receive the complete list through the INTERFACE scope.
  target_link_libraries(AMS PRIVATE ${NONSTATIC_AMS_APP_PRIVATE_LIBRARIES} ${AMS_TORCH_LIBRARY} ${AMS_RUNTIME_LINK_LIBRARIES})
  target_link_libraries(AMS INTERFACE ${NONSTATIC_AMS_APP_INTERFACE_LIBRARIES} ${AMS_RUNTIME_LINK_LIBRARIES})
  # NOTE(review): this silences ALL unresolved symbols when linking AMS, not
  # only those coming from the excluded static libraries - presumably they are
  # expected to be resolved by the final application link; confirm.
  target_link_options(AMS PRIVATE "-Wl,--unresolved-symbols=ignore-all")
else()
  target_link_libraries(AMS PUBLIC ${AMS_APP_LIBRARIES} ${AMS_RUNTIME_LINK_LIBRARIES})
endif()

#-------------------------------------------------------------------------------
# create the configuration header file with the respective information
Expand Down Expand Up @@ -75,7 +81,7 @@ install(TARGETS AMS
DESTINATION lib)

install(EXPORT AMSTargets
FILE AMS.cmake
FILE AMSConfig.cmake
DESTINATION lib/cmake/AMS)

install(FILES ${PROJECT_BINARY_DIR}/include/AMS.h DESTINATION include)
Expand Down
2 changes: 1 addition & 1 deletion src/AMSlib/include/AMS.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ typedef enum {

typedef enum { UBALANCED = 0, BALANCED } AMSExecPolicy;

typedef enum { None = 0, CSV, REDIS, HDF5, RMQ } AMSDBType;
// Database back-end selector. Every enumerator carries the `DB` prefix;
// `DBNone` is the consistently named "no database" value and `DDDBNone` is
// kept as an alias with the same value so existing callers keep compiling.
typedef enum { DBNone = 0, DDDBNone = DBNone, DBCSV, DBREDIS, DBHDF5, DBRMQ } AMSDBType;

// TODO: create a cleaner interface that separates UQ type (FAISS, DeltaUQ) with policy (max, mean).
enum struct AMSUQPolicy {
Expand Down
2 changes: 2 additions & 0 deletions src/AMSlib/ml/surrogate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,8 @@ class SurrogateModel
else
_load<TypeInValue>(new_path, "cuda");
}

AMSResourceType getModelResource() const { return model_resource; }
};

template <typename T>
Expand Down
36 changes: 31 additions & 5 deletions src/AMSlib/ml/uq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ class UQ

if (uqPolicy == AMSUQPolicy::RandomUQ)
randomUQ = std::make_unique<RandomUQ>(resourceLocation, threshold);

DBG(UQ, "UQ Model is of type %d", uqPolicy)
}

PERFFASPECT()
Expand All @@ -73,29 +75,48 @@ class UQ
{
if ((uqPolicy == AMSUQPolicy::DeltaUQ_Mean) ||
(uqPolicy == AMSUQPolicy::DeltaUQ_Max)) {

auto &rm = ams::ResourceManager::getInstance();

CALIPER(CALI_MARK_BEGIN("DELTAUQ");)
const size_t ndims = outputs.size();
std::vector<FPTypeValue *> outputs_stdev(ndims);
// TODO: Enable device-side allocation and predicate calculation.
auto &rm = ams::ResourceManager::getInstance();
for (int dim = 0; dim < ndims; ++dim)
outputs_stdev[dim] =
rm.allocate<FPTypeValue>(totalElements, AMSResourceType::HOST);

CALIPER(CALI_MARK_BEGIN("SURROGATE");)
DBG(Workflow,
"Model exists, I am calling DeltaUQ surrogate (for all data)");
DBG(UQ,
"Model exists, I am calling DeltaUQ surrogate [%ld %ld] -> (mu:[%ld "
"%ld], std:[%ld %ld])",
totalElements,
inputs.size(),
totalElements,
outputs.size(),
totalElements,
inputs.size());
surrogate->evaluate(totalElements, inputs, outputs, outputs_stdev);
CALIPER(CALI_MARK_END("SURROGATE");)

// FIXME: We do something sub-optimal. We copy all the data from the GPU
// to the CPU and then we compute the predicate. Then we copy back the computed
// predicate to the device. We should avoid this unecessary back and forth.
bool *predicate = p_ml_acceptable;
if (surrogate->getModelResource() == AMSResourceType::DEVICE) {
predicate = rm.allocate<bool>(totalElements, AMSResourceType::HOST);
rm.copy(p_ml_acceptable, predicate);
}


if (uqPolicy == AMSUQPolicy::DeltaUQ_Mean) {
for (size_t i = 0; i < totalElements; ++i) {
// Use double for increased precision, range in the calculation
double mean = 0.0;
for (size_t dim = 0; dim < ndims; ++dim)
mean += outputs_stdev[dim][i];
mean /= ndims;
p_ml_acceptable[i] = (mean < threshold);
predicate[i] = (mean < threshold);
}
} else if (uqPolicy == AMSUQPolicy::DeltaUQ_Max) {
for (size_t i = 0; i < totalElements; ++i) {
Expand All @@ -106,12 +127,17 @@ class UQ
break;
}

p_ml_acceptable[i] = is_acceptable;
predicate[i] = is_acceptable;
}
} else {
THROW(std::runtime_error, "Invalid UQ policy");
}

if (surrogate->getModelResource() == AMSResourceType::DEVICE) {
rm.copy(predicate, p_ml_acceptable);
rm.deallocate(predicate, AMSResourceType::HOST);
}

for (int dim = 0; dim < ndims; ++dim)
rm.deallocate(outputs_stdev[dim], AMSResourceType::HOST);
CALIPER(CALI_MARK_END("DELTAUQ");)
Expand Down
Loading