diff --git a/examples/transpose/CMakeLists.txt b/examples/transpose/CMakeLists.txt index 828e0be93..5ab93b982 100644 --- a/examples/transpose/CMakeLists.txt +++ b/examples/transpose/CMakeLists.txt @@ -54,6 +54,10 @@ else() target_compile_options(transpose PRIVATE -W -Wall) endif() +if("${CMAKE_BUILD_TYPE}" MATCHES "Release") + target_compile_options(transpose PRIVATE -g1) +endif() + if(TRANSPOSE_USE_MPI) target_compile_definitions(transpose PRIVATE USE_MPI) target_link_libraries(transpose PRIVATE MPI::MPI_C) diff --git a/external/timemory b/external/timemory index 98e2306ca..a3f0a917a 160000 --- a/external/timemory +++ b/external/timemory @@ -1 +1 @@ -Subproject commit 98e2306ca9226226013335637ce6c33f72bf1e3a +Subproject commit a3f0a917abd4ff6ab2ff35c7b2ea9d02d6bebad2 diff --git a/source/bin/omnitrace/omnitrace.cpp b/source/bin/omnitrace/omnitrace.cpp index 0890fa309..c1443661e 100644 --- a/source/bin/omnitrace/omnitrace.cpp +++ b/source/bin/omnitrace/omnitrace.cpp @@ -64,6 +64,22 @@ # define OMNITRACE_USE_MPI_HEADERS 0 #endif +namespace +{ +auto +get_default_min_instructions() +{ + // default to 1024 + return tim::get_env("OMNITRACE_DEFAULT_MIN_INSTRUCTIONS", (1 << 10), false); +} +auto +get_default_min_address_range() +{ + // default to 4096 + return 4 * get_default_min_instructions(); +} +} // namespace + bool use_return_info = false; bool use_args_info = false; bool use_file_info = false; @@ -73,10 +89,10 @@ bool loop_level_instr = false; bool instr_dynamic_callsites = false; bool instr_traps = false; bool instr_loop_traps = false; -size_t min_address_range = (1 << 8); // 256 -size_t min_loop_address_range = (1 << 8); // 256 -size_t min_instructions = (1 << 6); // 64 -size_t min_loop_instructions = (1 << 6); // 64 +size_t min_address_range = get_default_min_address_range(); // 4096 +size_t min_loop_address_range = get_default_min_address_range(); // 4096 +size_t min_instructions = get_default_min_instructions(); // 1024 +size_t min_loop_instructions = get_default_min_instructions(); // 1024 bool werror = false; bool debug_print = false; bool instr_print = false; diff --git a/source/lib/common/setup.hpp b/source/lib/common/setup.hpp index f1f2848ce..241cc1f3c 100644 --- a/source/lib/common/setup.hpp +++ b/source/lib/common/setup.hpp @@ -132,6 +132,15 @@ dirname(const std::string& _fname) return _fname.substr(0, _fname.find_last_of('/')); return std::string{}; } + +inline bool +exists(const std::string& _fname) +{ + struct stat _buffer; + if(stat(_fname.c_str(), &_buffer) == 0) + return (S_ISREG(_buffer.st_mode) != 0 || S_ISLNK(_buffer.st_mode) != 0); + return false; +} } // namespace path inline void @@ -157,28 +166,55 @@ setup_environ(int _verbose, const std::string& _search_paths = {}, setenv("ROCPROFILER_LOG", "1", 0); setenv("ROCP_HSA_INTERCEPT", "1", 0); setenv("HSA_TOOLS_REPORT_LOAD_FAILURE", "1", 0); + + auto _possible_rocp_metrics = std::vector{}; + auto _possible_rocprof_libs = std::vector{}; for(const auto* itr : { "OMNITRACE_ROCM_PATH", "ROCM_PATH" }) { if(getenv(itr)) { - setenv("ROCP_METRICS", - common::join('/', getenv(itr), ROCPROFILER_METRICS_DIR, "metrics.xml") - .c_str(), - 0); - setenv("OMNITRACE_ROCPROFILER_LIBRARY", - common::join('/', getenv(itr), ROCPROFILER_METRICS_DIR, - "librocprofiler64.so") - .c_str(), - 0); - break; + _possible_rocp_metrics.emplace_back( + common::join('/', getenv(itr), "lib/rocprofiler", "metrics.xml")); + _possible_rocprof_libs.emplace_back( + common::join('/', getenv(itr), "lib/rocprofiler", "librocprofiler64.so")); + _possible_rocp_metrics.emplace_back( + common::join('/', getenv(itr), "rocprofiler/lib", "metrics.xml")); + _possible_rocprof_libs.emplace_back( + common::join('/', getenv(itr), "rocprofiler/lib", "librocprofiler64.so")); } } + // default path + _possible_rocp_metrics.emplace_back( + common::join('/', OMNITRACE_DEFAULT_ROCM_PATH, "lib/rocprofiler", "metrics.xml")); + _possible_rocp_metrics.emplace_back( + common::join('/', OMNITRACE_DEFAULT_ROCM_PATH, "rocprofiler/lib", "metrics.xml")); + + for(const auto& itr : _possible_rocprof_libs) + { + if(path::exists(itr)) + { + setenv("OMNITRACE_ROCPROFILER_LIBRARY", itr.c_str(), 0); + _possible_rocp_metrics.emplace( + _possible_rocp_metrics.begin(), + common::join('/', path::dirname(itr), + "../../lib/rocprofiler/metrics.xml")); + _possible_rocp_metrics.emplace( + _possible_rocp_metrics.begin(), + common::join('/', path::dirname(itr), "metrics.xml")); + } + } + + for(const auto& itr : _possible_rocp_metrics) + if(path::exists(itr)) setenv("ROCP_METRICS", itr.c_str(), 0); + + // default if none of above succeeded setenv("ROCP_METRICS", common::join('/', OMNITRACE_DEFAULT_ROCM_PATH, ROCPROFILER_METRICS_DIR, "metrics.xml") .c_str(), 0); + #endif #if defined(OMNITRACE_USE_OMPT) && OMNITRACE_USE_OMPT > 0 diff --git a/source/lib/omnitrace/library.cpp b/source/lib/omnitrace/library.cpp index e739b058a..28247baa0 100644 --- a/source/lib/omnitrace/library.cpp +++ b/source/lib/omnitrace/library.cpp @@ -85,6 +85,9 @@ start(); namespace { +auto _timemory_manager = tim::manager::instance(); +auto _timemory_settings = tim::settings::shared_instance(); + auto ensure_finalization(bool _static_init = false) { @@ -105,6 +108,14 @@ ensure_finalization(bool _static_init = false) (void) tim::manager::instance(); (void) tim::settings::shared_instance(); + if(!tim::get_shared_ptr_pair_callback()) + { + tim::get_shared_ptr_pair_callback() = + new tim::shared_ptr_pair_callback_t{ [](int64_t _n) { + if(_n == 0) omnitrace_finalize_hidden(); + } }; + } + if(_static_init) { OMNITRACE_BASIC_DEBUG_F("\n"); @@ -468,8 +479,6 @@ omnitrace_init_tooling_hidden() // ends the tracing session static auto _ensure_finalization = ensure_finalization(); - if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n"); - return true; } @@ -569,8 +578,7 @@ omnitrace_finalize_hidden(void) return; } - if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n"); - + OMNITRACE_VERBOSE_F(0, "\n"); OMNITRACE_VERBOSE_F(0, "finalizing...\n"); sampling::block_samples(); @@ -614,8 +622,8 @@ omnitrace_finalize_hidden(void) { if(dmp::rank() == 0) { - fprintf(stderr, "\n"); - config::print_settings(); + OMNITRACE_PRINT_F("\n"); + config::print_settings(get_env("OMNITRACE_PRINT_ENV", get_debug())); } } @@ -737,7 +745,7 @@ omnitrace_finalize_hidden(void) // report the high-level metrics for the process if(get_main_bundle()) { - if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n"); + OMNITRACE_VERBOSE_F(0, "\n"); std::string _msg = JOIN("", *get_main_bundle()); auto _pos = _msg.find(">>> "); if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5); @@ -762,7 +770,7 @@ omnitrace_finalize_hidden(void) } } - if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n"); + OMNITRACE_VERBOSE_F(0, "\n"); // ensure that all the MT instances are flushed if(get_use_sampling()) @@ -902,10 +910,9 @@ omnitrace_finalize_hidden(void) // Write the trace into a file. ofs.write(&trace_data[0], trace_data.size()); if(get_verbose() >= 0) _fom.append("%s", "Done"); // NOLINT - auto _manager = tim::manager::instance(); - if(_manager) - _manager->add_file_output("protobuf", "perfetto", - get_perfetto_output_filename()); + if(_timemory_manager) + _timemory_manager->add_file_output("protobuf", "perfetto", + get_perfetto_output_filename()); } ofs.close(); } @@ -917,28 +924,28 @@ omnitrace_finalize_hidden(void) } } - tim::manager::instance()->add_metadata([](auto& ar) { - auto _maps = tim::procfs::read_maps(process::get_id()); - auto _libs = std::set{}; - for(auto& itr : _maps) - { - auto&& _path = itr.pathname; - if(!_path.empty() && _path.at(0) != '[') _libs.emplace(_path); - } - ar(tim::cereal::make_nvp("memory_maps_files", _libs), - tim::cereal::make_nvp("memory_maps", _maps)); - }); + if(_timemory_manager && _timemory_manager != nullptr) + { + _timemory_manager->add_metadata([](auto& ar) { + auto _maps = tim::procfs::read_maps(process::get_id()); + auto _libs = std::set{}; + for(auto& itr : _maps) + { + auto&& _path = itr.pathname; + if(!_path.empty() && _path.at(0) != '[') _libs.emplace(_path); + } + ar(tim::cereal::make_nvp("memory_maps_files", _libs), + tim::cereal::make_nvp("memory_maps", _maps)); + }); - auto _manager = tim::manager::instance(); - if(_manager) _manager->set_write_metadata(-1); + _timemory_manager->set_write_metadata(-1); - OMNITRACE_VERBOSE_F(1, "Finalizing timemory...\n"); - tim::timemory_finalize(); + OMNITRACE_VERBOSE_F(1, "Finalizing timemory...\n"); + tim::timemory_finalize(_timemory_manager.get()); - if(_manager) - { - _manager->write_metadata(settings::get_global_output_prefix(), "omnitrace", - settings::default_process_suffix()); + _timemory_manager->write_metadata(settings::get_global_output_prefix(), + "omnitrace", + settings::default_process_suffix()); } if(_perfetto_output_error) diff --git a/source/lib/omnitrace/library/debug.hpp b/source/lib/omnitrace/library/debug.hpp index fa3fc4280..12894a4d2 100644 --- a/source/lib/omnitrace/library/debug.hpp +++ b/source/lib/omnitrace/library/debug.hpp @@ -531,6 +531,11 @@ get_chars(T&& _c, std::index_sequence) (::omnitrace::get_verbose_env() >= LEVEL), \ __VA_ARGS__) +#define OMNITRACE_WARNING_IF(COND, ...) OMNITRACE_CONDITIONAL_WARN((COND), __VA_ARGS__) + +#define OMNITRACE_WARNING_IF_F(COND, ...) \ + OMNITRACE_CONDITIONAL_WARN_F((COND), __VA_ARGS__) + //--------------------------------------------------------------------------------------// // // Basic print macros (basic means it will not provide PID/RANK or TID) and will not diff --git a/source/lib/omnitrace/library/roctracer.cpp b/source/lib/omnitrace/library/roctracer.cpp index 534da1fc9..7a1d74f7f 100644 --- a/source/lib/omnitrace/library/roctracer.cpp +++ b/source/lib/omnitrace/library/roctracer.cpp @@ -872,24 +872,25 @@ hip_activity_callback(const char* begin, const char* end, void*) } } + static auto _op_id_names = + std::array{ "DISPATCH", "COPY", "BARRIER" }; + { static size_t _n = 0; - OMNITRACE_CONDITIONAL_PRINT_F( - (get_debug() && get_verbose() >= 2) || _end_ns <= _beg_ns, - "%4zu :: %-20s :: %-20s :: cid=%lu, time_ns=(%12lu:%12lu) " - "delta=%li, device_id=%d, stream_id=%lu, pid=%u, tid=%lu\n", + OMNITRACE_WARNING_IF_F( + _end_ns <= _beg_ns, + "%4zu :: Discarding kernel roctracer activity record which ended before " + "it started :: %-20s :: %-20s :: cid=%lu, time_ns=(%12lu:%12lu) " + "delta=%li, device=%d, queue=%lu, pid=%u, tid=%lu, op=%s\n", _n++, op_name, _name, record->correlation_id, _beg_ns, _end_ns, (static_cast(_end_ns) - static_cast(_beg_ns)), _devid, - _queid, record->process_id, _tid); + _queid, record->process_id, _tid, _op_id_names.at(record->op)); if(_end_ns <= _beg_ns) continue; } // execute this on this thread bc of how perfetto visualization works if(get_use_perfetto()) { - static auto _op_id_names = - std::array{ "DISPATCH", "COPY", "BARRIER" }; - if(_kernel_names.find(_name) == _kernel_names.end()) _kernel_names.emplace(_name, tim::demangle(_name)); @@ -897,8 +898,8 @@ hip_activity_callback(const char* begin, const char* end, void*) tracing::push_perfetto_ts( category::device_hip{}, _kernel_names.at(_name).c_str(), _beg_ns, perfetto::Flow::ProcessScoped(_cid), "begin_ns", _beg_ns, "corr_id", - record->correlation_id, "device", _devid, "queue", _queid, "op", - _op_id_names.at(record->op)); + record->correlation_id, "device", _devid, "queue", _queid, "pid", + record->process_id, "tid", _tid, "op", _op_id_names.at(record->op)); tracing::pop_perfetto_ts(category::device_hip{}, "", _end_ns, "end_ns", _end_ns); // for some reason, this is necessary to make sure very last one ends diff --git a/source/lib/omnitrace/library/sampling.cpp b/source/lib/omnitrace/library/sampling.cpp index a5a92d509..07fbf2f6d 100644 --- a/source/lib/omnitrace/library/sampling.cpp +++ b/source/lib/omnitrace/library/sampling.cpp @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -411,13 +412,28 @@ configure(bool _setup, int64_t _tid) if(_tid == 0) { + block_samples(); + // this propagates to all threads _sampler->ignore(*_signal_types); + + // wait for the samples to finish + auto _freq = + std::max(get_sampling_cpu_freq(), get_sampling_real_freq()); + auto _period = (1.0 / _freq) * units::sec; + _period = std::max(_period, 1.0e9); // max of 1 second + std::this_thread::sleep_for( + std::chrono::nanoseconds{ static_cast(_period) }); + + for(int64_t i = 1; i < OMNITRACE_MAX_THREADS; ++i) + { + if(sampling::get_sampler(i)) sampling::get_sampler(i)->stop(); + } + for(int64_t i = 1; i < OMNITRACE_MAX_THREADS; ++i) { if(sampling::get_sampler(i)) { - sampling::get_sampler(i)->stop(); sampling::get_sampler(i)->reset(); *get_sampler_running(i) = false; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3a10ffdee..6b4d08b5a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -193,7 +193,9 @@ endif() # -------------------------------------------------------------------------------------- # function(OMNITRACE_WRITE_TEST_CONFIG _FILE _ENV) - set(_ENV_ONLY "OMNITRACE_(USE_MPIP|DEBUG_SETTINGS|FORCE_ROCPROFILER_INIT)=") + set(_ENV_ONLY + "OMNITRACE_(USE_MPIP|DEBUG_SETTINGS|FORCE_ROCPROFILER_INIT|DEFAULT_MIN_INSTRUCTIONS)=" + ) set(_FILE_CONTENTS) set(_ENV_CONTENTS) @@ -393,7 +395,8 @@ function(OMNITRACE_ADD_TEST) endif() set(_environ - "${TEST_ENVIRONMENT}" "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output" + "OMNITRACE_DEFAULT_MIN_INSTRUCTIONS=64" "${TEST_ENVIRONMENT}" + "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output" "OMNITRACE_OUTPUT_PREFIX=${_prefix}") set(_timeout ${TEST_REWRITE_TIMEOUT})