Skip to content

Commit

Permalink
Raise default min number of instructions (#173)
Browse files Browse the repository at this point in the history
- Raise min instructions default to 1024 instead of 64
- The default value of 64 has demonstrated a tendency to slow down real-life
applications
- Improved the memory safety during `omnitrace_finalize()`
- New modifications guarantee that when `tim::manager::instance()` on the
main thread is destroyed, omnitrace will have finalized beforehand
- Improved some warnings with roctracer
- Improved the search for `ROCP_METRICS` and
`OMNITRACE_ROCPROFILER_LIBRARY`
- disable printing env by default
- Attempted to improve the sampling shutdown
  • Loading branch information
jrmadsen authored Oct 1, 2022
1 parent 79a8f16 commit 7d7a8f2
Show file tree
Hide file tree
Showing 9 changed files with 147 additions and 59 deletions.
4 changes: 4 additions & 0 deletions examples/transpose/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ else()
target_compile_options(transpose PRIVATE -W -Wall)
endif()

# When the build type matches the regex "Release", additionally compile the
# transpose example with -g1 (the compiler's minimal debug-info level).
if("${CMAKE_BUILD_TYPE}" MATCHES "Release")
target_compile_options(transpose PRIVATE -g1)
endif()

if(TRANSPOSE_USE_MPI)
target_compile_definitions(transpose PRIVATE USE_MPI)
target_link_libraries(transpose PRIVATE MPI::MPI_C)
Expand Down
24 changes: 20 additions & 4 deletions source/bin/omnitrace/omnitrace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,22 @@
# define OMNITRACE_USE_MPI_HEADERS 0
#endif

namespace
{
// Baseline for the minimum-instruction thresholds: the value of the
// OMNITRACE_DEFAULT_MIN_INSTRUCTIONS environment variable when it is set,
// otherwise 1024.
auto
get_default_min_instructions()
{
    constexpr auto _fallback = (1 << 10);  // 1024
    return tim::get_env<size_t>("OMNITRACE_DEFAULT_MIN_INSTRUCTIONS", _fallback, false);
}

// Baseline for the minimum address-range thresholds: four times the default
// minimum instruction count (4096 when the environment variable is not set).
auto
get_default_min_address_range()
{
    const auto _min_instructions = get_default_min_instructions();
    return 4 * _min_instructions;
}
}  // namespace

bool use_return_info = false;
bool use_args_info = false;
bool use_file_info = false;
Expand All @@ -73,10 +89,10 @@ bool loop_level_instr = false;
bool instr_dynamic_callsites = false;
bool instr_traps = false;
bool instr_loop_traps = false;
size_t min_address_range = (1 << 8); // 256
size_t min_loop_address_range = (1 << 8); // 256
size_t min_instructions = (1 << 6); // 64
size_t min_loop_instructions = (1 << 6); // 64
size_t min_address_range = get_default_min_address_range(); // 4096
size_t min_loop_address_range = get_default_min_address_range(); // 4096
size_t min_instructions = get_default_min_instructions(); // 1024
size_t min_loop_instructions = get_default_min_instructions(); // 1024
bool werror = false;
bool debug_print = false;
bool instr_print = false;
Expand Down
56 changes: 46 additions & 10 deletions source/lib/common/setup.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,15 @@ dirname(const std::string& _fname)
return _fname.substr(0, _fname.find_last_of('/'));
return std::string{};
}

/// Report whether `_fname` names an existing regular file.
///
/// `stat()` resolves symbolic links, so a symlink that points at a regular
/// file is reported as a regular file and a dangling symlink makes `stat()`
/// fail. The mode returned by `stat()` therefore never satisfies `S_ISLNK`,
/// and only `S_ISREG` needs to be tested.
///
/// @param _fname  Path to test.
/// @return true if the path (after following symlinks) is a regular file;
///         false if it does not exist, cannot be stat'ed, or is any other
///         kind of file (e.g. a directory).
inline bool
exists(const std::string& _fname)
{
    struct stat _buffer;
    if(stat(_fname.c_str(), &_buffer) != 0) return false;
    return S_ISREG(_buffer.st_mode) != 0;
}
} // namespace path

inline void
Expand All @@ -157,28 +166,55 @@ setup_environ(int _verbose, const std::string& _search_paths = {},
setenv("ROCPROFILER_LOG", "1", 0);
setenv("ROCP_HSA_INTERCEPT", "1", 0);
setenv("HSA_TOOLS_REPORT_LOAD_FAILURE", "1", 0);

auto _possible_rocp_metrics = std::vector<std::string>{};
auto _possible_rocprof_libs = std::vector<std::string>{};
for(const auto* itr : { "OMNITRACE_ROCM_PATH", "ROCM_PATH" })
{
if(getenv(itr))
{
setenv("ROCP_METRICS",
common::join('/', getenv(itr), ROCPROFILER_METRICS_DIR, "metrics.xml")
.c_str(),
0);
setenv("OMNITRACE_ROCPROFILER_LIBRARY",
common::join('/', getenv(itr), ROCPROFILER_METRICS_DIR,
"librocprofiler64.so")
.c_str(),
0);
break;
_possible_rocp_metrics.emplace_back(
common::join('/', getenv(itr), "lib/rocprofiler", "metrics.xml"));
_possible_rocprof_libs.emplace_back(
common::join('/', getenv(itr), "lib/rocprofiler", "librocprofiler64.so"));
_possible_rocp_metrics.emplace_back(
common::join('/', getenv(itr), "rocprofiler/lib", "metrics.xml"));
_possible_rocprof_libs.emplace_back(
common::join('/', getenv(itr), "rocprofiler/lib", "librocprofiler64.so"));
}
}

// default path
_possible_rocp_metrics.emplace_back(
common::join('/', OMNITRACE_DEFAULT_ROCM_PATH, "lib/rocprofiler", "metrics.xml"));
_possible_rocp_metrics.emplace_back(
common::join('/', OMNITRACE_DEFAULT_ROCM_PATH, "rocprofiler/lib", "metrics.xml"));

for(const auto& itr : _possible_rocprof_libs)
{
if(path::exists(itr))
{
setenv("OMNITRACE_ROCPROFILER_LIBRARY", itr.c_str(), 0);
_possible_rocp_metrics.emplace(
_possible_rocp_metrics.begin(),
common::join('/', path::dirname(itr),
"../../lib/rocprofiler/metrics.xml"));
_possible_rocp_metrics.emplace(
_possible_rocp_metrics.begin(),
common::join('/', path::dirname(itr), "metrics.xml"));
}
}

for(const auto& itr : _possible_rocp_metrics)
if(path::exists(itr)) setenv("ROCP_METRICS", itr.c_str(), 0);

// default if none of above succeeded
setenv("ROCP_METRICS",
common::join('/', OMNITRACE_DEFAULT_ROCM_PATH, ROCPROFILER_METRICS_DIR,
"metrics.xml")
.c_str(),
0);

#endif

#if defined(OMNITRACE_USE_OMPT) && OMNITRACE_USE_OMPT > 0
Expand Down
69 changes: 38 additions & 31 deletions source/lib/omnitrace/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ start();

namespace
{
// File-scope handles to the timemory manager and settings instances, obtained
// when this translation unit's namespace-scope objects are initialized.
// NOTE(review): presumably held so both instances stay alive until omnitrace
// has finalized -- confirm against the timemory ownership model.
auto _timemory_manager = tim::manager::instance();
auto _timemory_settings = tim::settings::shared_instance();

auto
ensure_finalization(bool _static_init = false)
{
Expand All @@ -105,6 +108,14 @@ ensure_finalization(bool _static_init = false)
(void) tim::manager::instance();
(void) tim::settings::shared_instance();

if(!tim::get_shared_ptr_pair_callback())
{
tim::get_shared_ptr_pair_callback() =
new tim::shared_ptr_pair_callback_t{ [](int64_t _n) {
if(_n == 0) omnitrace_finalize_hidden();
} };
}

if(_static_init)
{
OMNITRACE_BASIC_DEBUG_F("\n");
Expand Down Expand Up @@ -468,8 +479,6 @@ omnitrace_init_tooling_hidden()
// ends the tracing session
static auto _ensure_finalization = ensure_finalization();

if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n");

return true;
}

Expand Down Expand Up @@ -569,8 +578,7 @@ omnitrace_finalize_hidden(void)
return;
}

if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n");

OMNITRACE_VERBOSE_F(0, "\n");
OMNITRACE_VERBOSE_F(0, "finalizing...\n");

sampling::block_samples();
Expand Down Expand Up @@ -614,8 +622,8 @@ omnitrace_finalize_hidden(void)
{
if(dmp::rank() == 0)
{
fprintf(stderr, "\n");
config::print_settings();
OMNITRACE_PRINT_F("\n");
config::print_settings(get_env<bool>("OMNITRACE_PRINT_ENV", get_debug()));
}
}

Expand Down Expand Up @@ -737,7 +745,7 @@ omnitrace_finalize_hidden(void)
// report the high-level metrics for the process
if(get_main_bundle())
{
if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n");
OMNITRACE_VERBOSE_F(0, "\n");
std::string _msg = JOIN("", *get_main_bundle());
auto _pos = _msg.find(">>> ");
if(_pos != std::string::npos) _msg = _msg.substr(_pos + 5);
Expand All @@ -762,7 +770,7 @@ omnitrace_finalize_hidden(void)
}
}

if(get_verbose() >= 0 || get_debug()) fprintf(stderr, "\n");
OMNITRACE_VERBOSE_F(0, "\n");

// ensure that all the MT instances are flushed
if(get_use_sampling())
Expand Down Expand Up @@ -902,10 +910,9 @@ omnitrace_finalize_hidden(void)
// Write the trace into a file.
ofs.write(&trace_data[0], trace_data.size());
if(get_verbose() >= 0) _fom.append("%s", "Done"); // NOLINT
auto _manager = tim::manager::instance();
if(_manager)
_manager->add_file_output("protobuf", "perfetto",
get_perfetto_output_filename());
if(_timemory_manager)
_timemory_manager->add_file_output("protobuf", "perfetto",
get_perfetto_output_filename());
}
ofs.close();
}
Expand All @@ -917,28 +924,28 @@ omnitrace_finalize_hidden(void)
}
}

tim::manager::instance()->add_metadata([](auto& ar) {
auto _maps = tim::procfs::read_maps(process::get_id());
auto _libs = std::set<std::string>{};
for(auto& itr : _maps)
{
auto&& _path = itr.pathname;
if(!_path.empty() && _path.at(0) != '[') _libs.emplace(_path);
}
ar(tim::cereal::make_nvp("memory_maps_files", _libs),
tim::cereal::make_nvp("memory_maps", _maps));
});
if(_timemory_manager && _timemory_manager != nullptr)
{
_timemory_manager->add_metadata([](auto& ar) {
auto _maps = tim::procfs::read_maps(process::get_id());
auto _libs = std::set<std::string>{};
for(auto& itr : _maps)
{
auto&& _path = itr.pathname;
if(!_path.empty() && _path.at(0) != '[') _libs.emplace(_path);
}
ar(tim::cereal::make_nvp("memory_maps_files", _libs),
tim::cereal::make_nvp("memory_maps", _maps));
});

auto _manager = tim::manager::instance();
if(_manager) _manager->set_write_metadata(-1);
_timemory_manager->set_write_metadata(-1);

OMNITRACE_VERBOSE_F(1, "Finalizing timemory...\n");
tim::timemory_finalize();
OMNITRACE_VERBOSE_F(1, "Finalizing timemory...\n");
tim::timemory_finalize(_timemory_manager.get());

if(_manager)
{
_manager->write_metadata(settings::get_global_output_prefix(), "omnitrace",
settings::default_process_suffix());
_timemory_manager->write_metadata(settings::get_global_output_prefix(),
"omnitrace",
settings::default_process_suffix());
}

if(_perfetto_output_error)
Expand Down
5 changes: 5 additions & 0 deletions source/lib/omnitrace/library/debug.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,11 @@ get_chars(T&& _c, std::index_sequence<Idx...>)
(::omnitrace::get_verbose_env() >= LEVEL), \
__VA_ARGS__)

// Shorthand for OMNITRACE_CONDITIONAL_WARN: COND is wrapped in parentheses
// and the remaining arguments are forwarded unchanged.
#define OMNITRACE_WARNING_IF(COND, ...) OMNITRACE_CONDITIONAL_WARN((COND), __VA_ARGS__)

// Shorthand for OMNITRACE_CONDITIONAL_WARN_F: COND is wrapped in parentheses
// and the remaining arguments are forwarded unchanged.
#define OMNITRACE_WARNING_IF_F(COND, ...) \
OMNITRACE_CONDITIONAL_WARN_F((COND), __VA_ARGS__)

//--------------------------------------------------------------------------------------//
//
// Basic print macros (basic means it will not provide PID/RANK or TID) and will not
Expand Down
21 changes: 11 additions & 10 deletions source/lib/omnitrace/library/roctracer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -872,33 +872,34 @@ hip_activity_callback(const char* begin, const char* end, void*)
}
}

static auto _op_id_names =
std::array<const char*, 3>{ "DISPATCH", "COPY", "BARRIER" };

{
static size_t _n = 0;
OMNITRACE_CONDITIONAL_PRINT_F(
(get_debug() && get_verbose() >= 2) || _end_ns <= _beg_ns,
"%4zu :: %-20s :: %-20s :: cid=%lu, time_ns=(%12lu:%12lu) "
"delta=%li, device_id=%d, stream_id=%lu, pid=%u, tid=%lu\n",
OMNITRACE_WARNING_IF_F(
_end_ns <= _beg_ns,
"%4zu :: Discarding kernel roctracer activity record which ended before "
"it started :: %-20s :: %-20s :: cid=%lu, time_ns=(%12lu:%12lu) "
"delta=%li, device=%d, queue=%lu, pid=%u, tid=%lu, op=%s\n",
_n++, op_name, _name, record->correlation_id, _beg_ns, _end_ns,
(static_cast<int64_t>(_end_ns) - static_cast<int64_t>(_beg_ns)), _devid,
_queid, record->process_id, _tid);
_queid, record->process_id, _tid, _op_id_names.at(record->op));
if(_end_ns <= _beg_ns) continue;
}

// execute this on this thread bc of how perfetto visualization works
if(get_use_perfetto())
{
static auto _op_id_names =
std::array<const char*, 3>{ "DISPATCH", "COPY", "BARRIER" };

if(_kernel_names.find(_name) == _kernel_names.end())
_kernel_names.emplace(_name, tim::demangle(_name));

assert(_end_ns >= _beg_ns);
tracing::push_perfetto_ts(
category::device_hip{}, _kernel_names.at(_name).c_str(), _beg_ns,
perfetto::Flow::ProcessScoped(_cid), "begin_ns", _beg_ns, "corr_id",
record->correlation_id, "device", _devid, "queue", _queid, "op",
_op_id_names.at(record->op));
record->correlation_id, "device", _devid, "queue", _queid, "pid",
record->process_id, "tid", _tid, "op", _op_id_names.at(record->op));
tracing::pop_perfetto_ts(category::device_hip{}, "", _end_ns, "end_ns",
_end_ns);
// for some reason, this is necessary to make sure very last one ends
Expand Down
18 changes: 17 additions & 1 deletion source/lib/omnitrace/library/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
#include <timemory/sampling/allocator.hpp>
#include <timemory/sampling/sampler.hpp>
#include <timemory/storage.hpp>
#include <timemory/units.hpp>
#include <timemory/utility/backtrace.hpp>
#include <timemory/utility/demangle.hpp>
#include <timemory/utility/types.hpp>
Expand Down Expand Up @@ -411,13 +412,28 @@ configure(bool _setup, int64_t _tid)

if(_tid == 0)
{
block_samples();

// this propagates to all threads
_sampler->ignore(*_signal_types);

// wait for the samples to finish
auto _freq =
std::max<double>(get_sampling_cpu_freq(), get_sampling_real_freq());
auto _period = (1.0 / _freq) * units::sec;
_period = std::max<double>(_period, 1.0e9); // max of 1 second
std::this_thread::sleep_for(
std::chrono::nanoseconds{ static_cast<int64_t>(_period) });

for(int64_t i = 1; i < OMNITRACE_MAX_THREADS; ++i)
{
if(sampling::get_sampler(i)) sampling::get_sampler(i)->stop();
}

for(int64_t i = 1; i < OMNITRACE_MAX_THREADS; ++i)
{
if(sampling::get_sampler(i))
{
sampling::get_sampler(i)->stop();
sampling::get_sampler(i)->reset();
*get_sampler_running(i) = false;
}
Expand Down
7 changes: 5 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,9 @@ endif()
# -------------------------------------------------------------------------------------- #

function(OMNITRACE_WRITE_TEST_CONFIG _FILE _ENV)
set(_ENV_ONLY "OMNITRACE_(USE_MPIP|DEBUG_SETTINGS|FORCE_ROCPROFILER_INIT)=")
set(_ENV_ONLY
"OMNITRACE_(USE_MPIP|DEBUG_SETTINGS|FORCE_ROCPROFILER_INIT|DEFAULT_MIN_INSTRUCTIONS)="
)
set(_FILE_CONTENTS)
set(_ENV_CONTENTS)

Expand Down Expand Up @@ -393,7 +395,8 @@ function(OMNITRACE_ADD_TEST)
endif()

set(_environ
"${TEST_ENVIRONMENT}" "OMNITRACE_OUTPUT_PATH=omnitrace-tests-output"
"OMNITRACE_DEFAULT_MIN_INSTRUCTIONS=64" "${TEST_ENVIRONMENT}"
"OMNITRACE_OUTPUT_PATH=omnitrace-tests-output"
"OMNITRACE_OUTPUT_PREFIX=${_prefix}")

set(_timeout ${TEST_REWRITE_TIMEOUT})
Expand Down

0 comments on commit 7d7a8f2

Please sign in to comment.