From 15127c0d438828cda3b50cac112ad7c1a0c4a234 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Thu, 8 Feb 2024 07:06:23 -0600 Subject: [PATCH] OMNITRACE_ROCM_SMI_METRICS (#331) * OMNITRACE_ROCM_SMI_METRICS - configuration variable OMNITRACE_ROCM_SMI_METRICS for specifying which rocm-smi metrics to collect - auto-disable metric collection when rsmi_dev_X_get returns RSMI_STATUS_NOT_SUPPORTED * Bump version to 1.11.1 * Python formatting * Update python/libpyomnitrace.cpp - fix usage of substr (ignored return value) * Update python/gui/source/gui.py - Fix E721 - do not compare types, for exact checks use `is` / `is not`, for instance checks use `isinstance()` --- VERSION | 2 +- source/lib/core/config.cpp | 5 + source/lib/omnitrace/library/rocm_smi.cpp | 133 +++++++++++++++++----- source/lib/omnitrace/library/rocm_smi.hpp | 8 ++ source/python/gui/source/gui.py | 10 +- source/python/gui/source/header.py | 2 +- source/python/libpyomnitrace.cpp | 2 +- 7 files changed, 127 insertions(+), 35 deletions(-) diff --git a/VERSION b/VERSION index 1cac385c6..720c7384c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.11.0 +1.11.1 diff --git a/source/lib/core/config.cpp b/source/lib/core/config.cpp index 6ca5f8065..9615bd397 100644 --- a/source/lib/core/config.cpp +++ b/source/lib/core/config.cpp @@ -647,6 +647,11 @@ configure_settings(bool _init) "is collected on every available device", "", "rocprofiler", "rocm", "hardware_counters"); + OMNITRACE_CONFIG_SETTING(std::string, "OMNITRACE_ROCM_SMI_METRICS", + "rocm-smi metrics to collect: busy, temp, power, mem_usage", + "busy,temp,power,mem_usage", "backend", "rocm_smi", "rocm", + "process_sampling", "advanced"); + OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_CRITICAL_TRACE_DEBUG", "Enable debugging for critical trace", _omnitrace_debug, "debugging", "critical_trace", "advanced"); diff --git a/source/lib/omnitrace/library/rocm_smi.cpp b/source/lib/omnitrace/library/rocm_smi.cpp index c77454476..7170a250c 100644 --- a/source/lib/omnitrace/library/rocm_smi.cpp +++ b/source/lib/omnitrace/library/rocm_smi.cpp @@ -44,7 +44,9 @@ #include #include +#include #include +#include #include #include @@ -58,8 +60,8 @@ #include #include -#define OMNITRACE_ROCM_SMI_CALL(ERROR_CODE) \ - ::omnitrace::rocm_smi::check_error(ERROR_CODE, __FILE__, __LINE__) +#define OMNITRACE_ROCM_SMI_CALL(...) \ + ::omnitrace::rocm_smi::check_error(__FILE__, __LINE__, __VA_ARGS__) namespace omnitrace { @@ -70,6 +72,13 @@ using sampler_instances = thread_data; namespace { +auto& +get_settings(uint32_t _dev_id) +{ + static auto _v = std::unordered_map{}; + return _v[_dev_id]; +} + bool& is_initialized() { @@ -78,9 +87,16 @@ is_initialized() } void -check_error(rsmi_status_t _code, const char* _file, int _line) +check_error(const char* _file, int _line, rsmi_status_t _code, bool* _option = nullptr) { - if(_code == RSMI_STATUS_SUCCESS) return; + if(_code == RSMI_STATUS_SUCCESS) + return; + else if(_code == RSMI_STATUS_NOT_SUPPORTED && _option) + { + *_option = false; + return; + } + const char* _msg = nullptr; auto _err = rsmi_status_string(_code, &_msg); if(_err != RSMI_STATUS_SUCCESS) @@ -120,24 +136,29 @@ data::sample(uint32_t _dev_id) m_dev_id = _dev_id; m_ts = _ts; -#define OMNITRACE_RSMI_GET(FUNCTION, ...) \ - try \ +#define OMNITRACE_RSMI_GET(OPTION, FUNCTION, ...) \ + if(OPTION) \ { \ - OMNITRACE_ROCM_SMI_CALL(FUNCTION(__VA_ARGS__)); \ - } catch(std::runtime_error & _e) \ - { \ - OMNITRACE_VERBOSE_F( \ - 0, "[%s] Exception: %s. Disabling future samples from rocm-smi...\n", \ - #FUNCTION, _e.what()); \ - get_state().store(State::Disabled); \ + try \ + { \ + OMNITRACE_ROCM_SMI_CALL(FUNCTION(__VA_ARGS__), &OPTION); \ + } catch(std::runtime_error & _e) \ + { \ + OMNITRACE_VERBOSE_F( \ + 0, "[%s] Exception: %s. Disabling future samples from rocm-smi...\n", \ + #FUNCTION, _e.what()); \ + get_state().store(State::Disabled); \ + } \ } - OMNITRACE_RSMI_GET(rsmi_dev_busy_percent_get, _dev_id, &m_busy_perc); - OMNITRACE_RSMI_GET(rsmi_dev_temp_metric_get, _dev_id, RSMI_TEMP_TYPE_EDGE, - RSMI_TEMP_CURRENT, &m_temp); - OMNITRACE_RSMI_GET(rsmi_dev_power_ave_get, _dev_id, 0, &m_power); - OMNITRACE_RSMI_GET(rsmi_dev_memory_usage_get, _dev_id, RSMI_MEM_TYPE_VRAM, - &m_mem_usage); + OMNITRACE_RSMI_GET(get_settings(m_dev_id).busy, rsmi_dev_busy_percent_get, _dev_id, + &m_busy_perc); + OMNITRACE_RSMI_GET(get_settings(m_dev_id).temp, rsmi_dev_temp_metric_get, _dev_id, + RSMI_TEMP_TYPE_EDGE, RSMI_TEMP_CURRENT, &m_temp); + OMNITRACE_RSMI_GET(get_settings(m_dev_id).power, rsmi_dev_power_ave_get, _dev_id, 0, + &m_power); + OMNITRACE_RSMI_GET(get_settings(m_dev_id).mem_usage, rsmi_dev_memory_usage_get, + _dev_id, RSMI_MEM_TYPE_VRAM, &m_mem_usage); #undef OMNITRACE_RSMI_GET } @@ -249,7 +270,19 @@ data::post_process(uint32_t _dev_id) OMNITRACE_CI_THROW(!_thread_info, "Missing thread info for thread 0"); if(!_thread_info) return; + auto _settings = get_settings(_dev_id); + auto _process_perfetto = [&]() { + auto _idx = std::array{}; + { + _idx.fill(_idx.size()); + uint64_t nidx = 0; + if(_settings.busy) _idx.at(0) = nidx++; + if(_settings.temp) _idx.at(1) = nidx++; + if(_settings.power) _idx.at(2) = nidx++; + if(_settings.mem_usage) _idx.at(3) = nidx++; + } + for(auto& itr : _rocm_smi) { using counter_track = perfetto_counter_track; @@ -259,10 +292,15 @@ data::post_process(uint32_t _dev_id) auto addendum = [&](const char* _v) { return JOIN(" ", "GPU", _v, JOIN("", '[', _dev_id, ']'), "(S)"); }; - counter_track::emplace(_dev_id, addendum("Busy"), "%"); - counter_track::emplace(_dev_id, addendum("Temperature"), "deg C"); - counter_track::emplace(_dev_id, addendum("Power"), "watts"); - counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes"); + + if(_settings.busy) counter_track::emplace(_dev_id, addendum("Busy"), "%"); + if(_settings.temp) + counter_track::emplace(_dev_id, addendum("Temperature"), "deg C"); + if(_settings.power) + counter_track::emplace(_dev_id, addendum("Power"), "watts"); + if(_settings.mem_usage) + counter_track::emplace(_dev_id, addendum("Memory Usage"), + "megabytes"); } uint64_t _ts = itr.m_ts; if(!_thread_info->is_valid_time(_ts)) continue; @@ -271,11 +309,19 @@ data::post_process(uint32_t _dev_id) double _temp = itr.m_temp / 1.0e3; double _power = itr.m_power / 1.0e6; double _usage = itr.m_mem_usage / static_cast(units::megabyte); - TRACE_COUNTER("device_busy", counter_track::at(_dev_id, 0), _ts, _busy); - TRACE_COUNTER("device_temp", counter_track::at(_dev_id, 1), _ts, _temp); - TRACE_COUNTER("device_power", counter_track::at(_dev_id, 2), _ts, _power); - TRACE_COUNTER("device_memory_usage", counter_track::at(_dev_id, 3), _ts, - _usage); + + if(_settings.busy) + TRACE_COUNTER("device_busy", counter_track::at(_dev_id, _idx.at(0)), _ts, + _busy); + if(_settings.temp) + TRACE_COUNTER("device_temp", counter_track::at(_dev_id, _idx.at(1)), _ts, + _temp); + if(_settings.power) + TRACE_COUNTER("device_power", counter_track::at(_dev_id, _idx.at(2)), _ts, + _power); + if(_settings.mem_usage) + TRACE_COUNTER("device_memory_usage", + counter_track::at(_dev_id, _idx.at(3)), _ts, _usage); } }; @@ -288,6 +334,11 @@ data::post_process(uint32_t _dev_id) using samp_bundle_t = tim::lightweight_tuple; + trait::runtime_enabled::set(_settings.busy); + trait::runtime_enabled::set(_settings.temp); + trait::runtime_enabled::set(_settings.power); + trait::runtime_enabled::set(_settings.mem_usage); + using entry_t = critical_trace::entry; auto _gpu_entries = critical_trace::get_entries( [](const entry_t& _e) { return (_e.device == critical_trace::Device::GPU); }); @@ -391,6 +442,8 @@ setup() data::device_list = _devices; + auto _metrics = get_setting_value("OMNITRACE_ROCM_SMI_METRICS"); + try { for(auto itr : _devices) @@ -398,6 +451,30 @@ setup() uint16_t dev_id = 0; OMNITRACE_ROCM_SMI_CALL(rsmi_dev_id_get(itr, &dev_id)); // dev_id holds the device ID of device i, upon a successful call + + if(_metrics && !_metrics->empty()) + { + using key_pair_t = std::pair; + const auto supported = std::unordered_map{ + key_pair_t{ "busy", get_settings(dev_id).busy }, + key_pair_t{ "temp", get_settings(dev_id).temp }, + key_pair_t{ "power", get_settings(dev_id).power }, + key_pair_t{ "mem_usage", get_settings(dev_id).mem_usage }, + }; + + get_settings(dev_id) = { false, false, false, false }; + for(const auto& metric : tim::delimit(*_metrics, ",;:\t\n ")) + { + auto iitr = supported.find(metric); + if(iitr == supported.end()) + OMNITRACE_FAIL_F("unsupported rocm-smi metric: %s\n", + metric.c_str()); + + OMNITRACE_VERBOSE_F(1, "Enabling rocm-smi metric '%s'\n", + metric.c_str()); + iitr->second = true; + } + } } is_initialized() = true; diff --git a/source/lib/omnitrace/library/rocm_smi.hpp b/source/lib/omnitrace/library/rocm_smi.hpp index e73e1a117..f6e45027e 100644 --- a/source/lib/omnitrace/library/rocm_smi.hpp +++ b/source/lib/omnitrace/library/rocm_smi.hpp @@ -69,6 +69,14 @@ void set_state(State); uint32_t device_count(); +struct settings +{ + bool busy = true; + bool temp = true; + bool power = true; + bool mem_usage = true; +}; + struct data { using msec_t = std::chrono::milliseconds; diff --git a/source/python/gui/source/gui.py b/source/python/gui/source/gui.py index 2bcf4df70..638c2c1c7 100644 --- a/source/python/gui/source/gui.py +++ b/source/python/gui/source/gui.py @@ -191,7 +191,7 @@ def update_line_graph( def reset_input_filters(workloads, max_points, verbosity): sortOptions = ["Alphabetical", "Max Speedup", "Min Speedup", "Impact"] - if type(workloads) == str: + if isinstance(workloads, str): workloads = [workloads] input_filters = [ @@ -241,9 +241,11 @@ def build_causal_layout( ] app.layout = html.Div( - style={"backgroundColor": "rgb(255, 255, 255)"} - if light_mode - else {"backgroundColor": "rgb(50, 50, 50)"} + style=( + {"backgroundColor": "rgb(255, 255, 255)"} + if light_mode + else {"backgroundColor": "rgb(50, 50, 50)"} + ) ) line_graph1, line_graph2 = build_line_graph() diff --git a/source/python/gui/source/header.py b/source/python/gui/source/header.py index 1c51fda53..6ef5a8ca2 100644 --- a/source/python/gui/source/header.py +++ b/source/python/gui/source/header.py @@ -186,7 +186,7 @@ def get_header(dropDownMenuItems, input_filters): ul = html.Div( id="nav-center", className="nav-center", - children=filter_children + children=filter_children, # [ # html.Li(className="filter", children=filter_children), # refresh(), diff --git a/source/python/libpyomnitrace.cpp b/source/python/libpyomnitrace.cpp index 04fe3a9a5..59ab191c4 100644 --- a/source/python/libpyomnitrace.cpp +++ b/source/python/libpyomnitrace.cpp @@ -135,7 +135,7 @@ PYBIND11_MODULE(libpyomnitrace, omni) } if(!_cmd_line.empty()) { - _cmd_line.substr(_cmd_line.find_first_not_of(' ')); + _cmd_line = _cmd_line.substr(_cmd_line.find_first_not_of(' ')); tim::set_env("OMNITRACE_COMMAND_LINE", _cmd_line, 0); } omnitrace_init("trace", false, _cmd.c_str());