Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Diagnostics] Add diagnostics of execution time and periodicity of the hardware components #2005

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
17386af
Add MovingAverageStatistics struct
saikishor Jan 14, 2025
8ba2f6b
Add HardwareComponentCycleStatus return type
saikishor Jan 14, 2025
0a4c273
initialize the statistics
saikishor Jan 14, 2025
7c7fade
Integrate HardwareComponentCycleStatus on the trigger read and write …
saikishor Jan 14, 2025
ef87463
Add statistics_types.hpp header
saikishor Jan 15, 2025
2f00c4e
add HardwareComponentStatisticsCollector
saikishor Jan 15, 2025
4f7cdad
Change to prio_inherit_mutex instead of recursive one
saikishor Jan 15, 2025
5260019
Fill in the HardwareComponentCycleStatus from read and write cycle in…
saikishor Jan 15, 2025
28bf286
Return trigger result for system
saikishor Jan 15, 2025
241a325
add logic to update the statistics information
saikishor Jan 15, 2025
455b08d
Add missing reset_statistics on activate
saikishor Jan 15, 2025
08e583b
Add get_read_statistics and get_write_statistics to the Hardware Comp…
saikishor Jan 15, 2025
baf89cc
Only update_statistics when the sample count is higher
saikishor Jan 15, 2025
138d1b9
Update read and write statistics data of hardware component info
saikishor Jan 15, 2025
dd6eb4f
return a const reference from `get_components_status` method
saikishor Jan 15, 2025
b5b46ed
Add hardware_components statistics GPL information
saikishor Jan 15, 2025
7f4ddeb
Add first version of the hardware component diagnostics
saikishor Jan 16, 2025
562d088
Add logic to the hardware components diagnostics
saikishor Jan 16, 2025
6699413
add set_reset_statistics_sample_count method to reset statistics base…
saikishor Jan 16, 2025
9daa4fa
add test on the hardware statistics
saikishor Jan 16, 2025
5a3877b
Update parameter description
saikishor Jan 16, 2025
8b52c3d
Merge branch 'master' into add/hardware_component/statistics
saikishor Jan 16, 2025
b74a55b
fix the logic for async hardware components
saikishor Jan 16, 2025
ba7275a
update write cycle statistics only if it is valid
saikishor Jan 16, 2025
8c586cf
Apply suggestions from code review
saikishor Jan 18, 2025
cdb5427
Update controller_manager/src/controller_manager_parameters.yaml
saikishor Jan 18, 2025
4a67663
Add conditioning to protect the hardware statistics
saikishor Jan 18, 2025
8a64061
Fix the logic in the hardware components periodicity and proper cycles
saikishor Jan 19, 2025
655414e
Add proper assertions for the async periodicity tests
saikishor Jan 19, 2025
9b1a4c9
Merge branch 'master' into add/hardware_component/statistics
saikishor Jan 19, 2025
1d752a2
Fix the parameters_context for the controllers diagnostics
saikishor Jan 19, 2025
27eea3e
Use parameters_context.yaml for common documentation of the parameters
saikishor Jan 19, 2025
a59873f
fix the logic back for failing tests
saikishor Jan 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 158 additions & 16 deletions controller_manager/src/controller_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3158,7 +3158,7 @@ void ControllerManager::controller_activity_diagnostic_callback(
diagnostic_updater::DiagnosticStatusWrapper & stat)
{
bool atleast_one_hw_active = false;
const auto hw_components_info = resource_manager_->get_components_status();
const auto & hw_components_info = resource_manager_->get_components_status();
for (const auto & [component_name, component_info] : hw_components_info)
{
if (component_info.state.id() == lifecycle_msgs::msg::State::PRIMARY_STATE_ACTIVE)
Expand Down Expand Up @@ -3311,12 +3311,21 @@ void ControllerManager::controller_activity_diagnostic_callback(
void ControllerManager::hardware_components_diagnostic_callback(
diagnostic_updater::DiagnosticStatusWrapper & stat)
{
if (!is_resource_manager_initialized())
{
stat.summary(
diagnostic_msgs::msg::DiagnosticStatus::ERROR, "Resource manager is not yet initialized!");
return;
}

bool all_active = true;
bool atleast_one_hw_active = false;
const auto hw_components_info = resource_manager_->get_components_status();
const std::string read_cycle_suffix = ".read_cycle";
const std::string write_cycle_suffix = ".write_cycle";
const std::string state_suffix = ".state";
const auto & hw_components_info = resource_manager_->get_components_status();
for (const auto & [component_name, component_info] : hw_components_info)
{
stat.add(component_name, component_info.state.label());
if (component_info.state.id() != lifecycle_msgs::msg::State::PRIMARY_STATE_ACTIVE)
{
all_active = false;
Expand All @@ -3326,31 +3335,164 @@ void ControllerManager::hardware_components_diagnostic_callback(
atleast_one_hw_active = true;
}
}
if (!is_resource_manager_initialized())
if (hw_components_info.empty())
{
stat.summary(
diagnostic_msgs::msg::DiagnosticStatus::ERROR, "Resource manager is not yet initialized!");
diagnostic_msgs::msg::DiagnosticStatus::ERROR, "No hardware components are loaded!");
return;
}
else if (hw_components_info.empty())
else if (!atleast_one_hw_active)
{
stat.summary(
diagnostic_msgs::msg::DiagnosticStatus::ERROR, "No hardware components are loaded!");
diagnostic_msgs::msg::DiagnosticStatus::WARN, "No hardware components are currently active");
return;
}
else

stat.summary(
diagnostic_msgs::msg::DiagnosticStatus::OK,
all_active ? "All hardware components are active" : "Not all hardware components are active");

if (cm_param_listener_->is_old(*params_))
{
if (!atleast_one_hw_active)
*params_ = cm_param_listener_->get_params();
}

auto make_stats_string =
[](const auto & statistics_data, const std::string & measurement_unit) -> std::string
{
std::ostringstream oss;
oss << std::fixed << std::setprecision(2);
oss << "Avg: " << statistics_data.average << " [" << statistics_data.min << " - "
<< statistics_data.max << "] " << measurement_unit
<< ", StdDev: " << statistics_data.standard_deviation;
return oss.str();
};

// Variable to define the overall status of the controller diagnostics
auto level = diagnostic_msgs::msg::DiagnosticStatus::OK;

std::vector<std::string> high_exec_time_hw;
std::vector<std::string> bad_periodicity_async_hw;

for (const auto & [component_name, component_info] : hw_components_info)
{
stat.add(component_name + state_suffix, component_info.state.label());
if (component_info.state.id() != lifecycle_msgs::msg::State::PRIMARY_STATE_ACTIVE)
{
stat.summary(
diagnostic_msgs::msg::DiagnosticStatus::WARN,
"No hardware components are currently active");
all_active = false;
}
else
{
stat.summary(
diagnostic_msgs::msg::DiagnosticStatus::OK, all_active
? "All hardware components are active"
: "Not all hardware components are active");
atleast_one_hw_active = true;
}
if (component_info.state.id() == lifecycle_msgs::msg::State::PRIMARY_STATE_ACTIVE)
{
auto update_stats =
[&bad_periodicity_async_hw, &high_exec_time_hw, &stat, &make_stats_string](
const std::string & comp_name, const auto & statistics,
const std::string & statistics_type_suffix, auto & diag_level, const auto & comp_info,
const auto & params)
{
const bool is_async = comp_info.is_async;
const std::string periodicity_suffix = ".periodicity";
const std::string exec_time_suffix = ".execution_time";
const auto periodicity_stats = statistics->periodicity.get_statistics();
const auto exec_time_stats = statistics->execution_time.get_statistics();
stat.add(
comp_name + statistics_type_suffix + exec_time_suffix,
make_stats_string(exec_time_stats, "us"));
if (is_async)
{
stat.add(
comp_name + statistics_type_suffix + periodicity_suffix,
make_stats_string(periodicity_stats, "Hz") +
" -> Desired : " + std::to_string(comp_info.rw_rate) + " Hz");
const double periodicity_error =
std::abs(periodicity_stats.average - static_cast<double>(comp_info.rw_rate));
if (
periodicity_error >
params->diagnostics.threshold.hardware_components.periodicity.mean_error.error ||
periodicity_stats.standard_deviation > params->diagnostics.threshold.hardware_components
.periodicity.standard_deviation.error)
{
diag_level = diagnostic_msgs::msg::DiagnosticStatus::ERROR;
add_element_to_list(bad_periodicity_async_hw, comp_name);
}
else if (
periodicity_error >
params->diagnostics.threshold.hardware_components.periodicity.mean_error.warn ||
periodicity_stats.standard_deviation >
params->diagnostics.threshold.hardware_components.periodicity.standard_deviation.warn)
{
if (diag_level != diagnostic_msgs::msg::DiagnosticStatus::ERROR)
{
diag_level = diagnostic_msgs::msg::DiagnosticStatus::WARN;
}
add_element_to_list(bad_periodicity_async_hw, comp_name);
}
}
const double max_exp_exec_time =
is_async ? 1.e6 / static_cast<double>(comp_info.rw_rate) : 0.0;
if (
(exec_time_stats.average - max_exp_exec_time) >
params->diagnostics.threshold.hardware_components.execution_time.mean_error.error ||
exec_time_stats.standard_deviation > params->diagnostics.threshold.hardware_components
.execution_time.standard_deviation.error)
{
diag_level = diagnostic_msgs::msg::DiagnosticStatus::ERROR;
high_exec_time_hw.push_back(comp_name);
}
else if (
(exec_time_stats.average - max_exp_exec_time) >
params->diagnostics.threshold.hardware_components.execution_time.mean_error.warn ||
exec_time_stats.standard_deviation > params->diagnostics.threshold.hardware_components
.execution_time.standard_deviation.warn)
{
if (diag_level != diagnostic_msgs::msg::DiagnosticStatus::ERROR)
{
diag_level = diagnostic_msgs::msg::DiagnosticStatus::WARN;
}
high_exec_time_hw.push_back(comp_name);
}
};

// For components : {actuator, sensor and system}
update_stats(
component_name, component_info.read_statistics, read_cycle_suffix, level, component_info,
params_);
// For components : {actuator and system}
if (component_info.write_statistics)
{
update_stats(
component_name, component_info.write_statistics, write_cycle_suffix, level,
component_info, params_);
}
}
}

if (!high_exec_time_hw.empty())
{
std::string high_exec_time_hw_string;
for (const auto & hw_comp : high_exec_time_hw)
{
high_exec_time_hw_string.append(hw_comp);
high_exec_time_hw_string.append(" ");
}
stat.mergeSummary(
level,
"\nHardware Components with high execution time : [ " + high_exec_time_hw_string + "]");
}
if (!bad_periodicity_async_hw.empty())
{
std::string bad_periodicity_async_hw_string;
for (const auto & hw_comp : bad_periodicity_async_hw)
{
bad_periodicity_async_hw_string.append(hw_comp);
bad_periodicity_async_hw_string.append(" ");
}
stat.mergeSummary(
level,
"\nHardware Components with bad periodicity : [ " + bad_periodicity_async_hw_string + "]");
}
}

Expand Down
71 changes: 71 additions & 0 deletions controller_manager/src/controller_manager_parameters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,74 @@ controller_manager:
gt<>: 0.0,
}
}
hardware_components:
periodicity:
mean_error:
warn: {
type: double,
default_value: 5.0,
description: "The warning threshold for the mean error of the hardware component's read/write loop. If the mean error exceeds this threshold, a warning diagnostic will be published. This diagnostics will be published only for the asynchronous hardware component, because any affect to the synchronous hardware components will be reflected directly in the controller manager's periodicity.",
validation: {
gt<>: 0.0,
}
}
error: {
type: double,
default_value: 10.0,
description: "The error threshold for the mean error of the hardware component's read/write loop. If the mean error exceeds this threshold, an error diagnostic will be published. This diagnostics will be published only for the asynchronous hardware component, because any affect to the synchronous hardware components will be reflected directly in the controller manager's periodicity.",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we move some parts of the description to the parameters_context.yaml file, because they are repeating a lot? but I see that it is not sorted correctly in the rst, I'll have a look to fix that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with you. Let's get this in as it is consistent with others for now, and let's fix it in a different PR?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the sorting is fixed now with ros-controls/control.ros.org#384

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome. I fixed the paramters_context.yaml as you suggested :)

validation: {
gt<>: 0.0,
}
}
standard_deviation:
warn: {
type: double,
default_value: 5.0,
description: "The warning threshold for the standard deviation of the hardware component's read/write loop. If the standard deviation exceeds this threshold, a warning diagnostic will be published. This diagnostics will be published only for the asynchronous hardware component, because any affect to the synchronous hardware components will be reflected directly in the controller manager's periodicity.",
validation: {
gt<>: 0.0,
}
}
error: {
type: double,
default_value: 10.0,
description: "The error threshold for the standard deviation of the hardware component's read/write loop. If the standard deviation exceeds this threshold, an error diagnostic will be published. This diagnostics will be published only for the asynchronous hardware component, because any affect to the synchronous hardware components will be reflected directly in the controller manager's periodicity.",
validation: {
gt<>: 0.0,
}
}
execution_time:
mean_error:
warn: {
type: double,
default_value: 1000.0,
description: "The warning threshold for the mean error of the hardware component's read/write cycle execution time in microseconds. If the mean error exceeds this threshold, a warning diagnostic will be published. The ``mean_error`` for a synchronous hardware component will be computed against zero, as it should be as low as possible. However, the ``mean_error`` for an asynchronous hardware component will be computed against its desired read/write period, as the hardware component can take a maximum of the desired period cycle to execute its cycle",
validation: {
gt<>: 0.0,
}
}
error: {
type: double,
default_value: 2000.0,
description: "The error threshold for the mean error of the hardware component's read/write cycle execution time in microseconds. If the mean error exceeds this threshold, a error diagnostic will be published. The ``mean_error`` for a synchronous hardware component will be computed against zero, as it should be as low as possible. However, the ``mean_error`` for an asynchronous hardware component will be computed against its desired read/write period, as the hardware component can take a maximum of the desired period cycle to execute it's cycle",
saikishor marked this conversation as resolved.
Show resolved Hide resolved
validation: {
gt<>: 0.0,
}
}
standard_deviation:
warn: {
type: double,
default_value: 100.0,
description: "The warning threshold for the standard deviation of the hardware component's read/write cycle execution time. If the standard deviation exceeds this threshold, a warning diagnostic will be published.",
validation: {
gt<>: 0.0,
}
}
error: {
type: double,
default_value: 200.0,
description: "The error threshold for the standard deviation of the hardware component's update cycle execution time. If the standard deviation exceeds this threshold, an error diagnostic will be published.",
validation: {
gt<>: 0.0,
}
}
8 changes: 8 additions & 0 deletions hardware_interface/include/hardware_interface/actuator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "hardware_interface/handle.hpp"
#include "hardware_interface/hardware_info.hpp"
#include "hardware_interface/types/hardware_interface_return_values.hpp"
#include "hardware_interface/types/statistics_types.hpp"
#include "rclcpp/duration.hpp"
#include "rclcpp/logger.hpp"
#include "rclcpp/node_interfaces/node_clock_interface.hpp"
Expand Down Expand Up @@ -82,6 +83,10 @@ class Actuator final

const rclcpp::Time & get_last_write_time() const;

const HardwareComponentStatisticsCollector & get_read_statistics() const;

const HardwareComponentStatisticsCollector & get_write_statistics() const;

return_type read(const rclcpp::Time & time, const rclcpp::Duration & period);

return_type write(const rclcpp::Time & time, const rclcpp::Duration & period);
Expand All @@ -95,6 +100,9 @@ class Actuator final
rclcpp::Time last_read_cycle_time_;
// Last write cycle time
rclcpp::Time last_write_cycle_time_;
// Component statistics
HardwareComponentStatisticsCollector read_statistics_;
HardwareComponentStatisticsCollector write_statistics_;
};

} // namespace hardware_interface
Expand Down
Loading
Loading