Skip to content

Commit

Permalink
feat: support HDDMonitor and update CPUMonitor with sensors
Browse files Browse the repository at this point in the history
  • Loading branch information
kei1107 committed Jan 16, 2023
1 parent 11b144a commit 72e10cc
Show file tree
Hide file tree
Showing 10 changed files with 438 additions and 137 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ install(DIRECTORY config scripts launch

install(PROGRAMS
scripts/cpu_monitor_node.py
scripts/hdd_monitor_node.py
scripts/mem_monitor_node.py
scripts/net_monitor_node.py
scripts/ntp_monitor_node.py
Expand Down
16 changes: 8 additions & 8 deletions config/system_monitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ cpu_monitor:
cpu_load5_warn: 0.8
cpu_temp_warn: 85.0
cpu_temp_error: 90.0
# hdd_monitor:
# ros__parameters:
# no_hdd_temp: true
# no_hdd_temp_warn: false
# hdd_level_warn: 0.95
# hdd_level_error: 0.99
# hdd_temp_warn: 55.0
# hdd_temp_error: 70.0
hdd_monitor:
ros__parameters:
no_hw_temp: true
no_hw_temp_warn: false
hdd_level_warn: 0.95
hdd_level_error: 0.99
hdd_temp_warn: 55.0
hdd_temp_error: 70.0
mem_monitor:
ros__parameters:
mem_level_warn: 0.95
Expand Down
12 changes: 12 additions & 0 deletions launch/system_monitor.launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def generate_launch_description():
description="system monitor config file path")
declare_enable_cpu_monitor = DeclareLaunchArgument(
name="enable_cpu_monitor", default_value="true", description="enable cpu_monitor if true")
declare_enable_hdd_monitor = DeclareLaunchArgument(
name="enable_hdd_monitor", default_value="true", description="enable hdd_monitor if true")
declare_enable_mem_monitor = DeclareLaunchArgument(
name="enable_mem_monitor", default_value="true", description="enable mem_monitor if true")
declare_enable_net_monitor = DeclareLaunchArgument(
Expand All @@ -66,6 +68,14 @@ def generate_launch_description():
emulate_tty=True,
condition=IfCondition(LaunchConfiguration("enable_cpu_monitor"))
)
hdd_monitor = Node(
package="ros2_system_monitor",
executable="hdd_monitor_node.py",
parameters=[LaunchConfiguration("system_monitor_config_file")],
respawn=True,
emulate_tty=True,
condition=IfCondition(LaunchConfiguration("enable_hdd_monitor"))
)
mem_monitor = Node(
package="ros2_system_monitor",
executable="mem_monitor_node.py",
Expand Down Expand Up @@ -97,12 +107,14 @@ def generate_launch_description():
# declare the launch options
ld.add_action(declare_config_file)
ld.add_action(declare_enable_cpu_monitor)
ld.add_action(declare_enable_hdd_monitor)
ld.add_action(declare_enable_mem_monitor)
ld.add_action(declare_enable_net_monitor)
ld.add_action(declare_enable_ntp_monitor)

# add the actions
ld.add_action(cpu_monitor)
ld.add_action(hdd_monitor)
ld.add_action(mem_monitor)
ld.add_action(net_monitor)
ld.add_action(ntp_monitor)
Expand Down
4 changes: 1 addition & 3 deletions package.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@
<exec_depend>rclpy</exec_depend>
<exec_depend>std_msgs</exec_depend>
<exec_depend>sysstat</exec_depend>
<!-- <exec_depend>hddtemp</exec_depend> -->
<!-- <exec_depend>smartmontools</exec_depend> -->
<!-- <exec_depend>lm-sensors</exec_depend> -->
<exec_depend>lm-sensors</exec_depend>

<test_depend>ament_lint_auto</test_depend>
<!-- <test_depend>ament_cmake_copyright</test_depend> -->
Expand Down
4 changes: 3 additions & 1 deletion ros2_system_monitor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@
#################################################################################

from .cpu_monitor import CPUMonitor
from .hdd_monitor import HDDMonitor
from .mem_monitor import MemMonitor
from .net_monitor import NetMonitor
from .ntp_monitor import NtpMonitor

__all__ = ['CPUMonitor', 'MemMonitor', 'NetMonitor', 'NtpMonitor']
__all__ = ['CPUMonitor', 'HDDMonitor',
'MemMonitor', 'NetMonitor', 'NtpMonitor']
92 changes: 29 additions & 63 deletions ros2_system_monitor/cpu_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#################################################################################

import multiprocessing
import re
import subprocess
import threading
import traceback
Expand Down Expand Up @@ -69,9 +70,6 @@ def __init__(self, hostname, diag_hostname):

self._num_cores = multiprocessing.cpu_count()

# Get temp_input files
self._temp_vals = self.get_core_temp_names()

# updater
self.updater = diagnostic_updater.Updater(self)
self.updater.setHardwareID(hostname)
Expand Down Expand Up @@ -114,39 +112,39 @@ def cancel_timers(self):
if self._usage_timer:
self._usage_timer.cancel()

def check_core_temps(self, sys_temp_strings):
def check_core_temps(self):
"""
Check CPU core temps.
Use 'find /sys -name temp1_input' to find cores
Read from every core, divide by 1000
Use 'sensors' to find cores
Read from every core
"""
diag_vals: list[KeyValue] = []
diag_msgs: list[str] = []
diag_level = DiagnosticStatus.OK

for index, temp_str in enumerate(sys_temp_strings):
if len(temp_str) < 5:
continue

cmd = f"cat {temp_str}"
p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate()
stdout = stdout.decode()
stderr = stderr.decode()
retcode = p.returncode

if retcode != 0:
diag_level = DiagnosticStatus.ERROR
diag_msgs = ['Core Temperature Error']
diag_vals = [KeyValue(key='Core Temperature Error', value=stderr),
KeyValue(key='Output', value=stdout)]
return diag_vals, diag_msgs, diag_level

tmp = stdout.strip()
if tmp.isnumeric():
temp = float(tmp) / 1000
p = subprocess.Popen("sensors", stdout=subprocess.PIPE,
stderr=subprocess.PIPE, shell=True)
stdout, stderr = p.communicate()
stdout = stdout.decode()
stderr = stderr.decode()
retcode = p.returncode

if retcode != 0:
diag_level = DiagnosticStatus.ERROR
diag_msgs = ['Core Temperature Error']
diag_vals = [KeyValue(key='Core Temperature Error', value=stderr),
KeyValue(key='Output', value=stdout)]
return diag_vals, diag_msgs, diag_level

rows = stdout.split('\n')
r = re.compile("Core [0-9]{1,}:.*")
cores_info = list(filter(r.match, rows))
for index, core_info in enumerate(cores_info):
data = core_info.split()
tmp = data[2][:-2]
try:
temp = float(tmp)
diag_vals.append(
KeyValue(key=f"Core {index} Temperature", value=str(temp)+"DegC"))

Expand All @@ -156,7 +154,7 @@ def check_core_temps(self, sys_temp_strings):
elif temp >= self._cpu_temp_error:
diag_level = max(diag_level, DiagnosticStatus.ERROR)
diag_msgs.append('Hot')
else:
except ValueError:
# Error if not numeric value
diag_level = max(diag_level, DiagnosticStatus.ERROR)
diag_vals.append(
Expand Down Expand Up @@ -349,43 +347,12 @@ def check_mpstat(self):
self._has_error_core_count = True
return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
except Exception as e:
print(traceback.print_exc())
self.get_logger().error(traceback.print_exc())
mp_level = DiagnosticStatus.ERROR
vals.append(KeyValue(key='mpstat Exception', value=str(e)))

return mp_level, MpstatLoadDict[mp_level], vals

def get_core_temp_names(self):
    """
    Get core temp names.

    Runs ``find /sys/devices -name temp1_input`` and returns the paths it
    prints; each returned name can be read like a file.

    Returns:
        list[str]: stripped, non-empty paths reported by ``find``. An empty
        list is returned (and the problem is logged through the node logger)
        when ``find`` exits with a non-zero status or an exception is raised.
    """
    try:
        # Pass an argument list instead of a shell string: the command is
        # fixed, so no shell needs to be spawned to parse it.
        proc = subprocess.run(
            ['find', '/sys/devices', '-name', 'temp1_input'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        stdout = proc.stdout.decode()
        stderr = proc.stderr.decode()

        if proc.returncode != 0:
            self.get_logger().error(
                f"Error find core temp locations: {stderr}")
            return []

        # Skip blank lines (e.g. the empty entry after the trailing newline)
        # so callers only ever receive usable paths.
        return [ln.strip() for ln in stdout.splitlines() if ln.strip()]
    except Exception:
        # Best effort: a failure to enumerate sensors must not take the
        # monitor down, so log the traceback and report "no sensors".
        self.get_logger().error(
            f"Exception finding temp vals: {traceback.format_exc()}")
        return []

def check_temps(self):
if not rclpy.ok():
with self._mutex:
Expand All @@ -398,8 +365,7 @@ def check_temps(self):
diag_level = DiagnosticStatus.OK

if self._check_core_temps:
core_vals, core_msgs, core_level = self.check_core_temps(
self._temp_vals)
core_vals, core_msgs, core_level = self.check_core_temps()
diag_vals.extend(core_vals)
diag_msgs.extend(core_msgs)
diag_level = max(diag_level, core_level)
Expand Down
Loading

0 comments on commit 72e10cc

Please sign in to comment.