Skip to content

Commit 4b7481b

Browse files
authored
Merge pull request #1327 from stackhpc/smartmon-py
INFRA-388 Converting smartmon into python and adding mock tests
2 parents 50e6f93 + 5142d79 commit 4b7481b

File tree

7 files changed

+762
-211
lines changed

7 files changed

+762
-211
lines changed
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env python3
2+
import json
3+
import re
4+
from pySMART import DeviceList
5+
6+
# Allow-list of snake_case attribute names. attrs_to_dict() keeps an
# attribute of disk.if_attributes only when its snake_case name is in this set.
SMARTMON_ATTRS = {
    # NOTE(review): these look like ATA/SATA SMART attribute names — confirm.
    "airflow_temperature_cel", "command_timeout", "current_pending_sector",
    "end_to_end_error", "erase_fail_count", "g_sense_error_rate",
    "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib",
    "host_writes_32mib", "host_writes_mib", "load_cycle_count",
    "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable",
    "power_cycle_count", "power_on_hours", "program_fail_cnt_total",
    "program_fail_count", "raw_read_error_rate", "reallocated_event_count",
    "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block",
    "sata_downshift_count", "seek_error_rate", "spin_retry_count",
    "spin_up_time", "start_stop_count", "temperature_case",
    "temperature_celsius", "temperature_internal", "total_lbas_read",
    "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot", "wear_leveling_count", "workld_host_reads_perc",
    "workld_media_wear_indic", "workload_minutes",
    # NOTE(review): these look like NVMe health-log field names — confirm.
    "critical_warning", "temperature", "available_spare",
    "available_spare_threshold", "percentage_used", "data_units_read",
    "data_units_written", "host_reads", "host_writes",
    "controller_busy_time", "power_cycles", "unsafe_shutdowns",
    "media_errors", "num_err_log_entries", "warning_temp_time",
    "critical_comp_time",
}
65+
66+
# Allow-list of device-level attribute names; attrs_to_dict() copies only
# these from each disk object into the "device_info" part of the fixture.
DISK_INFO = {
    "name", "interface", "vendor", "family", "model",
    "serial", "firmware", "smart_capable", "smart_enabled", "assessment",
}
78+
79+
def camel_to_snake(name):
    """
    Return `name` converted from CamelCase to snake_case.

    An underscore is inserted in front of every uppercase ASCII letter that
    is not the first character, then the whole string is lower-cased. Each
    capital is treated separately, so runs of capitals are split letter by
    letter.

    Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    # Zero-width match: any position (except the start) followed by A-Z.
    word_boundary = re.compile(r'(?<!^)(?=[A-Z])')
    underscored = word_boundary.sub('_', name)
    return underscored.lower()
86+
87+
def attrs_to_dict(obj, allowed_keys):
    """
    Collect the allow-listed public data attributes of `obj` into a dict.

    An attribute is kept when its name does not start with an underscore,
    reading it does not raise, its value is neither None nor callable, and
    its snake_case name is in `allowed_keys`. Keys of the result are the
    original (unconverted) attribute names.
    """
    selected = {}
    public_names = (attr for attr in dir(obj) if not attr.startswith('_'))
    for attr in public_names:
        try:
            current = getattr(obj, attr)
        except Exception:
            # Reading the attribute failed; leave it out of the result.
            continue
        if current is None or callable(current):
            continue
        if camel_to_snake(attr) in allowed_keys:
            selected[attr] = current
    return selected
107+
108+
# For every device reported by pySMART, print a header line followed by a
# JSON document holding the allow-listed device info and interface attributes.
for disk in DeviceList().devices:
    disk_info = attrs_to_dict(disk, DISK_INFO)
    if_stats = attrs_to_dict(disk.if_attributes, SMARTMON_ATTRS)
    fixtures = {
        "device_info": disk_info,
        "if_attributes": if_stats,
    }

    print(f'Disk: {disk.name}: \n')
    # default=str: any value json cannot serialise natively is stringified.
    print(json.dumps(fixtures, indent=2, default=str))
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
#!/usr/bin/env python3
2+
3+
import subprocess
4+
import json
5+
import re
6+
import datetime
7+
import os
8+
9+
from prometheus_client import CollectorRegistry, Gauge, write_to_textfile
10+
from pySMART import DeviceList
11+
12+
# Absolute path of the smartctl binary invoked by main().
SMARTCTL_PATH = "/usr/sbin/smartctl"

# Allow-list of snake_case attribute names. parse_if_attributes() emits a
# metric only for attributes whose snake_case name is in this set.
SMARTMON_ATTRS = {
    # NOTE(review): these look like ATA/SATA SMART attribute names — confirm.
    "airflow_temperature_cel", "command_timeout", "current_pending_sector",
    "end_to_end_error", "erase_fail_count", "g_sense_error_rate",
    "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib",
    "host_writes_32mib", "host_writes_mib", "load_cycle_count",
    "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable",
    "power_cycle_count", "power_on_hours", "program_fail_cnt_total",
    "program_fail_count", "raw_read_error_rate", "reallocated_event_count",
    "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block",
    "sata_downshift_count", "seek_error_rate", "spin_retry_count",
    "spin_up_time", "start_stop_count", "temperature_case",
    "temperature_celsius", "temperature_internal", "total_lbas_read",
    "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot", "wear_leveling_count", "workld_host_reads_perc",
    "workld_media_wear_indic", "workload_minutes",
    # NOTE(review): these look like NVMe health-log field names — confirm.
    "critical_warning", "temperature", "available_spare",
    "available_spare_threshold", "percentage_used", "data_units_read",
    "data_units_written", "host_reads", "host_writes",
    "controller_busy_time", "power_cycles", "unsafe_shutdowns",
    "media_errors", "num_err_log_entries", "warning_temp_time",
    "critical_comp_time",
}
73+
74+
def run_command(command, parse_json=False):
    """
    Run `command` as a subprocess and return what it wrote to stdout.

    The exit status is not inspected and stderr is captured but discarded.

    Args:
        command (List[str]): Program and arguments, passed to subprocess.run.
        parse_json (bool): When true, decode stdout as JSON and return the
            resulting object instead of text.

    Returns:
        The decoded JSON value if `parse_json` is true, otherwise stdout as a
        string with leading/trailing whitespace removed.

    Raises:
        json.JSONDecodeError: If `parse_json` is true and stdout is not JSON.
    """
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    captured = completed.stdout
    if not parse_json:
        return captured.strip()
    return json.loads(captured)
82+
83+
def camel_to_snake(name):
    """
    Return `name` converted from CamelCase to snake_case.

    An underscore is inserted in front of every uppercase ASCII letter that
    is not the first character, then the whole string is lower-cased. Each
    capital is treated separately, so runs of capitals are split letter by
    letter.

    Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    # Zero-width match: any position (except the start) followed by A-Z.
    word_boundary = re.compile(r'(?<!^)(?=[A-Z])')
    underscored = word_boundary.sub('_', name)
    return underscored.lower()
90+
91+
def parse_device_info(device):
    """
    Produce Prometheus lines describing the device's identity and SMART status.

    Metrics emitted, in order:
        - smartmon_device_info: always, value 1.0, carrying every identity
          label (sorted by label name).
        - smartmon_device_smart_available: always, 1.0 if
          device.smart_capable is truthy, else 0.0.
        - smartmon_device_smart_enabled: only when SMART is available.
        - smartmon_device_smart_healthy: only when SMART is available and
          device.assessment is non-empty; 1.0 when the assessment is "PASS"
          (case-insensitive), else 0.0.

    Identity attributes that are None or empty are rendered as empty label
    values. The `type` label is normalised once and reused for every metric,
    so a device without an interface is labelled type="" everywhere rather
    than type="None" on some samples.

    Args:
        device (Device): A pySMART Device object with attributes such as name, interface, etc.

    Returns:
        List[str]: A list of Prometheus formatted metric strings.
    """
    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()
    labels = {
        "disk": device.name,
        "type": disk_type,
        "vendor": device.vendor or "",
        "model_family": device.family or "",
        "device_model": device.model or "",
        "serial_number": serial_number,
        "firmware_version": device.firmware or "",
    }
    label_str = ",".join(f'{k}="{v}"' for k, v in sorted(labels.items()))

    # Reduced label set shared by the status metrics; must use the same
    # normalised values as `labels` so the series can be joined on them.
    metric_labels = f'disk="{device.name}",serial_number="{serial_number}",type="{disk_type}"'

    metrics = [
        f'smartmon_device_info{{{label_str}}} 1.0',
        f'smartmon_device_smart_available{{{metric_labels}}} {float(1) if device.smart_capable else float(0)}',
    ]

    if device.smart_capable:
        metrics.append(
            f'smartmon_device_smart_enabled{{{metric_labels}}} {float(1) if device.smart_enabled else float(0)}'
        )
        if device.assessment:
            is_healthy = 1 if device.assessment.upper() == "PASS" else 0
            metrics.append(
                f'smartmon_device_smart_healthy{{{metric_labels}}} {float(is_healthy)}'
            )

    return metrics
136+
137+
def parse_if_attributes(device):
    """
    Turn the numeric fields of device.if_attributes into Prometheus lines.

    Every public, non-callable attribute of device.if_attributes is
    considered. Its name is converted to snake_case; when that name is in
    SMARTMON_ATTRS and the value is an int or float, one
    `smartmon_<name>{disk=...,serial_number=...,type=...} <value>` line is
    produced, with the value rendered as a float.

    Args:
        device (Device): A pySMART Device object with attributes such as name, interface, etc.

    Returns:
        List[str]: A list of Prometheus formatted metric strings; empty when
        device.if_attributes is missing or falsy.
    """
    if_attrs = device.if_attributes
    if not if_attrs:
        return []

    disk = device.name
    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()
    labels = f'disk="{disk}",serial_number="{serial_number}",type="{disk_type}"'

    samples = []
    for attr_name in dir(if_attrs):
        # Private / special names are never exported.
        if attr_name.startswith("_"):
            continue
        val = getattr(if_attrs, attr_name, None)
        # Methods are not data.
        if callable(val):
            continue

        snake_name = camel_to_snake(attr_name)
        if snake_name not in SMARTMON_ATTRS:
            continue
        if not isinstance(val, (int, float)):
            continue
        samples.append(f"smartmon_{snake_name}{{{labels}}} {float(val)}")

    return samples
172+
173+
# Splits one 'name{labels} value' sample. The greedy `.*` makes the label body
# run to the LAST '}', so a '}' inside a label value does not cut it short.
_METRIC_LINE_RE = re.compile(r'^\s*([^\s{]+)\{(.*)\}\s*(\S+)\s*$')

# One label pair. Preferred form is key="value", where the closing quote must
# be followed by ',' or the end of the label body; this lets commas (and stray
# quotes) appear inside the value. A bare key=value up to the next comma is
# accepted as a fallback.
_LABEL_PAIR_RE = re.compile(
    r'([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(?:"(.*?)"(?=\s*(?:,|$))|([^,]*))'
)


def _split_metric_line(metric):
    """Split one metric string into (name, {label: value}, float value)."""
    match = _METRIC_LINE_RE.match(metric)
    if match is None:
        raise ValueError(f"Malformed metric line: {metric!r}")
    metric_name, label_body, raw_value = match.groups()

    labels = {}
    for pair in _LABEL_PAIR_RE.finditer(label_body):
        key, quoted, bare = pair.groups()
        labels[key] = quoted if quoted is not None else bare.strip().strip('"')
    return metric_name, labels, float(raw_value)


def write_metrics_to_textfile(metrics, output_path=None):
    """
    Write metrics to a Prometheus textfile using prometheus_client.

    Each metric string is parsed back into a name, labels and a value, and
    registered as a Gauge sample in a fresh CollectorRegistry, which is then
    written with write_to_textfile. One Gauge is created per metric name,
    using the label names of the first sample seen for that name.

    Label values are parsed quote-aware, so a value that contains a comma
    (for example a model family string) is kept intact instead of being
    truncated at the comma. Labels are bound by name rather than by position.

    Args:
        metrics (List[str]): List of metric strings in 'name{labels} value' format.
        output_path (str): Path to write the metrics file. Defaults to node_exporter textfile collector path.

    Raises:
        ValueError: If a metric string is not in 'name{labels} value' format
            or its value is not a number.
    """
    registry = CollectorRegistry()
    metric_gauges = {}
    for metric in metrics:
        metric_name, labels, value = _split_metric_line(metric)

        # Create Gauge if not already present
        gauge = metric_gauges.get(metric_name)
        if gauge is None:
            help_str = f"SMART metric {metric_name}"
            gauge = Gauge(metric_name, help_str, list(labels), registry=registry)
            metric_gauges[metric_name] = gauge

        # A gauge declared without label names must be set directly;
        # calling .labels() on it is rejected by prometheus_client.
        sample = gauge.labels(**labels) if labels else gauge
        sample.set(value)

    if output_path is None:
        output_path = '/var/lib/node_exporter/textfile_collector/smartmon.prom'
    write_to_textfile(output_path, registry)  # Write all metrics to file
209+
210+
def main(output_path=None):
    """
    Collect SMART metrics for every detected device and write them out.

    Steps:
        1. Record the smartctl version as a label on
           smartmon_smartctl_version ("unknown" if it cannot be determined).
        2. For each device in DeviceList(): record a run timestamp, then ask
           smartctl (`-n standby`) for its power mode and record
           smartmon_device_active. A device counts as inactive when the
           reported power_mode is "standby" or when the probe fails for any
           reason.
        3. For active devices only, add the device info and interface
           attribute metrics.
        4. Pass everything to write_metrics_to_textfile.

    Args:
        output_path (str): Forwarded unchanged to write_metrics_to_textfile.
    """
    version_num = "unknown"
    try:
        version_output = run_command([SMARTCTL_PATH, "--version"])
        if version_output.startswith("smartctl"):
            # Second whitespace-separated token of the first output line.
            version_num = version_output.splitlines()[0].split()[1]
    except Exception:
        version_num = "unknown"

    all_metrics = [f'smartmon_smartctl_version{{version="{version_num}"}} 1']

    for dev in DeviceList().devices:
        disk_name = dev.name
        disk_type = dev.interface or ""
        serial_number = (dev.serial or "").lower()

        now_utc = datetime.datetime.now(datetime.timezone.utc)
        run_timestamp = int(now_utc.timestamp())
        all_metrics.append(f'smartmon_smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}')

        try:
            standby_json = run_command(
                [SMARTCTL_PATH, "-n", "standby", "-d", disk_type, "-j", disk_name],
                parse_json=True,
            )
            active = 0 if standby_json.get("power_mode", "") == "standby" else 1
        except Exception:
            # Covers undecodable JSON as well as any failure to run smartctl.
            active = 0

        all_metrics.append(
            f'smartmon_device_active{{disk="{disk_name}",type="{disk_type}",serial_number="{serial_number}"}} {active}'
        )
        if active:
            all_metrics.extend(parse_device_info(dev))
            all_metrics.extend(parse_if_attributes(dev))

    write_metrics_to_textfile(all_metrics, output_path)
255+
256+
# Command-line entry point: parse --output and run the exporter once.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Export SMART metrics to Prometheus textfile format.",
    )
    parser.add_argument(
        '--output',
        type=str,
        default=None,
        help='Output path for Prometheus textfile (default: /var/lib/node_exporter/textfile_collector/smartmon.prom)',
    )
    args = parser.parse_args()
    main(args.output)

0 commit comments

Comments
 (0)