|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import subprocess |
| 4 | +import json |
| 5 | +import re |
| 6 | +import datetime |
| 7 | +import os |
| 8 | + |
| 9 | +from prometheus_client import CollectorRegistry, Gauge, write_to_textfile |
| 10 | +from pySMART import DeviceList |
| 11 | + |
# Absolute path of the smartctl binary that this script invokes directly.
SMARTCTL_PATH = "/usr/sbin/smartctl"

# Allow-list of snake_case attribute names. parse_if_attributes() only emits a
# smartmon_<name> metric for an attribute whose converted name appears here.
SMARTMON_ATTRS = {
    "airflow_temperature_cel",
    "command_timeout",
    "current_pending_sector",
    "end_to_end_error",
    "erase_fail_count",
    "g_sense_error_rate",
    "hardware_ecc_recovered",
    "host_reads_32mib",
    "host_reads_mib",
    "host_writes_32mib",
    "host_writes_mib",
    "load_cycle_count",
    "media_wearout_indicator",
    "nand_writes_1gib",
    "offline_uncorrectable",
    "power_cycle_count",
    "power_on_hours",
    "program_fail_cnt_total",
    "program_fail_count",
    "raw_read_error_rate",
    "reallocated_event_count",
    "reallocated_sector_ct",
    "reported_uncorrect",
    "runtime_bad_block",
    "sata_downshift_count",
    "seek_error_rate",
    "spin_retry_count",
    "spin_up_time",
    "start_stop_count",
    "temperature_case",
    "temperature_celsius",
    "temperature_internal",
    "total_lbas_read",
    "total_lbas_written",
    "udma_crc_error_count",
    "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot",
    "wear_leveling_count",
    "workld_host_reads_perc",
    "workld_media_wear_indic",
    "workload_minutes",
    # NOTE(review): the names from here on look like NVMe health-log fields
    # rather than ATA attribute names -- confirm against pySMART.
    "critical_warning",
    "temperature",
    "available_spare",
    "available_spare_threshold",
    "percentage_used",
    "data_units_read",
    "data_units_written",
    "host_reads",
    "host_writes",
    "controller_busy_time",
    "power_cycles",
    "unsafe_shutdowns",
    "media_errors",
    "num_err_log_entries",
    "warning_temp_time",
    "critical_comp_time",
}
| 73 | + |
def run_command(command, parse_json=False):
    """Run *command* as a subprocess and return what it wrote to stdout.

    The exit status is not inspected; stderr is captured and discarded.

    Args:
        command (List[str]): Program and arguments, passed to subprocess.run.
        parse_json (bool): When true, decode stdout as JSON and return the
            decoded object instead of text.

    Returns:
        The decoded JSON value if ``parse_json`` is set, otherwise stdout
        with leading/trailing whitespace removed.

    Raises:
        json.JSONDecodeError: ``parse_json`` is set and stdout is not JSON.
    """
    completed = subprocess.run(command, capture_output=True, text=True)
    output = completed.stdout
    if not parse_json:
        return output.strip()
    return json.loads(output)
| 82 | + |
# Zero-width position in front of every ASCII capital except one at index 0.
_CAMEL_BOUNDARY = re.compile(r"(?<!^)(?=[A-Z])")


def camel_to_snake(name):
    """Convert a CamelCase (or camelCase) identifier to snake_case.

    An underscore is inserted before every ASCII uppercase letter that is not
    the first character, then the whole string is lower-cased. Consecutive
    capitals are therefore split individually ("HTTPServer" becomes
    "h_t_t_p_server").

    Reference: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case
    """
    with_separators = _CAMEL_BOUNDARY.sub("_", name)
    return with_separators.lower()
| 90 | + |
def parse_device_info(device):
    """Build the identity and SMART-status metric lines for one device.

    Emits, in order:
      - smartmon_device_info (always, value 1.0, all identity labels)
      - smartmon_device_smart_available (always)
      - smartmon_device_smart_enabled (only when the device is SMART capable)
      - smartmon_device_smart_healthy (only when SMART capable and an
        assessment is present; 1.0 for "PASS", case-insensitive, else 0.0)

    Missing (falsy) identity fields are rendered as empty label values. The
    ``type`` label is normalised the same way in every line, so a device with
    no detected interface yields ``type=""`` everywhere rather than
    ``type="None"`` in the status metrics only.

    Args:
        device (Device): A pySMART Device-like object exposing name,
            interface, vendor, family, model, serial, firmware,
            smart_capable, smart_enabled and assessment.

    Returns:
        List[str]: Metric lines in 'name{labels} value' format.
    """
    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()

    info_labels = {
        "disk": device.name,
        "type": disk_type,
        "vendor": device.vendor or "",
        "model_family": device.family or "",
        "device_model": device.model or "",
        "serial_number": serial_number,
        "firmware_version": device.firmware or "",
    }
    # Sorted so the label order is stable from run to run.
    info_label_str = ",".join(
        f'{key}="{value}"' for key, value in sorted(info_labels.items())
    )

    # Short label set shared by the per-device status metrics.
    metric_labels = (
        f'disk="{device.name}",serial_number="{serial_number}",type="{disk_type}"'
    )

    metrics = [
        f"smartmon_device_info{{{info_label_str}}} 1.0",
        f"smartmon_device_smart_available{{{metric_labels}}} "
        f"{float(bool(device.smart_capable))}",
    ]

    if device.smart_capable:
        metrics.append(
            f"smartmon_device_smart_enabled{{{metric_labels}}} "
            f"{float(bool(device.smart_enabled))}"
        )
        if device.assessment:
            is_healthy = device.assessment.upper() == "PASS"
            metrics.append(
                f"smartmon_device_smart_healthy{{{metric_labels}}} {float(is_healthy)}"
            )

    return metrics
| 136 | + |
def parse_if_attributes(device):
    """Turn the numeric fields of ``device.if_attributes`` into metric lines.

    Every public, non-callable attribute found via ``dir()`` is considered.
    Its name is converted with camel_to_snake(); a metric is emitted only when
    the converted name is listed in SMARTMON_ATTRS and the value is an int or
    float.

    Args:
        device (Device): A pySMART Device-like object exposing name,
            interface, serial and if_attributes.

    Returns:
        List[str]: Metric lines of the form
        'smartmon_<name>{disk=...,serial_number=...,type=...} <float>';
        empty when the device has no interface attributes.
    """
    attributes = device.if_attributes
    if not attributes:
        return []

    name = device.name
    interface = device.interface or ""
    serial = (device.serial or "").lower()
    label_str = f'disk="{name}",serial_number="{serial}",type="{interface}"'

    collected = []
    for raw_name in dir(attributes):
        # Names with a leading underscore are private or dunder members.
        if raw_name.startswith("_"):
            continue

        value = getattr(attributes, raw_name, None)
        # Bound methods and other callables are not data fields.
        if callable(value):
            continue

        metric_key = camel_to_snake(raw_name)
        if metric_key not in SMARTMON_ATTRS or not isinstance(value, (int, float)):
            continue

        collected.append(f"smartmon_{metric_key}{{{label_str}}} {float(value)}")

    return collected
| 172 | + |
# One name="value" pair inside a label block. The value is matched lazily up
# to a closing quote that is followed by the next pair or the end of the
# block, so commas (and most stray quotes) inside a value are kept intact.
_LABEL_PAIR_RE = re.compile(
    r'([a-zA-Z_][a-zA-Z0-9_]*)="(.*?)"(?=\s*,\s*[a-zA-Z_][a-zA-Z0-9_]*="|\s*$)'
)


def _split_metric_line(metric):
    """Split a 'name{labels} value' line into (name, labels dict, float value)."""
    head, brace, rest = metric.partition("{")
    if not brace:
        # No label block at all: 'name value'.
        name, _, value = metric.strip().rpartition(" ")
        if not name:
            raise ValueError(f"Malformed metric line: {metric!r}")
        return name.strip(), {}, float(value)

    # Split on the LAST '}' so a brace inside a label value cannot cut the
    # label block short; only the numeric value follows the closing brace.
    label_str, closing, value = rest.rpartition("}")
    if not closing:
        raise ValueError(f"Malformed metric line: {metric!r}")
    labels = dict(_LABEL_PAIR_RE.findall(label_str))
    return head.strip(), labels, float(value)


def write_metrics_to_textfile(metrics, output_path=None):
    """Write metric lines to a Prometheus textfile using prometheus_client.

    Each line is parsed back into a name, label set and value. One Gauge per
    metric name is registered in a private CollectorRegistry, using the label
    names of the first line seen for that name. Label values are assigned by
    name, so a later line with the same labels in a different order is still
    recorded correctly. Lines without a label block are accepted.

    Args:
        metrics (List[str]): Metric strings in 'name{labels} value' format.
        output_path (str): Path to write the metrics file. Defaults to
            /var/lib/node_exporter/textfile_collector/smartmon.prom.

    Raises:
        ValueError: A line is malformed, its value is not numeric, or its
            label names differ from those first registered for that metric.
    """
    registry = CollectorRegistry()
    gauges = {}

    for metric in metrics:
        metric_name, labels, value = _split_metric_line(metric)

        gauge = gauges.get(metric_name)
        if gauge is None:
            gauge = Gauge(
                metric_name,
                f"SMART metric {metric_name}",
                list(labels),
                registry=registry,
            )
            gauges[metric_name] = gauge

        # A gauge created without label names must be set directly.
        sample = gauge.labels(**labels) if labels else gauge
        sample.set(value)

    if output_path is None:
        output_path = '/var/lib/node_exporter/textfile_collector/smartmon.prom'
    write_to_textfile(output_path, registry)
| 209 | + |
def main(output_path=None):
    """Collect SMART metrics for every detected device and write them out.

    Steps:
      1. Record the smartctl version (or "unknown" if it cannot be read).
      2. For each device in pySMART's DeviceList, record a run timestamp and
         probe its power state with 'smartctl -n standby'.
      3. Collect identity and attribute metrics for devices considered
         active; skip the rest.
      4. Hand all collected lines to write_metrics_to_textfile().

    Args:
        output_path (str): Destination file, forwarded unchanged to
            write_metrics_to_textfile().
    """
    try:
        banner = run_command([SMARTCTL_PATH, "--version"])
        version_num = (
            banner.splitlines()[0].split()[1]
            if banner.startswith("smartctl")
            else "unknown"
        )
    except Exception:
        # Best effort: any failure to run or parse is reported as "unknown".
        version_num = "unknown"

    all_metrics = [f'smartmon_smartctl_version{{version="{version_num}"}} 1']

    for dev in DeviceList().devices:
        disk_name = dev.name
        disk_type = dev.interface or ""
        serial_number = (dev.serial or "").lower()

        now_utc = datetime.datetime.now(datetime.timezone.utc)
        run_timestamp = int(now_utc.timestamp())
        all_metrics.append(
            f'smartmon_smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}'
        )

        # NOTE(review): disk_name is handed to smartctl exactly as pySMART
        # reports it, and power_mode is compared to the plain string
        # "standby" -- confirm both match what smartctl accepts and emits.
        probe_cmd = [SMARTCTL_PATH, "-n", "standby", "-d", disk_type, "-j", disk_name]
        try:
            probe = run_command(probe_cmd, parse_json=True)
            active = 0 if probe.get("power_mode", "") == "standby" else 1
        except Exception:
            # Any probe failure, including non-JSON output, marks the device
            # as inactive.
            active = 0

        all_metrics.append(
            f'smartmon_device_active{{disk="{disk_name}",type="{disk_type}",serial_number="{serial_number}"}} {active}'
        )

        if active:
            all_metrics.extend(parse_device_info(dev))
            all_metrics.extend(parse_if_attributes(dev))

    write_metrics_to_textfile(all_metrics, output_path)
| 255 | + |
if __name__ == "__main__":
    # Script entry point: parse the optional --output path and run the export.
    import argparse

    cli = argparse.ArgumentParser(
        description="Export SMART metrics to Prometheus textfile format.",
    )
    cli.add_argument(
        "--output",
        type=str,
        default=None,
        help="Output path for Prometheus textfile (default: /var/lib/node_exporter/textfile_collector/smartmon.prom)",
    )
    main(cli.parse_args().output)
0 commit comments