Skip to content

Commit 20b2f85

Browse files
committed
Use pySMART
1 parent c27e357 commit 20b2f85

File tree

1 file changed

+168
-111
lines changed

1 file changed

+168
-111
lines changed

etc/kayobe/ansible/scripts/smartmon.py

Lines changed: 168 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -2,155 +2,212 @@
22

33
import subprocess
44
import json
5+
import re
56
from datetime import datetime
67

8+
from pySMART import DeviceList
9+
710
# Location of the smartctl binary that this script shells out to.
SMARTCTL_PATH = "/usr/sbin/smartctl"

# Allow-list of snake_case attribute names that may be exported as metrics.
# Attributes whose normalised name is not in this set are ignored.
SMARTMON_ATTRS = {
    # Alphabetically ordered group.
    "airflow_temperature_cel", "command_timeout", "current_pending_sector",
    "end_to_end_error", "erase_fail_count", "g_sense_error_rate",
    "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib",
    "host_writes_32mib", "host_writes_mib", "load_cycle_count",
    "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable",
    "power_cycle_count", "power_on_hours", "program_fail_cnt_total",
    "program_fail_count", "raw_read_error_rate", "reallocated_event_count",
    "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block",
    "sata_downshift_count", "seek_error_rate", "spin_retry_count",
    "spin_up_time", "start_stop_count", "temperature_case",
    "temperature_celsius", "temperature_internal", "total_lbas_read",
    "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot", "wear_leveling_count",
    "workld_host_reads_perc", "workld_media_wear_indic", "workload_minutes",
    # Second group, in its original order.
    # NOTE(review): these look like NVMe health-log field names - confirm.
    "critical_warning", "temperature", "available_spare",
    "available_spare_threshold", "percentage_used", "data_units_read",
    "data_units_written", "host_reads", "host_writes",
    "controller_busy_time", "power_cycles", "unsafe_shutdowns",
    "media_errors", "num_err_log_entries", "warning_temp_time",
    "critical_comp_time",
}
71+
972
def run_command(command, parse_json=False, timeout=None):
    """
    Run a subprocess command and return its standard output.

    The exit status is deliberately not checked: the output is returned (or
    parsed) whatever the return code was.

    Args:
        command: Argument list passed to ``subprocess.run`` (no shell).
        parse_json: When true, decode stdout as JSON and return the result;
            otherwise return stdout as a stripped string.
        timeout: Optional number of seconds to wait for the command. ``None``
            (the default) waits indefinitely, as before.

    Raises:
        json.JSONDecodeError: ``parse_json`` is true and stdout is not JSON.
        subprocess.TimeoutExpired: ``timeout`` was given and elapsed.
        OSError: The executable could not be started.
    """
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=timeout,
    )
    if parse_json:
        return json.loads(result.stdout)
    return result.stdout.strip()
80+
81+
def parse_device_info(device):
    """
    Produce Prometheus lines describing the device's identity and SMART status.

    Emitted metrics:
    - device_info (always, value 1, carries the descriptive labels)
    - device_smart_available (always)
    - device_smart_enabled (only when the device is SMART capable)
    - device_smart_healthy (only when SMART capable and an assessment exists)

    Args:
        device: Object exposing ``name``, ``interface``, ``vendor``,
            ``family``, ``model``, ``serial``, ``firmware``,
            ``smart_capable``, ``smart_enabled`` and ``assessment``.

    Returns:
        list[str]: Metric lines without the ``smartmon_`` prefix.
    """

    def _esc(value):
        # The Prometheus text format requires backslash, double quote and
        # newline to be escaped inside label values.
        return (
            str(value)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

    disk = device.name
    # Normalise once so every line for this device carries the same "type"
    # label (a missing interface becomes "" instead of the string "None").
    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()

    info_labels = {
        "disk": disk,
        "type": disk_type,
        "vendor": device.vendor or "",
        "model_family": device.family or "",
        "device_model": device.model or "",
        "serial_number": serial_number,
        "firmware_version": device.firmware or "",
    }
    info_str = ",".join(f'{k}="{_esc(v)}"' for k, v in info_labels.items())
    base = (
        f'disk="{_esc(disk)}",type="{_esc(disk_type)}",'
        f'serial_number="{_esc(serial_number)}"'
    )

    metrics = [
        f"device_info{{{info_str}}} 1",
        f"device_smart_available{{{base}}} {1 if device.smart_capable else 0}",
    ]

    if device.smart_capable:
        metrics.append(
            f"device_smart_enabled{{{base}}} {1 if device.smart_enabled else 0}"
        )
        # Health is only reported when an assessment string is present.
        if device.assessment:
            is_healthy = 1 if device.assessment.upper() == "PASS" else 0
            metrics.append(f"device_smart_healthy{{{base}}} {is_healthy}")

    return metrics
117+
118+
def parse_if_attributes(device):
    """
    Turn the numeric fields of ``device.if_attributes`` into metric lines.

    Every public, numeric attribute of ``device.if_attributes`` has its name
    converted from camelCase/PascalCase to snake_case; if the result is listed
    in SMARTMON_ATTRS a ``name{labels} value`` line is produced.

    Args:
        device: Object exposing ``name``, ``interface``, ``serial`` and
            ``if_attributes``.

    Returns:
        list[str]: Metric lines without the ``smartmon_`` prefix; empty when
        the device has no ``if_attributes``.
    """
    if_attrs = device.if_attributes
    if not if_attrs:
        return []

    disk = device.name
    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()
    labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial_number}"'

    metrics = []
    # Inspect all public attributes on device.if_attributes
    for attr_name in dir(if_attrs):
        if attr_name.startswith("_"):
            continue  # skip private / special members
        val = getattr(if_attrs, attr_name, None)

        # bool is a subclass of int; emit it as 0/1 because "True"/"False"
        # are not valid sample values in the Prometheus text format.
        if isinstance(val, bool):
            val = int(val)
        elif not isinstance(val, (int, float)):
            continue  # methods, strings, containers, None ...

        # camelCase / PascalCase -> snake_case, e.g. dataUnitsRead -> data_units_read
        snake_name = re.sub(r"(?<!^)(?=[A-Z])", "_", attr_name).lower()

        if snake_name in SMARTMON_ATTRS:
            metrics.append(f"{snake_name}{{{labels}}} {val}")

    return metrics
85149

86150
def _metric_name(line):
    """Return the metric name of one sample line: the text before '{' or whitespace."""
    head = line.split("{", 1)[0]
    tokens = head.split(None, 1)
    return tokens[0] if tokens else ""


def format_output(metrics):
    """
    Convert a list of lines like "some_metric{...} value" into Prometheus
    text output with # HELP / # TYPE lines.

    Lines are sorted, every line is prefixed with ``smartmon_`` and a
    HELP/TYPE header is emitted once per metric name. Blank lines are
    dropped, and lines without a label block ("some_metric value") are
    supported.

    Args:
        metrics: Iterable of metric lines without the ``smartmon_`` prefix.

    Returns:
        str: The lines joined with newlines (no trailing newline); "" when
        there is nothing to output.
    """
    named = []
    for raw in metrics:
        line = raw.strip()
        if line:
            named.append((_metric_name(line), line))

    # Sorting on "name{" first keeps the order plain sorted() gives for
    # labelled lines, while guaranteeing that all samples of one metric stay
    # adjacent even when some of them carry no labels.
    named.sort(key=lambda pair: (pair[0] + "{", pair[1]))

    output = []
    last_metric = ""
    for metric_name, line in named:
        if metric_name != last_metric:
            output.append(f"# HELP smartmon_{metric_name} SMART metric {metric_name}")
            output.append(f"# TYPE smartmon_{metric_name} gauge")
            last_metric = metric_name
        output.append(f"smartmon_{line}")
    return "\n".join(output)
97165

98166
def main():
    """
    Collect SMART metrics for every device pySMART detects and print them
    in Prometheus text format.

    Steps:
    1. Report the smartctl version ("unknown" if it cannot be determined).
    2. For each device, record the run timestamp and whether it is active
       (not in standby, according to ``smartctl -n standby``).
    3. For active devices, add the identity/status and attribute metrics.
    """
    all_metrics = []

    # Best effort: any failure to run or parse `smartctl --version` is
    # reported as version="unknown" rather than aborting the collection.
    version_num = "unknown"
    try:
        version_output = run_command([SMARTCTL_PATH, "--version"])
        if version_output.startswith("smartctl"):
            first_line = version_output.splitlines()[0]
            version_num = first_line.split()[1]
    except Exception:
        version_num = "unknown"
    all_metrics.append(f'smartctl_version{{version="{version_num}"}} 1')

    dev_list = DeviceList()

    for dev in dev_list.devices:
        disk_name = dev.name
        disk_type = dev.interface or ""
        serial_number = (dev.serial or "").lower()

        # datetime.utcnow() is naive, and .timestamp() treats naive values as
        # local time, which skews the epoch on hosts not running in UTC.
        # A naive *local* now() converts to the correct Unix time.
        run_timestamp = int(datetime.now().timestamp())
        all_metrics.append(f'smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}')

        # NOTE(review): pySMART's Device.name appears to omit the "/dev/"
        # prefix, while smartctl needs a device path; prefer dev_reference
        # when the installed pySMART provides it - confirm.
        device_ref = getattr(dev, "dev_reference", None) or disk_name

        cmd = [SMARTCTL_PATH, "-n", "standby"]
        if disk_type:
            # Only pass -d when the interface is known; an empty device
            # type is not a valid argument.
            cmd += ["-d", disk_type]
        cmd += ["-j", device_ref]

        active = 1
        try:
            standby_json = run_command(cmd, parse_json=True)
            if standby_json.get("power_mode", "") == "standby":
                active = 0
        except Exception:
            # Assume the device is inactive if the check cannot be run or
            # its output cannot be parsed.
            active = 0

        all_metrics.append(
            f'device_active{{disk="{disk_name}",type="{disk_type}",serial_number="{serial_number}"}} {active}'
        )
        if active == 0:
            continue

        all_metrics.extend(parse_device_info(dev))
        all_metrics.extend(parse_if_attributes(dev))

    print(format_output(all_metrics))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)