|
2 | 2 |
|
3 | 3 | import subprocess
|
4 | 4 | import json
|
| 5 | +import re |
5 | 6 | from datetime import datetime
|
6 | 7 |
|
| 8 | +from pySMART import DeviceList |
| 9 | + |
# Absolute path of the smartctl binary invoked by this script.
SMARTCTL_PATH = "/usr/sbin/smartctl"

# Allow-list of attribute names (lower snake_case) that may be exported as
# metrics; attributes whose normalised name is not listed here are ignored.
SMARTMON_ATTRS = {
    # NOTE(review): these look like ATA/SATA SMART attribute names -- confirm.
    "airflow_temperature_cel",
    "command_timeout",
    "current_pending_sector",
    "end_to_end_error",
    "erase_fail_count",
    "g_sense_error_rate",
    "hardware_ecc_recovered",
    "host_reads_32mib",
    "host_reads_mib",
    "host_writes_32mib",
    "host_writes_mib",
    "load_cycle_count",
    "media_wearout_indicator",
    "nand_writes_1gib",
    "offline_uncorrectable",
    "power_cycle_count",
    "power_on_hours",
    "program_fail_cnt_total",
    "program_fail_count",
    "raw_read_error_rate",
    "reallocated_event_count",
    "reallocated_sector_ct",
    "reported_uncorrect",
    "runtime_bad_block",
    "sata_downshift_count",
    "seek_error_rate",
    "spin_retry_count",
    "spin_up_time",
    "start_stop_count",
    "temperature_case",
    "temperature_celsius",
    "temperature_internal",
    "total_lbas_read",
    "total_lbas_written",
    "udma_crc_error_count",
    "unsafe_shutdown_count",
    "unused_rsvd_blk_cnt_tot",
    "wear_leveling_count",
    "workld_host_reads_perc",
    "workld_media_wear_indic",
    "workload_minutes",
    # NOTE(review): these look like NVMe health-log field names -- confirm.
    "critical_warning",
    "temperature",
    "available_spare",
    "available_spare_threshold",
    "percentage_used",
    "data_units_read",
    "data_units_written",
    "host_reads",
    "host_writes",
    "controller_busy_time",
    "power_cycles",
    "unsafe_shutdowns",
    "media_errors",
    "num_err_log_entries",
    "warning_temp_time",
    "critical_comp_time",
}
| 71 | + |
def run_command(command, parse_json=False):
    """Run *command* and return what it wrote to standard output.

    Args:
        command: Argument list handed to ``subprocess.run`` (no shell).
        parse_json: When true, decode stdout as JSON instead of returning
            the raw text.

    Returns:
        The decoded JSON document when ``parse_json`` is true, otherwise
        stdout with leading and trailing whitespace stripped.

    Raises:
        json.JSONDecodeError: ``parse_json`` is true and stdout is not valid
            JSON (including empty output).
        OSError: The executable could not be started.
    """
    # The exit status is intentionally not checked: callers only inspect
    # stdout. NOTE(review): smartctl documents its exit status as a bitmask
    # that can be non-zero even when usable output was produced -- confirm.
    result = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )
    if parse_json:
        return json.loads(result.stdout)
    return result.stdout.strip()
| 80 | + |
def _escape_label_value(value):
    """Escape *value* for use inside a double-quoted Prometheus label value."""
    # Backslash must be handled first so the escapes added below survive.
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def parse_device_info(device):
    """Build the identity and SMART-status metric lines for one device.

    Emits, in this order:
      - ``device_info`` (always, value 1, carrying the descriptive labels)
      - ``device_smart_available`` (always)
      - ``device_smart_enabled`` (only when the device is SMART capable)
      - ``device_smart_healthy`` (only when SMART capable and an assessment
        is present; 1 when the assessment is "PASS", case-insensitive)

    Args:
        device: Object exposing ``name``, ``interface``, ``serial``,
            ``vendor``, ``family``, ``model``, ``firmware``,
            ``smart_capable``, ``smart_enabled`` and ``assessment``.

    Returns:
        A list of metric lines without the ``smartmon_`` prefix.
    """
    # Normalise once so every series for this device carries identical
    # disk/type/serial_number labels (a missing interface becomes "" on all
    # of them rather than the string "None" on some).
    disk = _escape_label_value(device.name)
    disk_type = _escape_label_value(device.interface or "")
    serial_number = _escape_label_value((device.serial or "").lower())

    info_labels = {
        "disk": disk,
        "type": disk_type,
        "vendor": _escape_label_value(device.vendor or ""),
        "model_family": _escape_label_value(device.family or ""),
        "device_model": _escape_label_value(device.model or ""),
        "serial_number": serial_number,
        "firmware_version": _escape_label_value(device.firmware or ""),
    }
    info_label_str = ",".join(f'{key}="{value}"' for key, value in info_labels.items())
    status_label_str = f'disk="{disk}",type="{disk_type}",serial_number="{serial_number}"'

    metrics = [
        f"device_info{{{info_label_str}}} 1",
        f"device_smart_available{{{status_label_str}}} {1 if device.smart_capable else 0}",
    ]

    if device.smart_capable:
        metrics.append(
            f"device_smart_enabled{{{status_label_str}}} {1 if device.smart_enabled else 0}"
        )
        # NOTE(review): health is reported only inside the SMART-capable
        # branch -- confirm this nesting is intended.
        if device.assessment:
            is_healthy = 1 if device.assessment.upper() == "PASS" else 0
            metrics.append(f"device_smart_healthy{{{status_label_str}}} {is_healthy}")

    return metrics
| 117 | + |
# Matches the empty position before every upper-case letter except at the
# start of the string; used to turn camelCase/PascalCase into snake_case.
_CAMEL_BOUNDARY = re.compile(r"(?<!^)(?=[A-Z])")


def parse_if_attributes(device):
    """Export the numeric fields of ``device.if_attributes`` as metric lines.

    Every public, non-boolean numeric attribute found on
    ``device.if_attributes`` has its name converted to snake_case (for
    example ``dataUnitsRead`` -> ``data_units_read``); it is exported only
    when that name is listed in ``SMARTMON_ATTRS``.

    Args:
        device: Object exposing ``if_attributes``, ``name``, ``interface``
            and ``serial``.

    Returns:
        A list of metric lines without the ``smartmon_`` prefix, in the
        order ``dir()`` yields the attribute names; empty when the device
        has no ``if_attributes``.
    """
    attributes = device.if_attributes
    if not attributes:
        return []

    disk_type = device.interface or ""
    serial_number = (device.serial or "").lower()
    labels = f'disk="{device.name}",type="{disk_type}",serial_number="{serial_number}"'

    metrics = []
    for attr_name in dir(attributes):
        if attr_name.startswith("_"):
            continue  # private or special member

        value = getattr(attributes, attr_name, None)
        # bool is a subclass of int but would be written as "True"/"False",
        # which is not a valid sample value, so it is excluded explicitly.
        # Methods and other non-numeric members are dropped by the same test.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            continue

        snake_name = _CAMEL_BOUNDARY.sub("_", attr_name).lower()
        if snake_name in SMARTMON_ATTRS:
            metrics.append(f"{snake_name}{{{labels}}} {value}")

    return metrics
|
85 | 149 |
|
def format_output(metrics):
    """Render metric lines as Prometheus text exposition output.

    The lines are sorted, every line is prefixed with ``smartmon_``, and a
    ``# HELP`` / ``# TYPE ... gauge`` header pair is emitted before the first
    sample of each metric name.

    Args:
        metrics: Iterable of lines shaped like ``name{labels} value`` or
            ``name value``.

    Returns:
        The joined text without a trailing newline; an empty string when
        there are no metrics.
    """
    output = []
    last_metric = ""
    for metric in sorted(metrics):
        # The name ends at the label set or, for an unlabelled sample, at the
        # first space -- otherwise the sample value would leak into the header.
        metric_name = metric.split("{", 1)[0].split(" ", 1)[0]
        if metric_name != last_metric:
            output.append(f"# HELP smartmon_{metric_name} SMART metric {metric_name}")
            output.append(f"# TYPE smartmon_{metric_name} gauge")
            last_metric = metric_name
        output.append(f"smartmon_{metric}")
    return "\n".join(output)
97 | 165 |
|
def _smartctl_version():
    """Return the smartctl version token, or "unknown" if it cannot be read."""
    try:
        version_output = run_command([SMARTCTL_PATH, "--version"])
    except Exception:
        # Best effort: a missing or broken smartctl must not abort the run.
        return "unknown"

    if not version_output.startswith("smartctl"):
        return "unknown"
    # The version is taken as the second whitespace-separated field of the
    # first output line.
    fields = version_output.splitlines()[0].split()
    return fields[1] if len(fields) > 1 else "unknown"


def _is_active(disk_name, disk_type):
    """Return 1 if the device may be queried, 0 if in standby or the probe failed."""
    cmd = [SMARTCTL_PATH, "-n", "standby"]
    if disk_type:
        # Only force a device type when one is known; an empty "-d" argument
        # is not a valid type, so leave detection to smartctl in that case.
        cmd += ["-d", disk_type]
    cmd += ["-j", disk_name]

    try:
        standby_json = run_command(cmd, parse_json=True)
        in_standby = standby_json.get("power_mode", "") == "standby"
    except Exception:
        # Unparseable output or a failed invocation: treat the device as
        # inactive rather than querying it further.
        return 0
    return 0 if in_standby else 1


def main():
    """Collect SMART metrics for every detected device and print them.

    Emits ``smartctl_version`` once, then for each device ``smartctl_run``
    (Unix timestamp of the probe) and ``device_active``. Devices found to be
    inactive are skipped; for the others the device-info and attribute
    metrics are added. The result is printed in Prometheus text format.
    """
    # Function-scope import keeps this block self-contained. time.time() is a
    # true Unix timestamp; datetime.utcnow().timestamp() would interpret the
    # naive UTC value as local time and be skewed on non-UTC hosts.
    import time

    all_metrics = [f'smartctl_version{{version="{_smartctl_version()}"}} 1']

    for dev in DeviceList().devices:
        disk_name = dev.name
        disk_type = dev.interface or ""
        serial_number = (dev.serial or "").lower()

        run_timestamp = int(time.time())
        all_metrics.append(
            f'smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}'
        )

        active = _is_active(disk_name, disk_type)
        all_metrics.append(
            f'device_active{{disk="{disk_name}",type="{disk_type}",serial_number="{serial_number}"}} {active}'
        )
        if active == 0:
            continue

        all_metrics.extend(parse_device_info(dev))
        all_metrics.extend(parse_if_attributes(dev))

    print(format_output(all_metrics))


if __name__ == "__main__":
    main()
|
0 commit comments