Skip to content

INFRA-709 Rated dwpd alerts #1077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions etc/kayobe/ansible/get-nvme-drives.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
---
# Collect the distinct NVMe model numbers present on every overcloud host and
# publish them as the per-host fact `unique_nvme_models`.
- name: Gather unique NVMe disk models on all hosts
  hosts: overcloud
  gather_facts: false
  tasks:
    - name: Retrieve NVMe device information
      ansible.builtin.command: nvme list -o json
      register: nvme_list
      changed_when: false
      become: true

    # Read ModelNumber straight from the parsed JSON with built-in filters.
    # This avoids the jmespath dependency of json_query and the per-item
    # set_fact loop. Empty command output or a missing "Devices" key yields an
    # empty list instead of a templating error.
    - name: Parse NVMe device model names
      ansible.builtin.set_fact:
        nvme_models: >-
          {{ (nvme_list.stdout | trim | default('{}', true) | from_json).Devices
             | default([])
             | selectattr('ModelNumber', 'defined')
             | map(attribute='ModelNumber')
             | list }}

    - name: Set unique NVMe models as host facts
      ansible.builtin.set_fact:
        unique_nvme_models: "{{ nvme_models | default([]) | unique }}"

    - name: Show unique NVMe models per host
      ansible.builtin.debug:
        var: unique_nvme_models

# Merge the per-host `unique_nvme_models` facts gathered on the overcloud
# hosts and make sure every model has an entry in
# <kayobe_env_config_path>/dwpd-ratings.yml. Models that are not listed yet
# are appended with a placeholder rated_dwpd of 1; existing entries are kept
# unchanged. The play targets localhost only, so run_once is not needed.
- name: Aggregate all unique NVMe models from all hosts
  hosts: localhost
  gather_facts: false
  vars:
    # Ratings already present in dwpd-ratings.yml. Resolves to [] when the
    # file is absent, empty, or has a null/missing stackhpc_dwpd_ratings key.
    _existing_ratings: >-
      {{ (existing_dwpd_yml | default({})).stackhpc_dwpd_ratings
         | default([], true) }}
  tasks:
    - name: Aggregate unique NVMe models from all overcloud hosts
      ansible.builtin.set_fact:
        all_nvme_models: >-
          {{ groups['overcloud']
             | map('extract', hostvars, 'unique_nvme_models')
             | select('defined')
             | sum(start=[])
             | unique }}

    - name: Show all unique NVMe models
      ansible.builtin.debug:
        var: all_nvme_models

    # This only inspects the file; it is created later if there is something
    # to write.
    - name: Check whether dwpd-ratings.yml exists
      ansible.builtin.stat:
        path: "{{ kayobe_env_config_path }}/dwpd-ratings.yml"
      register: dwpd_ratings_stat

    - name: Load existing dwpd-ratings.yml
      ansible.builtin.set_fact:
        existing_dwpd_yml: >-
          {{ lookup('file', kayobe_env_config_path ~ '/dwpd-ratings.yml')
             | from_yaml
             | default({}, true) }}
      when: dwpd_ratings_stat.stat.exists

    # NOTE(review): dwpd_lookup is not read anywhere in this playbook; it is
    # kept in case an importing playbook relies on the fact - confirm.
    - name: Convert existing YAML array into a dictionary
      ansible.builtin.set_fact:
        dwpd_lookup: >-
          {{ _existing_ratings
             | items2dict(key_name='model_name', value_name='rated_dwpd') }}

    - name: Get list of existing model names
      ansible.builtin.set_fact:
        existing_model_names: "{{ _existing_ratings | map(attribute='model_name') | list }}"

    - name: Identify new models not already in the configuration
      ansible.builtin.set_fact:
        new_models: "{{ all_nvme_models | default([]) | reject('in', existing_model_names) | list }}"

    # An empty new_models list simply skips the loop, so no extra guard is
    # required.
    - name: Create entry dictionary for new models
      ansible.builtin.set_fact:
        new_entries: "{{ new_entries | default([]) + [{'model_name': item, 'rated_dwpd': 1}] }}"
      loop: "{{ new_models }}"

    - name: Build updated list for stackhpc_dwpd_ratings
      ansible.builtin.set_fact:
        new_dwpd_list: "{{ _existing_ratings + (new_entries | default([])) }}"

    # Serialise with to_nice_yaml rather than concatenating YAML by hand, so
    # model names containing quotes, colons or other special characters are
    # escaped correctly and list indentation is always consistent.
    - name: Write updated dwpd-ratings.yml
      ansible.builtin.copy:
        content: "---\n{{ {'stackhpc_dwpd_ratings': new_dwpd_list} | to_nice_yaml(indent=2) }}"
        dest: "{{ kayobe_env_config_path }}/dwpd-ratings.yml"
        mode: "0644"
      notify: Show updated dwpd-ratings.yml contents
      when: new_dwpd_list | length > 0

  handlers:
    - name: Show updated dwpd-ratings.yml contents
      ansible.builtin.debug:
        msg:
          - "Updated local dwpd-ratings.yml contents"
          - "{{ {'stackhpc_dwpd_ratings': new_dwpd_list} | to_nice_yaml }}"
          - "PLEASE REVIEW AND COMMIT {{ kayobe_env_config_path }}/dwpd-ratings.yml TO VERSION CONTROL."
      changed_when: true
89 changes: 69 additions & 20 deletions etc/kayobe/ansible/scripts/nvmemon.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,43 @@ if ! command -v nvme >/dev/null 2>&1; then
exit 1
fi

# jq is used below to parse both the DWPD ratings file and the JSON emitted by
# nvme-cli, so abort early if it is not on PATH.
if ! command -v jq >/dev/null 2>&1; then
echo "${0##*/}: jq is required but not installed. Aborting." >&2
exit 1
fi

# Path to the DWPD ratings JSON file
dwpd_file="/opt/kayobe/etc/monitoring/dwpd_ratings.json"

# Associative array: NVMe model name -> rated DWPD value.
# Filled by load_dwpd_ratings and read when emitting the rated_dwpd metric.
declare -A rated_dwpd

# Populate the global associative array rated_dwpd from the JSON file at
# $dwpd_file.
#
# The file is read as a JSON array of objects carrying "model_name" and
# "rated_dwpd" keys. Entries whose model name or rating is missing, null or
# empty are skipped, so callers fall back to their own default for them.
# If the file does not exist a warning is written to stderr and rated_dwpd is
# left untouched. If jq cannot parse the file it is treated as an empty array.
# Always returns 0.
load_dwpd_ratings() {
  local dwpd_json line key value

  if [[ ! -f "$dwpd_file" ]]; then
    echo "Warning: DWPD ratings file not found at '$dwpd_file'. Defaulting to rated_dwpd=1." >&2
    return 0
  fi

  # Read the JSON; if it fails, default to empty array
  dwpd_json="$(jq '.' "$dwpd_file" || echo '[]')"

  # We iterate over each array element in dwpd_json (one compact object per line)
  while IFS= read -r line; do
    key="$(jq -r '.model_name' <<< "$line")"
    value="$(jq -r '.rated_dwpd' <<< "$line")"

    # Strip trailing whitespace only. "${key##*[![:space:]]}" expands to the
    # run of whitespace at the end of the string, which is then removed as a
    # suffix. Whitespace inside the model name (e.g. "Samsung SSD 980 PRO")
    # must be preserved, otherwise the per-device lookup never matches.
    key="${key%"${key##*[![:space:]]}"}"
    value="${value%"${value##*[![:space:]]}"}"

    # Only store complete entries; jq prints the string "null" for a missing key.
    if [[ -n "$key" && "$key" != "null" && -n "$value" && "$value" != "null" ]]; then
      rated_dwpd["$key"]="$value"
    fi
  done < <(jq -c '.[]' <<< "$dwpd_json")

  return 0
}


# Fill rated_dwpd once, up front, so the per-device loop can look up each model.
load_dwpd_ratings

output_format_awk="$(
cat <<'OUTPUTAWK'
BEGIN { v = "" }
Expand All @@ -44,58 +81,70 @@ format_output() {
nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')"
echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output

# Get devices (DevicePath and PhysicalSize)
device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath: .DevicePath, PhysicalSize: .PhysicalSize}')"
# Get devices (DevicePath, PhysicalSize and ModelNumber)
device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath, PhysicalSize, ModelNumber, SerialNumber}')"

# Convert device_info to an array
device_info_array=()
while IFS= read -r line; do
device_info_array+=("$line")
done <<< "$device_info"

# Loop through the NVMe devices
echo "$device_info" | while read -r device_data; do
device=$(echo "$device_data" | jq -r '.DevicePath')
for device_data in "${device_info_array[@]}"; do
device="$(echo "$device_data" | jq -r '.DevicePath')"
json_check="$(nvme smart-log -o json "${device}")"
disk="${device##*/}"
model_name="$(echo "$device_data" | jq -r '.ModelNumber')"
serial_number="$(echo "$device_data" | jq -r '.SerialNumber')"

physical_size=$(echo "$device_data" | jq -r '.PhysicalSize')
echo "physical_size_bytes{device=\"${disk}\"} ${physical_size}"
physical_size="$(echo "$device_data" | jq -r '.PhysicalSize')"
echo "physical_size_bytes{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${physical_size}"

# The temperature value in JSON is in Kelvin, we want Celsius
value_temperature="$(echo "$json_check" | jq '.temperature - 273')"
echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}"
echo "temperature_celsius{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_temperature}"

# Get the rated DWPD from the dictionary or default to 1 if not found
value_rated_dwpd="${rated_dwpd[$model_name]:-1}"
echo "rated_dwpd{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_rated_dwpd}"

value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')"
echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}"
echo "available_spare_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_available_spare}"

value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')"
echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}"
echo "available_spare_threshold_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_available_spare_threshold}"

value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')"
echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}"
echo "percentage_used_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_percentage_used}"

value_critical_warning="$(echo "$json_check" | jq '.critical_warning')"
echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}"
echo "critical_warning_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_critical_warning}"

value_media_errors="$(echo "$json_check" | jq '.media_errors')"
echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}"
echo "media_errors_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_media_errors}"

value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')"
echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}"
echo "num_err_log_entries_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_num_err_log_entries}"

value_power_cycles="$(echo "$json_check" | jq '.power_cycles')"
echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}"
echo "power_cycles_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_power_cycles}"

value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')"
echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}"
echo "power_on_hours_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_power_on_hours}"

value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')"
echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}"
echo "controller_busy_time_seconds{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_controller_busy_time}"

value_data_units_written="$(echo "$json_check" | jq '.data_units_written')"
echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}"
echo "data_units_written_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_data_units_written}"

value_data_units_read="$(echo "$json_check" | jq '.data_units_read')"
echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}"
echo "data_units_read_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_data_units_read}"

value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')"
echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}"
echo "host_read_commands_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_host_read_commands}"

value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')"
echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}"
echo "host_write_commands_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_host_write_commands}"
done | format_output
56 changes: 50 additions & 6 deletions etc/kayobe/ansible/smartmon-tools.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
---
- name: Install and set up smartmon-tools
- name: Install and set up SMART monitoring tools
hosts: overcloud

tasks:
- name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed
ansible.builtin.package:
Expand All @@ -13,11 +12,23 @@
state: present
become: true

- name: Ensure Python 3, venv, and pip are installed
ansible.builtin.package:
name: >
{{ ['python3', 'python3-pip'] + (['python3-venv'] if ansible_facts['distribution'] == 'Ubuntu' else []) }}
- name: Ensure Python 3, venv, and pip are installed on Debian/Ubuntu
ansible.builtin.apt:
name:
- python3
- python3-venv
- python3-pip
state: present
when: ansible_facts.os_family == 'Debian'
become: true

- name: Ensure Python 3, and pip are installed on RedHat/CentOS
ansible.builtin.yum:
name:
- python3
- python3-pip
state: present
when: ansible_facts.os_family == 'RedHat'
become: true

- name: Create smartmon Python virtual environment
Expand All @@ -31,6 +42,7 @@
name:
- prometheus_client
- pySMART
state: present
virtualenv: /opt/smartmon-venv
virtualenv_python: python3
become: true
Expand Down Expand Up @@ -98,3 +110,35 @@
path: /usr/local/bin/smartmon.sh
state: absent
become: true

# Optionally regenerate dwpd-ratings.yml from the NVMe models found on the
# overcloud hosts. Skipped unless create_dwpd_ratings is true. The bool filter
# makes string values such as "false" (e.g. from `-e create_dwpd_ratings=false`)
# evaluate as false rather than as a non-empty, truthy string.
- name: Gather NVMe drives and generate dwpd ratings
  import_playbook: get-nvme-drives.yml
  when: create_dwpd_ratings | default(false) | bool

# Publish the stackhpc_dwpd_ratings variable as JSON on each overcloud host at
# /opt/kayobe/etc/monitoring/dwpd_ratings.json. Nothing is done when the
# variable is not defined.
- name: Copy DWPD ratings to overcloud hosts
  hosts: overcloud
  gather_facts: false
  tasks:
    - name: Render and distribute DWPD ratings
      when: stackhpc_dwpd_ratings is defined
      block:
        # Rendered once; run_once shares the resulting fact with every host
        # in the play.
        - name: Convert the stackhpc_dwpd_ratings variable to JSON
          run_once: true
          ansible.builtin.set_fact:
            dwpd_ratings_json: "{{ stackhpc_dwpd_ratings | default([]) | to_json }}"

        - name: Ensure /opt/kayobe/etc/monitoring directory exists
          become: true
          ansible.builtin.file:
            state: directory
            path: /opt/kayobe/etc/monitoring
            mode: '0755'

        - name: Copy JSON file to remote
          become: true
          ansible.builtin.copy:
            dest: "/opt/kayobe/etc/monitoring/dwpd_ratings.json"
            content: "{{ dwpd_ratings_json }}"
            owner: root
            group: root
            mode: '0644'
Loading
Loading