Skip to content

Commit

Permalink
[nvidia-bluefield] update platform-api thermal logic (sonic-net#19409)
Browse files Browse the repository at this point in the history
- Why I did it
To set the RTS and CTS configuration for the minicom package in the switch image, the file is copied to the minicom configuration location in the switch image.
This is a machine-generated file in minicom, so the configuration can be present in the switch image upon installation.
  • Loading branch information
Yakiv-Huryk authored Jul 2, 2024
1 parent e488f10 commit 0114373
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,36 +18,87 @@
try:
from sonic_platform_base.thermal_base import ThermalBase
from sonic_py_common.logger import Logger
from .device_data import DeviceDataManager
from enum import Enum, auto
import os
import json
import subprocess
except ImportError as e:
raise ImportError (str(e) + "- required module not found")

# Global logger class instance
logger = Logger()

MLXBF_BASE_PATH = '/sys/kernel/debug/mlxbf-ptm/monitors/status'
SSD_DEV='nvme0'

SENSORS = [
{'name': 'CPU', 'mlxbf_sensor_name': 'core_temp', 'ht': 95, 'cht': 100},
{'name': 'DDR', 'mlxbf_sensor_name': 'ddr_temp', 'ht': 95, 'cht': 100},
{'name': 'SFP0', 'iface': 'Ethernet0', 'hwmon_path': None},
{'name': 'SFP1', 'iface': 'Ethernet4', 'hwmon_path': None},
class ThermalType(Enum):
    """Selects which reader a Thermal uses to obtain its temperature."""

    # Read with read_temp_mlxbf() using the sensor's mlxbf_sensor_name.
    MLXBF = 1
    # Read with read_temp_hwmon() from the sensor's hwmon_path.
    SFP = 2
    # Read with read_ssd_temperatrue() for the sensor's dev.
    SSD = 3

MLXBF_SENSORS = [
{'name': 'CPU', 'thermal_type': ThermalType.MLXBF, 'mlxbf_sensor_name': 'core_temp', 'ht': 95, 'cht': 100},
{'name': 'DDR', 'thermal_type': ThermalType.MLXBF, 'mlxbf_sensor_name': 'ddr_temp', 'ht': 95, 'cht': 100},
]

def set_hwmon_path(sensor):
base = f'/sys/class/net/{sensor["iface"]}/device/hwmon'
def get_hwmon_path(iface):
base = f'/sys/class/net/{iface}/device/hwmon'
dirs = os.listdir(base)
if len(dirs) != 1 or not dirs[0].startswith('hwmon'):
logger.log_error(f'Failed to find hwmon path for {sensor["iface"]}')
logger.log_error(f'Failed to find hwmon path for {iface}')
return
sensor['hwmon_path'] = f'{base}/{dirs[0]}'
return f'{base}/{dirs[0]}'

def initialize_sfp_thermals():
    """Create one SFP thermal for every SFP reported by DeviceDataManager.

    SFP number ``n`` is paired with interface ``Ethernet{n * 4}`` and is
    handed whatever get_hwmon_path() returns for that interface.

    Returns:
        A list of Thermal objects named SFP0, SFP1, ... in index order.
    """
    return [
        Thermal(
            name=f'SFP{idx}',
            thermal_type=ThermalType.SFP,
            hwmon_path=get_hwmon_path(f'Ethernet{idx * 4}'),
        )
        for idx in range(DeviceDataManager().get_sfp_count())
    ]

def read_smartctl(dev, all=False):
    """Run ``smartctl -x`` on /dev/<dev> and return its JSON output as a dict.

    Args:
        dev: Device node name under /dev, e.g. 'nvme0'.
        all: When True, request ``--json=vu`` instead of ``--json=v`` so the
            raw text lines are included as well. The name shadows the
            builtin but is kept because callers pass it by keyword.

    Returns:
        The parsed JSON document, or an empty dict when smartctl could not
        be started, reported a fatal error, or printed invalid JSON.
    """
    json_opt = '--json=vu' if all else '--json=v'
    # Build argv directly instead of splitting a formatted string.
    cmd = ['smartctl', '-x', f'/dev/{dev}', json_opt]
    try:
        raw = subprocess.check_output(cmd)
    except subprocess.CalledProcessError as e:
        # smartctl's exit status is a bitmask. Bits 0-2 mean the command
        # itself failed (bad command line, device open failed, command to
        # the disk failed); the higher bits only flag disk health findings
        # and still come with a complete report, so keep that output.
        fatal = e.returncode < 0 or e.returncode & 0x07
        if fatal or not e.output:
            logger.log_error('Failed to read smartctl output')
            return {}
        raw = e.output
    except OSError:
        # smartctl binary missing or not executable.
        logger.log_error('Failed to read smartctl output')
        return {}

    try:
        return json.loads(raw.decode().strip())
    except ValueError:
        # Covers both undecodable bytes and malformed JSON.
        logger.log_error('Failed to read smartctl output')
        return {}

def read_ssd_temperatrue(dev):
    """Return the current temperature of an SSD as reported by smartctl.

    Args:
        dev: Device node name under /dev, e.g. 'nvme0'.

    Returns:
        The ``temperature.current`` value from read_smartctl() as an int,
        or the string 'N/A' when that value is missing or not numeric.
    """
    try:
        return int(read_smartctl(dev)['temperature']['current'])
    except (KeyError, TypeError, ValueError):
        # Name the device that was actually queried, not a fixed one.
        logger.log_error(f'Failed to read {dev} temperature')
        return 'N/A'

def read_ssd_thresholds(dev):
    """Return the (high, critical) temperature thresholds of an SSD.

    The thresholds are taken from the raw text lines that read_smartctl()
    returns with ``all=True``: the first line containing
    "Warning Comp. Temp. Threshold" gives the high threshold and the first
    line containing "Critical Comp. Temp. Threshold" gives the critical one.

    Args:
        dev: Device node name under /dev, e.g. 'nvme0'.

    Returns:
        A ``(high, critical)`` tuple. Each element is an int, or the string
        'N/A' when the value is absent, zero, or could not be parsed.
    """
    def parse_value(line):
        # "Warning Comp. Temp. Threshold: 90 Celsius" -> 90
        try:
            return int(line.split(':')[1].strip().split(' ')[0])
        except (IndexError, ValueError):
            # A malformed line must not abort thermal initialisation.
            logger.log_error(f'Failed to parse smartctl threshold line: {line}')
            return None

    high_th = None
    crit_th = None

    output = read_smartctl(dev, all=True)
    for key, value in output.items():
        # Only the text-line entries are of interest; skip other keys and
        # any "smartctl*" entry whose value is not a plain string.
        if "smartctl" not in key or not isinstance(value, str):
            continue
        if not high_th and "Warning Comp. Temp. Threshold" in value:
            high_th = parse_value(value)
        if not crit_th and "Critical Comp. Temp. Threshold" in value:
            crit_th = parse_value(value)
    return high_th or 'N/A', crit_th or 'N/A'

def initialize_ssd_thermals():
    """Create the thermal object for the SSD identified by SSD_DEV.

    Returns:
        A one-element list holding a Thermal named 'NVME' whose high and
        critical thresholds come from read_ssd_thresholds().
    """
    warn_limit, crit_limit = read_ssd_thresholds(SSD_DEV)
    nvme_thermal = Thermal(
        name='NVME',
        thermal_type=ThermalType.SSD,
        dev=SSD_DEV,
        ht=warn_limit,
        cht=crit_limit,
    )
    return [nvme_thermal]

def initialize_chassis_thermals():
thermal_list = []
for s in SENSORS:
if 'hwmon_path' in s:
set_hwmon_path(s)
thermal_list.append(Thermal(**s))
thermal_list = [Thermal(**x) for x in MLXBF_SENSORS]
thermal_list += initialize_sfp_thermals()
thermal_list += initialize_ssd_thermals()
return thermal_list

def read_fs(path, name):
Expand All @@ -72,10 +123,12 @@ def read_temp_hwmon(hwmon_path, sensor):
return v / 1000

class Thermal(ThermalBase):
def __init__(self, name, mlxbf_sensor_name=None, iface=None, hwmon_path=None, ht='N/A', cht='N/A'):
def __init__(self, name, thermal_type=None, mlxbf_sensor_name=None, dev=None, hwmon_path=None, ht='N/A', cht='N/A'):
super(Thermal, self).__init__()
self.name = name
self.thermal_type = thermal_type
self.mlxbf_sensor_name = mlxbf_sensor_name
self.dev = dev
self.hwmon_path = hwmon_path
self.ht = ht
self.cht = cht
Expand All @@ -97,10 +150,13 @@ def get_temperature(self):
A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125
"""
if self.mlxbf_sensor_name:
return read_temp_mlxbf(self.mlxbf_sensor_name)
else:
return read_temp_hwmon(self.hwmon_path, 'temp1_input')
match self.thermal_type:
case ThermalType.MLXBF:
return read_temp_mlxbf(self.mlxbf_sensor_name)
case ThermalType.SFP:
return read_temp_hwmon(self.hwmon_path, 'temp1_input')
case ThermalType.SSD:
return read_ssd_temperatrue(self.dev)

def get_high_threshold(self):
"""
Expand Down
44 changes: 22 additions & 22 deletions platform/nvidia-bluefield/platform-api/tests/test_thermal_bf3.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import os
import sys
import json

from unittest.mock import patch
from unittest.mock import mock_open
Expand All @@ -27,7 +28,7 @@
sys.path.insert(0, modules_path)

from sonic_platform.chassis import Chassis
from .utils import platform_sample_bf3
from .utils import platform_sample_bf3, smartctl_output


@patch('sonic_py_common.device_info.get_platform', MagicMock(return_value=""))
Expand All @@ -38,54 +39,53 @@
class TestThermal:

def test_chassis_thermal(self, *args):
from sonic_platform.thermal_bf3 import SENSORS
chassis = Chassis()
thermal_list = chassis.get_all_thermals()
assert thermal_list

for s in SENSORS:
assert 'name' in s
assert 'mlxbf_sensor_name' in s or 'hwmon_path' in s

sensor_names = list(map(lambda x: x.get('name'), SENSORS))
thermal_names = list(map(lambda x: x.get_name(), thermal_list))
for sn in sensor_names:
assert sn in thermal_names


def test_hwmon_read(self, *args):
from sonic_platform import thermal_bf3 as thermal
from sonic_platform.thermal_bf3 import Thermal
from sonic_platform.thermal_bf3 import Thermal, ThermalType

thermal.read_fs = MagicMock(return_value=83123)
sensor = {'name': 'test', 'hwmon_path': '/tmp/', 'ht': 95, 'cht': 100}
t = Thermal(**sensor)
t = Thermal(name='test', thermal_type=ThermalType.SFP, hwmon_path='/tmp/', ht=95, cht=100)
assert t.get_temperature() == 83.123
assert t.get_high_critical_threshold() == 83.123


def test_ssd(self, *args):
    """SSD thermal reports temperature and thresholds parsed from smartctl."""
    from sonic_platform import thermal_bf3 as thermal
    from sonic_platform.thermal_bf3 import initialize_ssd_thermals

    sample = json.loads(smartctl_output)
    # Scope the replacement so the real read_smartctl is restored afterwards
    # and the mock cannot leak into other tests.
    with patch.object(thermal, 'read_smartctl', MagicMock(return_value=sample)):
        t = initialize_ssd_thermals()
        assert len(t) == 1
        assert t[0].get_temperature() == 42
        assert t[0].get_high_threshold() == 90
        assert t[0].get_high_critical_threshold() == 100


def test_thermal_get(self, *args):
from sonic_platform import thermal_bf3 as thermal
from sonic_platform.thermal_bf3 import Thermal
from sonic_platform.thermal_bf3 import Thermal, ThermalType

temp_test_mocked_vals = [123, 10.5, -1, None]

for tv in temp_test_mocked_vals:
thermal.read_temp_mlxbf = MagicMock(return_value=tv)
sensor = {'name': 'test', 'mlxbf_sensor_name': 'test', 'ht': 95, 'cht': 100}
t = Thermal(**sensor)
t = Thermal(name='test', thermal_type=ThermalType.MLXBF, mlxbf_sensor_name='test', ht=95, cht=100)
assert t.get_temperature() == tv
assert t.get_high_threshold() == sensor['ht']
assert t.get_high_critical_threshold() == sensor['cht']
assert t.get_high_threshold() == 95

assert t.get_high_critical_threshold() == 100
assert t.get_low_threshold() == 'N/A'
assert t.get_low_critical_threshold() == 'N/A'

for tv in temp_test_mocked_vals:
thermal.read_temp_hwmon = MagicMock(return_value=tv)
sensor = {'name': 'test', 'hwmon_path': '/tmp/', 'ht': 95, 'cht': 100}
t = Thermal(**sensor)
t = Thermal(name='test', thermal_type=ThermalType.SFP, hwmon_path='/tmp/', ht=95)
assert t.get_temperature() == tv
assert t.get_high_threshold() == sensor['ht']
assert t.get_high_threshold() == 95
assert t.get_high_critical_threshold() == tv
assert t.get_low_threshold() == 'N/A'
assert t.get_low_critical_threshold() == 'N/A'
12 changes: 12 additions & 0 deletions platform/nvidia-bluefield/platform-api/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,18 @@
}
"""

# Minimal smartctl-style JSON fixture: a few "smartctl_NNNN_u" text lines
# (including the warning and critical temperature threshold lines) and a
# "temperature.current" reading.
smartctl_output = """
{
"smartctl_0004_u": "=== START OF INFORMATION SECTION ===",
"smartctl_0023_u": "Maximum Data Transfer Size: 512 Pages",
"smartctl_0024_u": "Warning Comp. Temp. Threshold: 90 Celsius",
"smartctl_0025_u": "Critical Comp. Temp. Threshold: 100 Celsius",
"temperature": {
"current": 42
}
}
"""

# Utilities for throttling tests
class LogRecorderMock(object):
def __init__(self):
Expand Down

0 comments on commit 0114373

Please sign in to comment.