Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] update asic and module temperature in a thread #197

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ class Chassis(ChassisBase):
# System UID LED
_led_uid = None

chassis_instance = None

def __init__(self):
super(Chassis, self).__init__()

Expand Down Expand Up @@ -124,6 +126,7 @@ def __init__(self):
self._RJ45_port_inited = False
self._RJ45_port_list = None

Chassis.chassis_instance = self
logger.log_info("Chassis loaded successfully")

def __del__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,3 +227,12 @@ def get_cpld_component_list(cls):
# Currently, only fetching BIOS version is supported
return ComponentCPLDSN2201.get_component_list()
return ComponentCPLD.get_component_list()

@classmethod
@utils.read_only_cache()
def is_independent_mode(cls):
    """Tell whether sai.profile enables independent module mode.

    Reads ``sai.profile`` from the hwsku directory reported by
    ``device_info.get_paths_to_platform_and_hwsku_dirs()`` as '='-separated
    key/value pairs.

    Returns:
        bool: True only if ``SAI_INDEPENDENT_MODULE_MODE`` is exactly '1'.
    """
    from sonic_py_common import device_info

    _platform_dir, hwsku_dir = device_info.get_paths_to_platform_and_hwsku_dirs()
    profile = utils.read_key_value_file(
        os.path.join(hwsku_dir, 'sai.profile'),
        delimeter='=',
    )
    mode_flag = profile.get('SAI_INDEPENDENT_MODULE_MODE')
    return mode_flag == '1'
82 changes: 82 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
from . import utils
from .device_data import DeviceDataManager
from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase
from sonic_platform_base.sonic_xcvr.fields import consts
from sonic_platform_base.sonic_xcvr.api.public import sff8636, sff8436

except ImportError as e:
raise ImportError (str(e) + "- required module not found")
Expand Down Expand Up @@ -135,6 +137,10 @@
# SFP stderr
SFP_EEPROM_NOT_AVAILABLE = 'Input/output error'

# Fallback returned by get_temperature_warning_threashold() when no threshold can be read
# (presumably degrees Celsius - TODO confirm)
SFP_DEFAULT_TEMP_WARNNING_THRESHOLD = 70.0
# Fallback returned by get_temperature_critical_threashold() when no threshold can be read
SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD = 80.0
# Divisor applied to the raw integers read from the sx_core module temperature sysfs nodes
SFP_TEMPERATURE_SCALE = 8.0

# SFP EEPROM limited bytes
limited_eeprom = {
SFP_TYPE_CMIS: {
Expand Down Expand Up @@ -527,6 +533,71 @@ def get_tx_fault(self):
api = self.get_xcvr_api()
return [False] * api.NUM_CHANNELS if api else None

def get_temperature(self):
    """Get the module temperature

    When the module is not under software control the raw value is read from
    the sx_core sysfs ``temperature/input`` node and divided by
    SFP_TEMPERATURE_SCALE. Otherwise the base class implementation is used.

    Returns:
        The temperature; None if the sysfs value could not be read; 0.0 if
        determining the control mode or reading the sysfs node raised an
        exception. The software-control path returns whatever the base class
        returns.
    """
    try:
        if not self.is_sw_control():
            temperature = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input',
                default=None)
            return temperature / SFP_TEMPERATURE_SCALE if temperature is not None else None
    except Exception:
        # Catch Exception rather than using a bare except so that
        # SystemExit/KeyboardInterrupt are not swallowed. is_sw_control()
        # raises while the module is still being initialized.
        return 0.0

    return super().get_temperature()

def get_temperature_warning_threashold(self):
    """Get temperature warning threshold

    When the module is not under software control the raw value is read from
    the sx_core sysfs ``temperature/emergency`` node and divided by
    SFP_TEMPERATURE_SCALE. Otherwise the high-warning entry of the thresholds
    returned by ``_get_temperature_threshold()`` is used.

    Returns:
        The temperature warning threshold, or
        SFP_DEFAULT_TEMP_WARNNING_THRESHOLD when it cannot be determined.
    """
    # TODO: retry reading threshold
    try:
        if not self.is_sw_control():
            emergency = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
                log_func=None,
                default=None)
            return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
    except Exception:
        # Catch Exception rather than using a bare except so that
        # SystemExit/KeyboardInterrupt are not swallowed.
        return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_WARNING_FIELD]
    return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

def get_temperature_critical_threashold(self):
    """Get temperature critical threshold

    When the module is not under software control the raw value is read from
    the sx_core sysfs ``temperature/critical`` node and divided by
    SFP_TEMPERATURE_SCALE. Otherwise the high-alarm entry of the thresholds
    returned by ``_get_temperature_threshold()`` is used.

    Returns:
        The temperature critical threshold, or
        SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD when it cannot be determined.
    """
    try:
        if not self.is_sw_control():
            critical = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
                log_func=None,
                default=None)
            return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
    except Exception:
        # Catch Exception rather than using a bare except so that
        # SystemExit/KeyboardInterrupt are not swallowed.
        return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_ALARM_FIELD]
    return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD

def _get_temperature_threshold(self):
    """Read the threshold data from the module EEPROM.

    Returns:
        The value read from the xcvr EEPROM for the threshold field, or None
        when there is no xcvr API or the API reports that thresholds are not
        supported.
    """
    xcvr_api = self.get_xcvr_api()
    if not xcvr_api:
        return None

    if not xcvr_api.get_transceiver_thresholds_support():
        return None

    # SFF-8636 / SFF-8436 APIs use a dedicated temperature threshold field
    if isinstance(xcvr_api, (sff8636.Sff8636Api, sff8436.Sff8436Api)):
        return xcvr_api.xcvr_eeprom.read(consts.TEMP_THRESHOLDS_FIELD)
    return xcvr_api.xcvr_eeprom.read(consts.THRESHOLDS_FIELD)

def get_xcvr_api(self):
"""
Retrieves the XcvrApi associated with this SFP
Expand All @@ -541,6 +612,17 @@ def get_xcvr_api(self):
self._xcvr_api.get_tx_fault = self.get_tx_fault
return self._xcvr_api

def is_sw_control(self):
    """Check whether this module is under software control.

    Returns:
        bool: False when the platform is not in independent mode; otherwise
        whether the ``control_type`` stored in STATE_DB for this module equals
        'SW_CONTROL'.

    Raises:
        Exception: independent mode is on but STATE_DB has no ``control_type``
            for this module yet.
    """
    if DeviceDataManager.is_independent_mode():
        state_db = utils.DbUtils.get_db_instance('STATE_DB')
        mode = state_db.get('STATE_DB', f'TRANSCEIVER_MODULES_MGMT|{self.sdk_index}', 'control_type')
        if mode:
            return mode == 'SW_CONTROL'
        raise Exception(f'Module {self.sdk_index} is in initialization, please retry later')

    return False


class RJ45Port(NvidiaSFPCommon):
"""class derived from SFP, representing RJ45 ports"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,36 @@
# limitations under the License.
#
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
from . import thermal_updater
from .device_data import DeviceDataManager


class ThermalManager(ThermalManagerBase):
    """Thermal manager that runs a ThermalUpdater when independent mode is on."""

    # ThermalUpdater instance created by initialize(); None until then
    thermal_updater_task = None

    @classmethod
    def run_policy(cls, chassis):
        """No-op: this implementation runs no thermal policy.

        :param chassis: unused
        """
        pass

    @classmethod
    def initialize(cls):
        """
        Initialize thermal manager. In independent mode, create a ThermalUpdater
        for all SFPs of the loaded chassis and start it.
        :return:
        """
        if DeviceDataManager.is_independent_mode():
            # Imported here rather than at module level
            # (presumably to avoid a circular import - TODO confirm)
            from .chassis import Chassis
            cls.thermal_updater_task = thermal_updater.ThermalUpdater(Chassis.chassis_instance.get_all_sfps())
            cls.thermal_updater_task.start()

    @classmethod
    def deinitialize(cls):
        """
        Destroy thermal manager. In independent mode, stop the ThermalUpdater
        started by initialize(); a no-op if initialize() never created one.
        :return:
        """
        # Guard against deinitialize() being called without a prior
        # initialize(), which would otherwise raise AttributeError on None.
        if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task is not None:
            cls.thermal_updater_task.stop()
205 changes: 205 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/thermal_updater.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from . import utils
from sonic_py_common import logger

import sys
import time

# Make the module imported below discoverable from the hw-management bin directory
sys.path.append('/run/hw-management/bin')

try:
    import hw_management_independent_mode_update
except ImportError:
    # For unit test only: substitute a mock exposing the four functions this file calls
    from unittest import mock
    hw_management_independent_mode_update = mock.MagicMock()
    hw_management_independent_mode_update.configure_mock(
        thermal_data_set_asic=mock.MagicMock(),
        thermal_data_set_module=mock.MagicMock(),
        thermal_data_clean_asic=mock.MagicMock(),
        thermal_data_clean_module=mock.MagicMock(),
    )


# Multiplier applied to module temperature/threshold values before they are published
SFP_TEMPERATURE_SCALE = 1000
# Multiplier applied to the raw ASIC values read from sx_core sysfs
ASIC_TEMPERATURE_SCALE = 125
# Fallbacks used when the ASIC threshold sysfs nodes cannot be read
# (presumably already in the published unit - TODO confirm)
ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD = 105000
ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD = 120000

# Value published in place of a temperature/threshold that could not be read
ERROR_READ_THERMAL_DATA = -254

# JSON file from which the polling intervals are loaded
TC_CONFIG_FILE = '/run/hw-management/config/tc_config.json'
# NOTE: rebinds the name `logger` from the imported module to a Logger instance
logger = logger.Logger('thermal-updater')


class ThermalUpdater:
    """Periodically publish ASIC and module thermal data.

    ASIC values are read from sx_core sysfs nodes, module values from the SFP
    objects. Both are passed to the ``hw_management_independent_mode_update``
    helper functions from callbacks scheduled on a ``utils.Timer``.
    """

    def __init__(self, sfp_list):
        """
        Args:
            sfp_list: SFP objects to monitor. Each is used through
                ``sdk_index``, ``is_sw_control()``, ``get_presence()``,
                ``get_temperature()`` and the two temperature threshold getters.
        """
        self._sfp_list = sfp_list
        # Last presence value recorded per module, keyed by sdk_index
        self._sfp_status = {}
        self._timer = utils.Timer()

    def load_tc_config(self):
        """Load polling intervals from TC_CONFIG_FILE and schedule the update callbacks.

        Defaults are 1 (ASIC) and 10 (module). A configured ``poll_time`` is
        halved before use.
        """
        asic_poll_interval = 1
        sfp_poll_interval = 10
        data = utils.load_json_file(TC_CONFIG_FILE)
        if not data:
            logger.log_notice(f'{TC_CONFIG_FILE} does not exist, use default polling interval')

        if data:
            dev_parameters = data.get('dev_parameters')
            if dev_parameters is not None:
                asic_parameter = dev_parameters.get('asic')
                if asic_parameter is not None:
                    asic_poll_interval_config = asic_parameter.get('poll_time')
                    if asic_poll_interval_config:
                        asic_poll_interval = int(asic_poll_interval_config) / 2
                # The config key is the literal pattern text 'module\d+'
                module_parameter = dev_parameters.get('module\\d+')
                if module_parameter is not None:
                    sfp_poll_interval_config = module_parameter.get('poll_time')
                    if sfp_poll_interval_config:
                        sfp_poll_interval = int(sfp_poll_interval_config) / 2

        logger.log_notice(f'ASIC polling interval: {asic_poll_interval}')
        self._timer.schedule(asic_poll_interval, self.update_asic)
        logger.log_notice(f'Module polling interval: {sfp_poll_interval}')
        self._timer.schedule(sfp_poll_interval, self.update_module)

    def start(self):
        """Clean stale thermal data, wait for the SFPs, then start polling.

        If the SFPs do not become ready in time, hw-management-tc is left
        suspended and polling is not started.
        """
        self.clean_thermal_data()
        if not self.wait_all_sfp_ready():
            logger.log_error('Failed to wait for all SFP ready, will put hw-management-tc to suspend')
            self.control_tc(True)
            return
        self.control_tc(False)
        self.load_tc_config()
        self._timer.start()

    def stop(self):
        """Stop polling and suspend hw-management-tc."""
        self._timer.stop()
        self.control_tc(True)

    def control_tc(self, suspend):
        """Write 1 (suspend) or 0 (resume) to the hw-management suspend file.

        Args:
            suspend: True to suspend hw-management-tc, False to resume it.
        """
        logger.log_notice(f'Set hw-management-tc to {"suspend" if suspend else "resume"}')
        utils.write_file('/run/hw-management/config/suspend', 1 if suspend else 0)

    def clean_thermal_data(self):
        """Clean the published thermal data of the ASIC and of every module."""
        hw_management_independent_mode_update.thermal_data_clean_asic(0)
        for sfp in self._sfp_list:
            hw_management_independent_mode_update.thermal_data_clean_module(
                0,
                sfp.sdk_index + 1
            )

    def wait_all_sfp_ready(self):
        """Wait until ``is_sw_control()`` succeeds for every SFP.

        Polls up to 60 times, sleeping one second between polls.

        Returns:
            bool: True once every SFP answered without raising, False if some
            still raised after the last poll.
        """
        logger.log_notice('Waiting for all SFP modules ready...')
        # TODO: test whether 60 seconds is enough
        max_wait_time = 60
        # TODO: confirm with hw-mgmt whether control is per cable
        ready_set = set()
        while len(ready_set) != len(self._sfp_list):
            for sfp in self._sfp_list:
                try:
                    # Raises while the module is not ready yet
                    sfp.is_sw_control()
                    ready_set.add(sfp)
                except Exception:
                    continue
            # Check readiness before counting down, so that success on the
            # last poll is not reported as a timeout and no extra sleep is
            # spent once everything is ready.
            if len(ready_set) == len(self._sfp_list):
                break
            max_wait_time -= 1
            if max_wait_time == 0:
                return False
            time.sleep(1)

        logger.log_notice('All SFP modules are ready')
        return True

    def get_asic_temp(self):
        """Return the ASIC temperature scaled by ASIC_TEMPERATURE_SCALE, or None if unreadable."""
        temperature = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/input', default=None)
        return temperature * ASIC_TEMPERATURE_SCALE if temperature is not None else None

    def get_asic_temp_warning_threashold(self):
        """Return the scaled ASIC warning threshold, or the default if unreadable."""
        emergency = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/emergency', default=None, log_func=None)
        return emergency * ASIC_TEMPERATURE_SCALE if emergency is not None else ASIC_DEFAULT_TEMP_WARNNING_THRESHOLD

    def get_asic_temp_critical_threashold(self):
        """Return the scaled ASIC critical threshold, or the default if unreadable."""
        critical = utils.read_int_from_file('/sys/module/sx_core/asic0/temperature/critical', default=None, log_func=None)
        return critical * ASIC_TEMPERATURE_SCALE if critical is not None else ASIC_DEFAULT_TEMP_CRITICAL_THRESHOLD

    def update_single_module(self, sfp):
        """Publish the thermal data of one module.

        A present module has its temperature and thresholds published. A
        module that just became absent has its data cleaned. On any exception
        ERROR_READ_THERMAL_DATA is published for all three values.

        Args:
            sfp: the SFP object to update.
        """
        try:
            presence = sfp.get_presence()
            pre_presence = self._sfp_status.get(sfp.sdk_index)
            if presence:
                temperature = sfp.get_temperature()
                if temperature == 0:
                    # A zero temperature is published with zero thresholds
                    warning_thresh = 0
                    critical_thresh = 0
                else:
                    warning_thresh = sfp.get_temperature_warning_threashold()
                    critical_thresh = sfp.get_temperature_critical_threashold()
                temperature = ERROR_READ_THERMAL_DATA if temperature is None else int(temperature * SFP_TEMPERATURE_SCALE)
                warning_thresh = ERROR_READ_THERMAL_DATA if warning_thresh is None else int(warning_thresh * SFP_TEMPERATURE_SCALE)
                critical_thresh = ERROR_READ_THERMAL_DATA if critical_thresh is None else int(critical_thresh * SFP_TEMPERATURE_SCALE)

                hw_management_independent_mode_update.thermal_data_set_module(
                    0,  # ASIC index always 0 for now
                    sfp.sdk_index + 1,
                    temperature,
                    warning_thresh,
                    critical_thresh
                )
            else:
                if pre_presence != presence:
                    hw_management_independent_mode_update.thermal_data_clean_module(0, sfp.sdk_index + 1)

            if pre_presence != presence:
                self._sfp_status[sfp.sdk_index] = presence
        except Exception as e:
            # f-string prefix added: the placeholders were previously logged literally
            logger.log_error(f'Failed to update module {sfp.sdk_index} thermal data - {e}')
            hw_management_independent_mode_update.thermal_data_set_module(
                0,  # ASIC index always 0 for now
                sfp.sdk_index + 1,
                ERROR_READ_THERMAL_DATA,
                ERROR_READ_THERMAL_DATA,
                ERROR_READ_THERMAL_DATA
            )

    def update_module(self):
        """Publish the thermal data of every module in the SFP list."""
        for sfp in self._sfp_list:
            self.update_single_module(sfp)

    def update_asic(self):
        """Publish the ASIC temperature and thresholds.

        An unreadable temperature is published as ERROR_READ_THERMAL_DATA. On
        any exception ERROR_READ_THERMAL_DATA is published for all three values.
        """
        try:
            # Read once and test against None, so a valid reading of 0 is not
            # mistaken for a read failure and the node is not read twice.
            asic_temp = self.get_asic_temp()
            hw_management_independent_mode_update.thermal_data_set_asic(
                0,  # ASIC index always 0 for now
                ERROR_READ_THERMAL_DATA if asic_temp is None else asic_temp,
                self.get_asic_temp_warning_threashold(),
                self.get_asic_temp_critical_threashold()
            )
        except Exception as e:
            # f-string prefix added: the placeholder was previously logged literally
            logger.log_error(f'Failed to update ASIC thermal data - {e}')
            hw_management_independent_mode_update.thermal_data_set_asic(
                0,  # ASIC index always 0 for now
                ERROR_READ_THERMAL_DATA,
                ERROR_READ_THERMAL_DATA,
                ERROR_READ_THERMAL_DATA,
            )
Loading