Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] update asic and module temperature in a thread #197

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ class Chassis(ChassisBase):
# System UID LED
_led_uid = None

chassis_instance = None

def __init__(self):
super(Chassis, self).__init__()

Expand Down Expand Up @@ -124,6 +126,7 @@ def __init__(self):
self._RJ45_port_inited = False
self._RJ45_port_list = None

Chassis.chassis_instance = self
logger.log_info("Chassis loaded successfully")

def __del__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,3 +227,12 @@ def get_cpld_component_list(cls):
# Currently, only fetching BIOS version is supported
return ComponentCPLDSN2201.get_component_list()
return ComponentCPLD.get_component_list()

@classmethod
@utils.read_only_cache()
def is_independent_mode(cls):
    """Tell whether the HwSKU's sai.profile enables independent module mode.

    Reads ``sai.profile`` from the HwSKU directory as '='-separated key/value
    pairs and checks the ``SAI_INDEPENDENT_MODULE_MODE`` key.

    Returns:
        bool: True only when SAI_INDEPENDENT_MODULE_MODE is the string '1'.
    """
    # NOTE(review): decorated with utils.read_only_cache(), so the result is
    # presumably computed once and reused - confirm against utils.
    from sonic_py_common import device_info

    _platform_dir, hwsku_dir = device_info.get_paths_to_platform_and_hwsku_dirs()
    profile = utils.read_key_value_file(os.path.join(hwsku_dir, 'sai.profile'), delimeter='=')
    mode_flag = profile.get('SAI_INDEPENDENT_MODULE_MODE')
    return mode_flag == '1'
87 changes: 87 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
from . import utils
from .device_data import DeviceDataManager
from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase
from sonic_platform_base.sonic_xcvr.fields import consts
from sonic_platform_base.sonic_xcvr.api.public import sff8636, sff8436

except ImportError as e:
raise ImportError (str(e) + "- required module not found")
Expand Down Expand Up @@ -135,6 +137,10 @@
# SFP stderr
SFP_EEPROM_NOT_AVAILABLE = 'Input/output error'

# Fallback temperature thresholds returned when a module threshold cannot be
# read (presumably degrees Celsius - TODO confirm).
SFP_DEFAULT_TEMP_WARNNING_THRESHOLD = 70.0
SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD = 80.0
# Divisor applied to the raw integer read from the sx_core sysfs temperature
# files before the value is returned to callers.
SFP_TEMPERATURE_SCALE = 8.0

# SFP EEPROM limited bytes
limited_eeprom = {
SFP_TYPE_CMIS: {
Expand Down Expand Up @@ -406,6 +412,11 @@ def get_error_description(self):
error_description = "Unknow SFP module status ({})".format(oper_status)
return error_description

def get_error(self):
    """Report whether this module is plugged in with an error.

    Reads the module status file located under
    SFP_SDK_MODULE_SYSFS_ROOT_TEMPLATE for this module's SDK index.

    Returns:
        int: 1 if the status read equals
        SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR, otherwise 0.
    """
    # The index must come from the instance: a bare ``sdk_index`` is not
    # defined in this scope and would raise NameError.
    status_file_path = SFP_SDK_MODULE_SYSFS_ROOT_TEMPLATE.format(self.sdk_index) + SFP_SYSFS_STATUS
    oper_state = utils.read_int_from_file(status_file_path)
    return 1 if oper_state == SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR else 0

def _get_eeprom_path(self):
    """Build this module's EEPROM path from SFP_EEPROM_ROOT_TEMPLATE and its SDK index."""
    eeprom_path = SFP_EEPROM_ROOT_TEMPLATE.format(self.sdk_index)
    return eeprom_path

Expand Down Expand Up @@ -527,6 +538,71 @@ def get_tx_fault(self):
api = self.get_xcvr_api()
return [False] * api.NUM_CHANNELS if api else None

def get_temperature(self):
    """Get the module temperature.

    When the module is not under software control, the raw value is read from
    the sx_core sysfs ``temperature/input`` file and divided by
    SFP_TEMPERATURE_SCALE. Otherwise the parent class implementation is used.

    Returns:
        float: the module temperature, or 0.0 when no reading is available or
        the lookup fails.
    """
    try:
        if not self.is_sw_control():
            temperature = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input',
                default=None)
            return temperature / SFP_TEMPERATURE_SCALE if temperature is not None else 0.0
    except Exception:
        # Best effort: any failure while determining the control mode or
        # reading sysfs yields 0.0. Only Exception is caught so that
        # KeyboardInterrupt/SystemExit still propagate.
        return 0.0

    # Software-controlled module: defer to the parent implementation.
    temperature = super().get_temperature()
    return temperature if temperature is not None else 0.0

def get_temperature_warning_threashold(self):
    """Get temperature warning threshold

    When the module is not under software control, the raw value is read from
    the sx_core sysfs ``temperature/emergency`` file and divided by
    SFP_TEMPERATURE_SCALE. Otherwise the value stored under
    consts.TEMP_HIGH_WARNING_FIELD in the EEPROM thresholds is used.

    Returns:
        float: temperature warning threshold on the sysfs and default paths
        (the EEPROM path returns the stored value as is);
        SFP_DEFAULT_TEMP_WARNNING_THRESHOLD when no threshold is available.
    """
    # TODO: retry reading threshold
    # NOTE(review): the warning threshold is taken from the sysfs file named
    # 'emergency' - confirm this mapping is intended.
    try:
        if not self.is_sw_control():
            emergency = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
                log_func=None,
                default=None)
            return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
    except Exception:
        # Best effort: fall back to the default on any failure, but let
        # KeyboardInterrupt/SystemExit propagate.
        return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_WARNING_FIELD]
    return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

def get_temperature_critical_threashold(self):
    """Get temperature critical threshold

    When the module is not under software control, the raw value is read from
    the sx_core sysfs ``temperature/critical`` file and divided by
    SFP_TEMPERATURE_SCALE. Otherwise the value stored under
    consts.TEMP_HIGH_ALARM_FIELD in the EEPROM thresholds is used.

    Returns:
        float: temperature critical threshold on the sysfs and default paths
        (the EEPROM path returns the stored value as is);
        SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD when no threshold is available.
    """
    try:
        if not self.is_sw_control():
            critical = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
                log_func=None,
                default=None)
            return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
    except Exception:
        # Best effort: fall back to the default on any failure, but let
        # KeyboardInterrupt/SystemExit propagate.
        return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_ALARM_FIELD]
    return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD

def _get_temperature_threshold(self):
    """Read the threshold data from the transceiver EEPROM.

    Returns:
        The value read from the xcvr EEPROM: consts.TEMP_THRESHOLDS_FIELD for
        Sff8636Api/Sff8436Api instances, consts.THRESHOLDS_FIELD for any other
        API. None when no xcvr API is available or the API reports that
        thresholds are not supported.
    """
    xcvr_api = self.get_xcvr_api()
    if not xcvr_api:
        return None

    if not xcvr_api.get_transceiver_thresholds_support():
        return None

    sff_api_types = (sff8636.Sff8636Api, sff8436.Sff8436Api)
    if isinstance(xcvr_api, sff_api_types):
        field_name = consts.TEMP_THRESHOLDS_FIELD
    else:
        field_name = consts.THRESHOLDS_FIELD
    return xcvr_api.xcvr_eeprom.read(field_name)

def get_xcvr_api(self):
"""
Retrieves the XcvrApi associated with this SFP
Expand All @@ -541,6 +617,17 @@ def get_xcvr_api(self):
self._xcvr_api.get_tx_fault = self.get_tx_fault
return self._xcvr_api

def is_sw_control(self):
    """Tell whether this module is under software control.

    Returns:
        bool: False when the platform is not in independent mode; otherwise
        whether the ``control_type`` stored in STATE_DB for this module
        equals 'SW_CONTROL'.

    Raises:
        Exception: when independent mode is enabled but STATE_DB holds no
        ``control_type`` for this module yet.
    """
    if DeviceDataManager.is_independent_mode():
        state_db = utils.DbUtils.get_db_instance('STATE_DB')
        table_key = f'TRANSCEIVER_MODULES_MGMT|{self.sdk_index}'
        control_type = state_db.get('STATE_DB', table_key, 'control_type')
        if control_type:
            return control_type == 'SW_CONTROL'
        # An empty/missing entry means the module has not been set up yet.
        raise Exception(f'Module {self.sdk_index} is in initialization, please retry later')
    return False


class RJ45Port(NvidiaSFPCommon):
"""class derived from SFP, representing RJ45 ports"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,36 @@
# limitations under the License.
#
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
from . import thermal_updater
from .device_data import DeviceDataManager


class ThermalManager(ThermalManagerBase):
    """Thermal manager that runs a ThermalUpdater task in independent mode."""

    # ThermalUpdater created by initialize(); stays None until then and
    # whenever the platform is not in independent mode.
    thermal_updater_task = None

    @classmethod
    def run_policy(cls, chassis):
        """Do nothing; this override intentionally runs no thermal policy."""
        pass

    @classmethod
    def initialize(cls):
        """
        Initialize thermal manager, including register thermal condition types and thermal action types
        and any other vendor specific initialization.

        In independent mode this creates a ThermalUpdater over all SFPs of the
        chassis instance and starts it.
        :return:
        """
        if DeviceDataManager.is_independent_mode():
            # Imported locally rather than at module level (presumably to
            # avoid a circular import - TODO confirm).
            from .chassis import Chassis
            cls.thermal_updater_task = thermal_updater.ThermalUpdater(Chassis.chassis_instance.get_all_sfps())
            cls.thermal_updater_task.start()

    @classmethod
    def deinitialize(cls):
        """
        Destroy thermal manager, including any vendor specific cleanup.

        Stops the ThermalUpdater task if one was started by initialize().
        :return:
        """
        # Guard against deinitialize() being called when initialize() never
        # created the task; calling stop() on None would raise AttributeError.
        if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task is not None:
            cls.thermal_updater_task.stop()
Loading