From 0af24ab58c8612377cb25266ffe9e795614f97df Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Wed, 21 Jun 2023 10:44:37 -0700 Subject: [PATCH 01/22] Add initial support for sff_mgr --- sonic-xcvrd/tests/test_xcvrd.py | 210 +++++++- sonic-xcvrd/xcvrd/xcvrd.py | 483 ++++++++++++++++-- .../xcvrd/xcvrd_utilities/port_mapping.py | 51 +- 3 files changed, 685 insertions(+), 59 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index 2581c7a0d..f475d7fa4 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -43,6 +43,27 @@ class TestXcvrdThreadException(object): + @patch('xcvrd.xcvrd.platform_chassis', MagicMock()) + @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', MagicMock(side_effect = NotImplementedError)) + def test_SffManagerTask_task_run_with_exception(self): + port_mapping = PortMapping() + stop_event = threading.Event() + sff_mgr = SffManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + exception_received = None + trace = None + try: + sff_mgr.start() + sff_mgr.join() + except Exception as e1: + exception_received = e1 + trace = traceback.format_exc() + + assert not sff_mgr.is_alive() + assert(type(exception_received) == NotImplementedError) + assert("NotImplementedError" in str(trace) and "effect" in str(trace)) + assert("sonic-xcvrd/xcvrd/xcvrd.py" in str(trace)) + assert("subscribe_port_update_event" in str(trace)) + @patch('xcvrd.xcvrd.platform_chassis', MagicMock()) def test_CmisManagerTask_task_run_with_exception(self): port_mapping = PortMapping() @@ -109,8 +130,10 @@ def test_SfpStateUpdateTask_task_run_with_exception(self): @patch('xcvrd.xcvrd.SfpStateUpdateTask.is_alive', MagicMock(return_value = False)) @patch('xcvrd.xcvrd.DomInfoUpdateTask.is_alive', MagicMock(return_value = False)) @patch('xcvrd.xcvrd.CmisManagerTask.is_alive', MagicMock(return_value = False)) - @patch('xcvrd.xcvrd.CmisManagerTask.join', MagicMock(side_effect = NotImplementedError)) + @patch('xcvrd.xcvrd.SffManagerTask.is_alive', MagicMock(return_value=False)) + @patch('xcvrd.xcvrd.CmisManagerTask.join', MagicMock(side_effect=NotImplementedError)) @patch('xcvrd.xcvrd.CmisManagerTask.start', MagicMock()) + @patch('xcvrd.xcvrd.SffManagerTask.start', MagicMock()) @patch('xcvrd.xcvrd.DomInfoUpdateTask.start', MagicMock()) @patch('xcvrd.xcvrd.SfpStateUpdateTask.start', MagicMock()) @patch('xcvrd.xcvrd.DaemonXcvrd.deinit', MagicMock()) @@ -118,16 +141,21 @@ def test_SfpStateUpdateTask_task_run_with_exception(self): @patch('xcvrd.xcvrd.DaemonXcvrd.init') @patch('xcvrd.xcvrd.DomInfoUpdateTask.join') @patch('xcvrd.xcvrd.SfpStateUpdateTask.join') - def test_DaemonXcvrd_run_with_exception(self, mock_task_join1, mock_task_join2, mock_init, mock_os_kill): + @patch('xcvrd.xcvrd.SffManagerTask.join') + def test_DaemonXcvrd_run_with_exception(self, mock_task_join_sff, mock_task_join_sfp, + mock_task_join_dom, mock_init, mock_os_kill): mock_init.return_value = (PortMapping(), set()) xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER) + xcvrd.enable_sff_mgr = True + xcvrd.load_feature_flags = MagicMock() xcvrd.stop_event.wait = MagicMock() xcvrd.run() - assert len(xcvrd.threads) == 3 + assert len(xcvrd.threads) == 4 assert mock_init.call_count == 1 - assert mock_task_join1.call_count == 1 - assert mock_task_join2.call_count == 1 + assert mock_task_join_sff.call_count == 1 + assert mock_task_join_sfp.call_count == 1 + assert mock_task_join_dom.call_count == 1 assert mock_os_kill.call_count == 1 class TestXcvrdScript(object): @@ -593,6 +621,159 @@ def test_DaemonXcvrd_run(self, mock_task_stop1, mock_task_stop2, mock_task_run1, assert mock_deinit.call_count == 1 assert mock_init.call_count == 1 + def test_SffManagerTask_handle_port_change_event(self): + port_mapping = PortMapping() + stop_event = threading.Event() + task = SffManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + + port_change_event = PortChangeEvent('PortConfigDone', -1, 0, PortChangeEvent.PORT_SET) + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 0 + + port_change_event = PortChangeEvent('PortInitDone', -1, 0, PortChangeEvent.PORT_SET) + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 0 + + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_ADD) + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 0 + + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_REMOVE) + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 0 + + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_DEL) + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 0 + + port_dict = {'type': 'QSFP28', 'channel': '0', 'host_tx_ready': 'false'} + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, port_dict) + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 1 + + port_change_event = PortChangeEvent('Ethernet0', -1, 0, PortChangeEvent.PORT_DEL, {}, + 'STATE_DB', 'TRANSCEIVER_INFO') + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 1 + + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_DEL, {}, + 'CONFIG_DB', 'PORT_TABLE') + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 0 + + @patch.object(XcvrTableHelper, 'get_state_port_tbl', return_value=MagicMock()) + def test_SffManagerTask_get_host_tx_status(self, mock_get_state_port_tbl): + mock_get_state_port_tbl.return_value.get.return_value = (True, {'host_tx_ready': 'true'}) + + sff_manager_task = SffManagerTask(namespaces=DEFAULT_NAMESPACE, + port_mapping=PortMapping(), + main_thread_stop_event=threading.Event()) + sff_manager_task.port_mapping.get_asic_id_for_logical_port = MagicMock(return_value=1) + + lport = 'Ethernet0' + assert sff_manager_task.get_host_tx_status(lport) == 'true' + sff_manager_task.port_mapping.get_asic_id_for_logical_port.assert_called_once_with(lport) + mock_get_state_port_tbl.assert_called_once_with(1) + mock_get_state_port_tbl.return_value.get.assert_called_once_with(lport) + + @patch('xcvrd.xcvrd.platform_chassis') + @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', + MagicMock(return_value=(None, None))) + @patch('xcvrd.xcvrd_utilities.port_mapping.handle_port_update_event', MagicMock()) + def test_SffManagerTask_task_worker(self, mock_chassis): + mock_xcvr_api = MagicMock() + mock_xcvr_api.tx_disable_channel = MagicMock(return_value=True) + mock_xcvr_api.is_flat_memory = MagicMock(return_value=False) + mock_xcvr_api.is_copper = MagicMock(return_value=False) + mock_xcvr_api.get_tx_disable_support = MagicMock(return_value=True) + + mock_sfp = MagicMock() + mock_sfp.get_presence = MagicMock(return_value=True) + mock_sfp.get_xcvr_api = MagicMock(return_value=mock_xcvr_api) + + mock_chassis.get_all_sfps = MagicMock(return_value=[mock_sfp]) + mock_chassis.get_sfp = MagicMock(return_value=mock_sfp) + + task = SffManagerTask(namespaces=DEFAULT_NAMESPACE, + port_mapping=PortMapping(), + main_thread_stop_event=threading.Event()) + + # TX enable case: + port_dict = {'type': 'QSFP28', 'channel': '0'} + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, port_dict) + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 1 + task.get_host_tx_status = MagicMock(return_value='true') + mock_xcvr_api.get_tx_disable = MagicMock(return_value=[True, True, True, True]) + task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) + task.task_worker() + assert mock_xcvr_api.tx_disable_channel.call_count == 1 + assert task.get_host_tx_status.call_count == 1 + + # TX disable case: + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, + {'host_tx_ready': 'false'}) + task.on_port_update_event(port_change_event) + assert len(task.port_dict) == 1 + mock_xcvr_api.get_tx_disable = MagicMock(return_value=[False, False, False, False]) + task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) + task.task_worker() + assert mock_xcvr_api.tx_disable_channel.call_count == 2 + + # No insertion and no change on host_tx_ready + task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) + task.task_worker() + assert task.port_dict == task.port_dict_prev + assert mock_xcvr_api.tx_disable_channel.call_count == 2 + + # flat memory case + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, + {'host_tx_ready': 'true'}) + task.on_port_update_event(port_change_event) + mock_xcvr_api.is_flat_memory = MagicMock(return_value=True) + mock_xcvr_api.is_flat_memory.call_count = 0 + task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) + task.task_worker() + assert mock_xcvr_api.is_flat_memory.call_count == 1 + assert mock_xcvr_api.tx_disable_channel.call_count == 2 + mock_xcvr_api.is_flat_memory = MagicMock(return_value=False) + + # copper case + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, + {'host_tx_ready': 'false'}) + task.on_port_update_event(port_change_event) + mock_xcvr_api.is_copper = MagicMock(return_value=True) + mock_xcvr_api.is_copper.call_count = 0 + task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) + task.task_worker() + assert mock_xcvr_api.is_copper.call_count == 1 + assert mock_xcvr_api.tx_disable_channel.call_count == 2 + mock_xcvr_api.is_copper = MagicMock(return_value=False) + + # tx_disable not supported case + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, + {'host_tx_ready': 'true'}) + task.on_port_update_event(port_change_event) + mock_xcvr_api.get_tx_disable_support = MagicMock(return_value=False) + mock_xcvr_api.get_tx_disable_support.call_count = 0 + task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) + task.task_worker() + assert mock_xcvr_api.get_tx_disable_support.call_count == 1 + assert mock_xcvr_api.tx_disable_channel.call_count == 2 + mock_xcvr_api.get_tx_disable_support = MagicMock(return_value=True) + + # sfp not present case + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, + {'host_tx_ready': 'false'}) + task.on_port_update_event(port_change_event) + mock_sfp.get_presence = MagicMock(return_value=False) + mock_sfp.get_presence.call_count = 0 + task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) + task.task_worker() + assert mock_sfp.get_presence.call_count == 1 + assert mock_xcvr_api.tx_disable_channel.call_count == 2 + mock_sfp.get_presence = MagicMock(return_value=True) + @patch('xcvrd.xcvrd._wrapper_get_sfp_type', MagicMock(return_value='QSFP_DD')) def test_CmisManagerTask_handle_port_change_event(self): port_mapping = PortMapping() @@ -872,7 +1053,7 @@ def test_CmisManagerTask_task_worker(self, mock_chassis): assert task.isPortConfigDone port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, - {'speed':'400000', 'lanes':'1,2,3,4,5,6,7,8'}) + {'speed': '400000', 'lanes': '1,2,3,4,5,6,7,8'}) task.on_port_update_event(port_change_event) assert len(task.port_dict) == 1 @@ -1498,6 +1679,23 @@ def test_DaemonXcvrd_init_deinit_fastboot_enabled(self): xcvrd.init() xcvrd.deinit() + @patch('builtins.open', new_callable=MagicMock) + @patch('xcvrd.xcvrd.DaemonXcvrd.get_platform_pmon_ctrl_file_path', return_value="dummy_path") + @patch('json.load', return_value={ENABLE_SFF_MGR_FLAG_NAME: True}) + def test_DaemonXcvrd_load_feature_flags(self, mock_json_load, mock_get_path, mock_open): + xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER) + xcvrd.load_feature_flags() + mock_get_path.assert_called_once() + mock_open.assert_called_once_with('dummy_path') + mock_json_load.assert_called_once() + assert xcvrd.enable_sff_mgr == True + + @patch('sonic_py_common.device_info.get_platform', MagicMock(return_value='x86_64-sonic-linux')) + @patch('os.path.isfile', MagicMock(return_value=True)) + def test_DaemonXcvrd_get_platform_pmon_ctrl_file_path(self): + xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER) + assert xcvrd.get_platform_pmon_ctrl_file_path( + ) == '/usr/share/sonic/platform/pmon_daemon_control.json' def wait_until(total_wait_time, interval, call_back, *args, **kwargs): wait_time = 0 diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index f59cf6433..575372f5f 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -87,6 +87,12 @@ POWER_UNIT = 'dBm' BIAS_UNIT = 'mA' +USR_SHARE_SONIC_PATH = "/usr/share/sonic" +HOST_DEVICE_PATH = USR_SHARE_SONIC_PATH + "/device" +CONTAINER_PLATFORM_PATH = USR_SHARE_SONIC_PATH + "/platform" +PLATFORM_PMON_CTRL_FILENAME = "pmon_daemon_control.json" +ENABLE_SFF_MGR_FLAG_NAME = "enable_xcvrd_sff_mgr" + g_dict = {} # Global platform specific sfputil class instance platform_sfputil = None @@ -864,6 +870,361 @@ def is_fast_reboot_enabled(): # Helper classes =============================================================== # +# Thread wrapper class for SFF compliant transceiver management + + +class SffManagerTask(threading.Thread): + # Subscribe to below tables in Redis DB + PORT_TBL_MAP = [ + { + 'CONFIG_DB': swsscommon.CFG_PORT_TABLE_NAME + }, + { + 'STATE_DB': 'TRANSCEIVER_INFO', + 'FILTER': ['type'] + }, + { + 'STATE_DB': 'PORT_TABLE', + 'FILTER': ['host_tx_ready'] + }, + ] + # Default number of channels for QSFP28/QSFP+ transceiver + DEFAULT_NUM_CHANNELS = 4 + + def __init__(self, namespaces, port_mapping, main_thread_stop_event): + threading.Thread.__init__(self) + self.name = "SffManagerTask" + self.exc = None + self.task_stopping_event = threading.Event() + self.main_thread_stop_event = main_thread_stop_event + self.port_dict = {} + # port_dict snapshot captured in the previous while loop + self.port_dict_prev = {} + self.port_mapping = copy.deepcopy(port_mapping) + self.xcvr_table_helper = XcvrTableHelper(namespaces) + self.namespaces = namespaces + + def log_notice(self, message): + helper_logger.log_notice("SFF: {}".format(message)) + + def log_warning(self, message): + helper_logger.log_warning("SFF: {}".format(message)) + + def log_error(self, message): + helper_logger.log_error("SFF: {}".format(message)) + + def on_port_update_event(self, port_change_event): + if (port_change_event.event_type + not in [port_change_event.PORT_SET, port_change_event.PORT_DEL]): + return + + lport = port_change_event.port_name + pport = port_change_event.port_index + + # This thread doesn't need to expilictly wait on PortInitDone and + # PortConfigDone events, as xcvrd main thread waits on them before + # spawrning this thread. + if lport in ['PortInitDone']: + return + + if lport in ['PortConfigDone']: + return + + # Skip if it's not a physical port + if not lport.startswith('Ethernet'): + return + + # Skip if the physical index is not available + if pport is None: + return + + if port_change_event.port_dict is None: + return + + if port_change_event.event_type == port_change_event.PORT_SET: + if lport not in self.port_dict: + self.port_dict[lport] = {} + if pport >= 0: + self.port_dict[lport]['index'] = pport + # This field comes from CONFIG_DB PORT_TABLE. This is the channel + # that blongs to this logical port, 0 means all channels. tx_disable + # API needs to know which channels to disable/enable for a + # particular physical port. + if 'channel' in port_change_event.port_dict: + self.port_dict[lport]['channel'] = port_change_event.port_dict['channel'] + # This field comes from STATE_DB PORT_TABLE + if 'host_tx_ready' in port_change_event.port_dict: + self.port_dict[lport]['host_tx_ready'] = \ + port_change_event.port_dict['host_tx_ready'] + # This field comes from STATE_DB TRANSCEIVER_INFO table. + # TRANSCEIVER_INFO has the same life cycle as a transceiver, if + # transceiver is inserted/removed, TRANSCEIVER_INFO is also + # created/deleted. Thus this filed can used to determine + # insertion/removal event. + if 'type' in port_change_event.port_dict: + self.port_dict[lport]['type'] = port_change_event.port_dict['type'] + # CONFIG_DB PORT_TABLE DEL case: + elif port_change_event.db_name and \ + port_change_event.db_name == 'CONFIG_DB': + # Only when port is removed from CONFIG, we consider this entry as deleted. + if lport in self.port_dict: + del self.port_dict[lport] + # STATE_DB TRANSCEIVER_INFO DEL case: + elif port_change_event.table_name and \ + port_change_event.table_name == 'TRANSCEIVER_INFO': + # TRANSCEIVER_INFO DEL corresponds to transceiver removal (not + # port/interface removal), so we only remove 'type' field to help + # determine xcvr removal/insertion event + if lport in self.port_dict and 'type' in self.port_dict[lport]: + del self.port_dict[lport]['type'] + + def get_host_tx_status(self, lport): + host_tx_ready = 'false' + + asic_index = self.port_mapping.get_asic_id_for_logical_port(lport) + state_port_tbl = self.xcvr_table_helper.get_state_port_tbl(asic_index) + + found, port_info = state_port_tbl.get(lport) + if found and 'host_tx_ready' in dict(port_info): + host_tx_ready = dict(port_info)['host_tx_ready'] + return host_tx_ready + + def run(self): + if platform_chassis is None: + self.log_notice("Platform chassis is not available, stopping...") + return + + try: + self.task_worker() + except Exception as e: + helper_logger.log_error("Exception occured at {} thread due to {}".format( + threading.current_thread().getName(), repr(e))) + exc_type, exc_value, exc_traceback = sys.exc_info() + msg = traceback.format_exception(exc_type, exc_value, exc_traceback) + for tb_line in msg: + for tb_line_split in tb_line.splitlines(): + helper_logger.log_error(tb_line_split) + self.exc = e + self.main_thread_stop_event.set() + + def join(self): + self.task_stopping_event.set() + threading.Thread.join(self) + if self.exc: + raise self.exc + + def calculate_tx_disable_delta_array(self, cur_tx_disable_array, tx_disable_flag, channel): + """ + Calculate the delta between the current transmitter (TX) disable array + and a new TX disable flag for a specific channel. The function returns a + new array (delta_array) where each entry indicates whether there's a + difference for each corresponding channel in the TX disable array. + + Args: + cur_tx_disable_array (list): Current array of TX disable flags for all channels. + tx_disable_flag (bool): The new TX disable flag that needs to be compared + with the current flags. + channel (int): The specific channel that needs to be checked. If channel is 0, all + channels are checked. + + Returns: + list: A boolean array where each entry indicates whether there's a + change for the corresponding + channel in the TX disable array. True means there's a change, + False means no change. + """ + delta_array = [] + for i, cur_flag in enumerate(cur_tx_disable_array): + is_different = (tx_disable_flag != cur_flag) if channel in [i + 1, 0] else False + delta_array.append(is_different) + return delta_array + + def convert_bool_array_to_bit_mask(self, bool_array): + """ + Convert a boolean array into a bitmask. If a value in the boolean array + is True, the corresponding bit in the bitmask is set to 1, otherwise + it's set to 0. The function starts from the least significant bit for + the first item in the boolean array. + + Args: + bool_array (list): An array of boolean values. + + Returns: + int: A bitmask corresponding to the input boolean array. + """ + mask = 0 + for i, flag in enumerate(bool_array): + mask += (1 << i if flag else 0) + return mask + + def task_worker(self): + ''' + The goal of sff_mgr is to make sure SFF compliant modules are brought up + in a deterministc way, meaning TX is enabled only after host_tx_ready + becomes True, and TX will be disabled when host_tx_ready becomes False. + This will help eliminate link stability issue and potential interface + flap, also turning off TX reduces the power consumption and avoid any + lab hazard for admin shut interface. + + sff_mgr will only proceed at two events: transceiver insertion event + (including bootup and hot plugin) and host_tx_ready change event, all + other cases are ignored. If either of these two events is detected, + sff_mgr will determine the target tx_disable value based on + host_tx_ready, and also check the current HW value of tx_disable/enable, + if it's already in target tx_disable/enable value, no need to take + action, otherwise, sff_mgr will set the target tx_disable/enable value + to HW. + + To detect these two events, sff_mgr listens to below DB tables, and + mantains a local copy of those DB values, stored in self.port_dict. In + each round of task_worker while loop, sff_mgr will calculate the delta + between current self.port_dict and previous self.port_dict (i.e. + self.port_dict_prev), and determine whether there is a change event. + 1. STATE_DB PORT_TABLE's 'host_tx_ready' field for host_tx_ready + change event. + 2. STATE_DB TRANSCEIVER_INFO table's 'type' field for insesrtion + event. Since TRANSCEIVER_INFO table has the same life cycle of + transceiver module insertion/removal, its DB entry will get + created at xcvr insertion, and will get deleted at xcvr removal. + TRANSCEIVER_STATUS table can also be used to detect + insertion/removal event, but 'type' field of TRANSCEIVER_INFO + table can also be used to filter out unwanted xcvr type, thus + TRANSCEIVER_INFO table is used here. + + On the other hand, sff_mgr also listens to CONFIG_DB PORT_TABLE for info + such as 'index'/etc. + + Platform can decide whether to enable sff_mgr via platform + enable_sff_mgr flag. If enable_sff_mgr is False, sff_mgr will not run. + + There is a behavior change (and requirement) for the platforms that + enable this sff_mgr feature: platform needs to keep TX in disabled state + after module coming out-of-reset, in either module insertion or bootup + cases. This is to make sure the module is not transmitting with TX + enabled before host_tx_ready is True. No impact for the platforms in + current deployment (since they don't enable it explictly.) + + ''' + self.xcvr_table_helper = XcvrTableHelper(self.namespaces) + + # CONFIG updates, and STATE_DB for insertion/removal, and host_tx_ready change + sel, asic_context = port_mapping.subscribe_port_update_event(self.namespaces, self, + self.PORT_TBL_MAP) + while not self.task_stopping_event.is_set(): + # Internally, handle_port_update_event will block for up to + # SELECT_TIMEOUT_MSECS until a message is received. A message is + # received when there is a Redis SET/DEL operation in the DB tables. + # Upon process restart, messages will be replayed for all fields, no + # need to explictly query the DB tables here. + port_mapping.handle_port_update_event(sel, asic_context, self.task_stopping_event, self, + self.on_port_update_event) + + for lport in list(self.port_dict.keys()): + if self.task_stopping_event.is_set(): + break + data = self.port_dict[lport] + pport = int(data.get('index', '-1')) + channel = int(data.get('channel', '0')) + xcvr_type = data.get('type', None) + xcvr_inserted = False + host_tx_ready_changed = False + if pport < 0 or channel < 0: + continue + + if xcvr_type is None: + # TRANSCEIVER_INFO table's 'type' is not ready, meaning xcvr is not present + continue + + # Procced only for 100G/40G + if not (xcvr_type.startswith('QSFP28') or xcvr_type.startswith('QSFP+')): + continue + + # Handle the case when Xcvrd was NOT running when 'host_tx_ready' + # was updated or this is the first run so reconcile + if 'host_tx_ready' not in data: + data['host_tx_ready'] = self.get_host_tx_status(lport) + + # It's a xcvr insertion case if TRANSCEIVER_INFO 'type' doesn't exist + # in previous port_dict sanpshot + if lport not in self.port_dict_prev or 'type' not in self.port_dict_prev[lport]: + xcvr_inserted = True + # Check if there's an diff between current and previous host_tx_ready + if (lport not in self.port_dict_prev or + 'host_tx_ready' not in self.port_dict_prev[lport] or + self.port_dict_prev[lport]['host_tx_ready'] != data['host_tx_ready']): + host_tx_ready_changed = True + # Only proceed on xcvr insertion case or the case of host_tx_ready getting changed + if (not xcvr_inserted) and (not host_tx_ready_changed): + continue + self.log_notice("{}: xcvr_inserted={}, host_tx_ready_changed={}".format( + lport, xcvr_inserted, host_tx_ready_changed)) + + # double-check the HW presence before moving forward + sfp = platform_chassis.get_sfp(pport) + if not sfp.get_presence(): + self.log_error("{}: module not present!".format(lport)) + del self.port_dict[lport] + continue + try: + # Skip if XcvrApi is not supported + api = sfp.get_xcvr_api() + if api is None: + self.log_error( + "{}: skipping sff_mgr since no xcvr api!".format(lport)) + continue + + # Skip if it's not a paged memory device + if api.is_flat_memory(): + self.log_notice( + "{}: skipping sff_mgr for flat memory xcvr".format(lport)) + continue + + # Skip if it's a copper cable + if api.is_copper(): + self.log_notice( + "{}: skipping sff_mgr for copper cable".format(lport)) + continue + + # Skip if tx_disable action is not supported for this xcvr + if not api.get_tx_disable_support(): + self.log_notice( + "{}: skipping sff_mgr due to tx_disable not supported".format( + lport)) + continue + except AttributeError: + # Skip if these essential routines are not available + continue + + target_tx_disable_flag = data['host_tx_ready'] == 'false' + # get_tx_disable API returns an array of bool, with tx_disable flag on each channel. + # True means tx disabled; False means tx enabled. + cur_tx_disable_array = api.get_tx_disable() + if cur_tx_disable_array is None: + self.log_error("{}: Failed to get current tx_disable value".format(lport)) + # If reading current tx_disable/enable value failed (could be due to + # read error), then set this variable to the opposite value of + # target_tx_disable_flag, to let detla array to be True on + # all the interested channels, to try best-effort TX disable/enable. + cur_tx_disable_array = [not target_tx_disable_flag] * self.DEFAULT_NUM_CHANNELS + # Get an array of bool, where it's True only on the channels that need change. + delta_array = self.calculate_tx_disable_delta_array(cur_tx_disable_array, + target_tx_disable_flag, channel) + mask = self.convert_bool_array_to_bit_mask(delta_array) + if mask == 0: + self.log_notice("{}: No change is needed for tx_disable value".format(lport)) + continue + if api.tx_disable_channel(mask, target_tx_disable_flag): + self.log_notice("{}: TX was {} with channel mask: {}".format( + lport, "disabled" if target_tx_disable_flag else "enabled", bin(mask))) + else: + self.log_error("{}: Failed to {} TX with channel mask: {}".format( + lport, "disable" if target_tx_disable_flag else "enable", bin(mask))) + + # Take a snapshot for port_dict, + # this will be used to calculate diff in the next while loop + self.port_dict_prev = copy.deepcopy(self.port_dict) + + # Thread wrapper class for CMIS transceiver management class CmisManagerTask(threading.Thread): @@ -1095,7 +1456,7 @@ def get_cmis_media_lanes_mask(self, api, appl, lport, subport): self.log_error("Invalid input to get media lane mask - appl {} media_lane_count {} " "lport {} subport {}!".format(appl, media_lane_count, lport, subport)) return media_lanes_mask - + media_lane_start_bit = (media_lane_count * (0 if subport == 0 else subport - 1)) if media_lane_assignment_option & (1 << media_lane_start_bit): media_lanes_mask = ((1 << media_lane_count) - 1) << media_lane_start_bit @@ -1317,9 +1678,9 @@ def get_port_admin_status(self, lport): def configure_tx_output_power(self, api, lport, tx_power): min_p, max_p = api.get_supported_power_config() if tx_power < min_p: - self.log_error("{} configured tx power {} < minimum power {} supported".format(lport, tx_power, min_p)) + self.log_error("{} configured tx power {} < minimum power {} supported".format(lport, tx_power, min_p)) if tx_power > max_p: - self.log_error("{} configured tx power {} > maximum power {} supported".format(lport, tx_power, max_p)) + self.log_error("{} configured tx power {} > maximum power {} supported".format(lport, tx_power, max_p)) return api.set_tx_power(tx_power) def configure_laser_frequency(self, api, lport, freq, grid=75): @@ -1393,10 +1754,10 @@ def task_worker(self): # Handle the case when Xcvrd was NOT running when 'host_tx_ready' or 'admin_status' # was updated or this is the first run so reconcile the above two attributes if 'host_tx_ready' not in self.port_dict[lport]: - self.port_dict[lport]['host_tx_ready'] = self.get_host_tx_status(lport) + self.port_dict[lport]['host_tx_ready'] = self.get_host_tx_status(lport) if 'admin_status' not in self.port_dict[lport]: - self.port_dict[lport]['admin_status'] = self.get_port_admin_status(lport) + self.port_dict[lport]['admin_status'] = self.get_port_admin_status(lport) pport = int(info.get('index', "-1")) speed = int(info.get('speed', "0")) @@ -1436,10 +1797,10 @@ def task_worker(self): continue if api.is_coherent_module(): - if 'tx_power' not in self.port_dict[lport]: - self.port_dict[lport]['tx_power'] = self.get_configured_tx_power_from_db(lport) - if 'laser_freq' not in self.port_dict[lport]: - self.port_dict[lport]['laser_freq'] = self.get_configured_laser_freq_from_db(lport) + if 'tx_power' not in self.port_dict[lport]: + self.port_dict[lport]['tx_power'] = self.get_configured_tx_power_from_db(lport) + if 'laser_freq' not in self.port_dict[lport]: + self.port_dict[lport]['laser_freq'] = self.get_configured_laser_freq_from_db(lport) except AttributeError: # Skip if these essential routines are not available self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY @@ -1491,7 +1852,7 @@ def task_worker(self): continue host_lanes_mask = self.port_dict[lport]['host_lanes_mask'] self.log_notice("{}: Setting host_lanemask=0x{:x}".format(lport, host_lanes_mask)) - + self.port_dict[lport]['media_lane_count'] = int(api.get_media_lane_count(appl)) self.port_dict[lport]['media_lane_assignment_options'] = int(api.get_media_lane_assignment_option(appl)) media_lane_count = self.port_dict[lport]['media_lane_count'] @@ -1509,30 +1870,30 @@ def task_worker(self): if self.port_dict[lport]['host_tx_ready'] != 'true' or \ self.port_dict[lport]['admin_status'] != 'up': - self.log_notice("{} Forcing Tx laser OFF".format(lport)) - # Force DataPath re-init - api.tx_disable_channel(media_lanes_mask, True) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY - continue + self.log_notice("{} Forcing Tx laser OFF".format(lport)) + # Force DataPath re-init + api.tx_disable_channel(media_lanes_mask, True) + self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY + continue # Configure the target output power if ZR module if api.is_coherent_module(): - tx_power = self.port_dict[lport]['tx_power'] - # Prevent configuring same tx power multiple times - if 0 != tx_power and tx_power != api.get_tx_config_power(): - if 1 != self.configure_tx_output_power(api, lport, tx_power): - self.log_error("{} failed to configure Tx power = {}".format(lport, tx_power)) - else: - self.log_notice("{} Successfully configured Tx power = {}".format(lport, tx_power)) + tx_power = self.port_dict[lport]['tx_power'] + # Prevent configuring same tx power multiple times + if 0 != tx_power and tx_power != api.get_tx_config_power(): + if 1 != self.configure_tx_output_power(api, lport, tx_power): + self.log_error("{} failed to configure Tx power = {}".format(lport, tx_power)) + else: + self.log_notice("{} Successfully configured Tx power = {}".format(lport, tx_power)) need_update = self.is_cmis_application_update_required(api, appl, host_lanes_mask) # For ZR module, Datapath needes to be re-initlialized on new channel selection if api.is_coherent_module(): - freq = self.port_dict[lport]['laser_freq'] - # If user requested frequency is NOT the same as configured on the module - # force datapath re-initialization - if 0 != freq and freq != api.get_laser_config_freq(): - need_update = True + freq = self.port_dict[lport]['laser_freq'] + # If user requested frequency is NOT the same as configured on the module + # force datapath re-initialization + if 0 != freq and freq != api.get_laser_config_freq(): + need_update = True if not need_update: # No application updates @@ -1575,13 +1936,13 @@ def task_worker(self): continue if api.is_coherent_module(): - # For ZR module, configure the laser frequency when Datapath is in Deactivated state - freq = self.port_dict[lport]['laser_freq'] - if 0 != freq: + # For ZR module, configure the laser frequency when Datapath is in Deactivated state + freq = self.port_dict[lport]['laser_freq'] + if 0 != freq: if 1 != self.configure_laser_frequency(api, lport, freq): - self.log_error("{} failed to configure laser frequency {} GHz".format(lport, freq)) + self.log_error("{} failed to configure laser frequency {} GHz".format(lport, freq)) else: - self.log_notice("{} configured laser frequency {} GHz".format(lport, freq)) + self.log_notice("{} configured laser frequency {} GHz".format(lport, freq)) # D.1.3 Software Configuration and Initialization if not api.set_application(host_lanes_mask, appl): @@ -2328,6 +2689,7 @@ def __init__(self, log_identifier, skip_cmis_mgr=False): self.stop_event = threading.Event() self.sfp_error_event = threading.Event() self.skip_cmis_mgr = skip_cmis_mgr + self.enable_sff_mgr = False self.namespaces = [''] self.threads = [] @@ -2456,6 +2818,46 @@ def deinit(self): del globals()['platform_chassis'] + def load_feature_flags(self): + """ + Load feature enable/skip flags from platform files. + """ + platform_pmon_ctrl_file_path = self.get_platform_pmon_ctrl_file_path() + if platform_pmon_ctrl_file_path is not None: + # Load the JSON file + with open(platform_pmon_ctrl_file_path) as file: + data = json.load(file) + if ENABLE_SFF_MGR_FLAG_NAME in data: + enable_sff_mgr = data[ENABLE_SFF_MGR_FLAG_NAME] + self.enable_sff_mgr = isinstance(enable_sff_mgr, bool) and enable_sff_mgr + + def get_platform_pmon_ctrl_file_path(self): + """ + Get the platform PMON control file path. + + The method generates a list of potential file paths, prioritizing the container platform path. + It then checks these paths in order to find an existing file. + + Returns: + str: The first found platform PMON control file path. + None: If no file path exists. + """ + platform_env_conf_path_candidates = [] + + platform_env_conf_path_candidates.append( + os.path.join(CONTAINER_PLATFORM_PATH, PLATFORM_PMON_CTRL_FILENAME)) + + platform = device_info.get_platform() + if platform: + platform_env_conf_path_candidates.append( + os.path.join(HOST_DEVICE_PATH, platform, PLATFORM_PMON_CTRL_FILENAME)) + + for platform_env_conf_file_path in platform_env_conf_path_candidates: + if os.path.isfile(platform_env_conf_file_path): + return platform_env_conf_file_path + + return None + # Run daemon def run(self): @@ -2464,6 +2866,16 @@ def run(self): # Start daemon initialization sequence port_mapping_data = self.init() + self.load_feature_flags() + # Start the SFF manager + sff_manager = None + if self.enable_sff_mgr: + sff_manager = SffManagerTask(self.namespaces, port_mapping_data, self.stop_event) + sff_manager.start() + self.threads.append(sff_manager) + else: + self.log_notice("Skipping SFF Task Manager") + # Start the CMIS manager cmis_manager = CmisManagerTask(self.namespaces, port_mapping_data, self.stop_event, self.skip_cmis_mgr) if not self.skip_cmis_mgr: @@ -2503,6 +2915,11 @@ def run(self): self.log_error("Exiting main loop as child thread raised exception!") os.kill(os.getpid(), signal.SIGKILL) + # Stop the SFF manager + if sff_manager is not None: + if sff_manager.is_alive(): + sff_manager.join() + # Stop the CMIS manager if cmis_manager is not None: if cmis_manager.is_alive(): @@ -2529,7 +2946,7 @@ def run(self): class XcvrTableHelper: def __init__(self, namespaces): self.int_tbl, self.dom_tbl, self.dom_threshold_tbl, self.status_tbl, self.app_port_tbl, \ - self.cfg_port_tbl, self.state_port_tbl, self.pm_tbl = {}, {}, {}, {}, {}, {}, {}, {} + self.cfg_port_tbl, self.state_port_tbl, self.pm_tbl = {}, {}, {}, {}, {}, {}, {}, {} self.state_db = {} self.cfg_db = {} for namespace in namespaces: diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py index c7965f343..c7d8b8893 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py @@ -1,19 +1,27 @@ +import threading from sonic_py_common import daemon_base from sonic_py_common import multi_asic from sonic_py_common.interface import backplane_prefix, inband_prefix, recirc_prefix from swsscommon import swsscommon SELECT_TIMEOUT_MSECS = 1000 +DEFAULT_PORT_TBL_MAP = [ + {'CONFIG_DB': swsscommon.CFG_PORT_TABLE_NAME}, + {'STATE_DB': 'TRANSCEIVER_INFO'}, + {'STATE_DB': 'PORT_TABLE', 'FILTER': ['host_tx_ready']}, +] class PortChangeEvent: + # Used for holding class-level thread-safe variable PORT_EVENT + thread_local_data = threading.local() PORT_ADD = 0 PORT_REMOVE = 1 PORT_SET = 2 PORT_DEL = 3 - PORT_EVENT = {} - def __init__(self, port_name, port_index, asic_id, event_type, port_dict=None): + def __init__(self, port_name, port_index, asic_id, event_type, port_dict=None, + db_name=None, table_name=None): # Logical port name, e.g. Ethernet0 self.port_name = port_name # Physical port index, equals to "index" field of PORT table in CONFIG_DB @@ -24,6 +32,8 @@ def __init__(self, port_name, port_index, asic_id, event_type, port_dict=None): self.event_type = event_type # Port config dict self.port_dict = port_dict + self.db_name = db_name + self.table_name = table_name def __str__(self): return '{} - name={} index={} asic_id={}'.format('Add' if self.event_type == self.PORT_ADD else 'Remove', @@ -107,18 +117,13 @@ def subscribe_port_config_change(namespaces): sel.addSelectable(port_tbl) return sel, asic_context -def subscribe_port_update_event(namespaces, logger): +def subscribe_port_update_event(namespaces, logger, port_tbl_map=DEFAULT_PORT_TBL_MAP): """ Subscribe to a particular DB's table and listen to only interested fields Format : { : , , , .. } where only field update will be received """ - port_tbl_map = [ - {'CONFIG_DB': swsscommon.CFG_PORT_TABLE_NAME}, - {'STATE_DB': 'TRANSCEIVER_INFO'}, - {'STATE_DB': 'PORT_TABLE', 'FILTER': ['host_tx_ready']}, - ] - + PortChangeEvent.thread_local_data.PORT_EVENT = {} sel = swsscommon.Select() asic_context = {} for d in port_tbl_map: @@ -173,37 +178,43 @@ def handle_port_update_event(sel, asic_context, stop_event, logger, port_change_ fvp['op'] = op fvp['FILTER'] = port_tbl.filter # Soak duplicate events and consider only the last event - port_event_cache[key+port_tbl.db_name+port_tbl.table_name] = fvp + port_event_cache[(key, port_tbl.db_name, port_tbl.table_name)] = fvp # Now apply filter over soaked events for key, fvp in port_event_cache.items(): + port_name = key[0] + db_name = key[1] + table_name = key[2] port_index = int(fvp['index']) port_change_event = None - diff = {} filter = fvp['FILTER'] del fvp['FILTER'] apply_filter_to_fvp(filter, fvp) - if key in PortChangeEvent.PORT_EVENT: - diff = dict(set(fvp.items()) - set(PortChangeEvent.PORT_EVENT[key].items())) - # Ignore duplicate events - if not diff: - PortChangeEvent.PORT_EVENT[key] = fvp - continue - PortChangeEvent.PORT_EVENT[key] = fvp + if key in PortChangeEvent.thread_local_data.PORT_EVENT: + diff = set(fvp.items()) - set(PortChangeEvent.thread_local_data.PORT_EVENT[key].items()) + # Ignore duplicate events + if not diff: + PortChangeEvent.thread_local_data.PORT_EVENT[key] = fvp + continue + PortChangeEvent.thread_local_data.PORT_EVENT[key] = fvp if fvp['op'] == swsscommon.SET_COMMAND: port_change_event = PortChangeEvent(fvp['key'], port_index, fvp['asic_id'], PortChangeEvent.PORT_SET, - fvp) + fvp, + db_name, + table_name) elif fvp['op'] == swsscommon.DEL_COMMAND: port_change_event = PortChangeEvent(fvp['key'], port_index, fvp['asic_id'], PortChangeEvent.PORT_DEL, - fvp) + fvp, + db_name, + table_name) # This is the final event considered for processing logger.log_warning("*** {} handle_port_update_event() fvp {}".format( key, fvp)) From 297e22dd3c3fae625fb98a530affae10a17e1884 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Wed, 21 Jun 2023 13:14:39 -0700 Subject: [PATCH 02/22] Remove unnecessary check and add comments --- sonic-xcvrd/xcvrd/xcvrd.py | 31 +++++++++---------- .../xcvrd/xcvrd_utilities/port_mapping.py | 9 +++++- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 575372f5f..8d1cdc296 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -921,15 +921,6 @@ def on_port_update_event(self, port_change_event): lport = port_change_event.port_name pport = port_change_event.port_index - # This thread doesn't need to expilictly wait on PortInitDone and - # PortConfigDone events, as xcvrd main thread waits on them before - # spawrning this thread. - if lport in ['PortInitDone']: - return - - if lport in ['PortConfigDone']: - return - # Skip if it's not a physical port if not lport.startswith('Ethernet'): return @@ -1110,12 +1101,22 @@ def task_worker(self): # CONFIG updates, and STATE_DB for insertion/removal, and host_tx_ready change sel, asic_context = port_mapping.subscribe_port_update_event(self.namespaces, self, self.PORT_TBL_MAP) + + # This thread doesn't need to expilictly wait on PortInitDone and + # PortConfigDone events, as xcvrd main thread waits on them before + # spawrning this thread. while not self.task_stopping_event.is_set(): + # Take a snapshot for port_dict, this will be used to calculate diff + # later in the while loop to determine if there's really a value + # change on the notified Redis update. + self.port_dict_prev = copy.deepcopy(self.port_dict) + # Internally, handle_port_update_event will block for up to - # SELECT_TIMEOUT_MSECS until a message is received. A message is - # received when there is a Redis SET/DEL operation in the DB tables. - # Upon process restart, messages will be replayed for all fields, no - # need to explictly query the DB tables here. + # SELECT_TIMEOUT_MSECS until a message is received(in select + # function). A message is received when there is a Redis SET/DEL + # operation in the DB tables. Upon process restart, messages will be + # replayed for all fields, no need to explictly query the DB tables + # here. port_mapping.handle_port_update_event(sel, asic_context, self.task_stopping_event, self, self.on_port_update_event) @@ -1220,10 +1221,6 @@ def task_worker(self): self.log_error("{}: Failed to {} TX with channel mask: {}".format( lport, "disable" if target_tx_disable_flag else "enable", bin(mask))) - # Take a snapshot for port_dict, - # this will be used to calculate diff in the next while loop - self.port_dict_prev = copy.deepcopy(self.port_dict) - # Thread wrapper class for CMIS transceiver management diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py index c7d8b8893..e6642775f 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py @@ -13,7 +13,11 @@ class PortChangeEvent: - # Used for holding class-level thread-safe variable PORT_EVENT + # thread_local_data will be used to hold class-level local-safe variable + # PORT_EVENT (i.e. global variable to the local thread). + # thread_local_data.PORT_EVENT dict will be initialized in + # subscribe_port_update_event, and used to store the latest port change + # event for each key, to avoid duplicate event processing. thread_local_data = threading.local() PORT_ADD = 0 PORT_REMOVE = 1 @@ -192,11 +196,14 @@ def handle_port_update_event(sel, asic_context, stop_event, logger, port_change_ apply_filter_to_fvp(filter, fvp) if key in PortChangeEvent.thread_local_data.PORT_EVENT: + # Compare current event with last event on this key, to see if + # there's really a change on the contents. diff = set(fvp.items()) - set(PortChangeEvent.thread_local_data.PORT_EVENT[key].items()) # Ignore duplicate events if not diff: PortChangeEvent.thread_local_data.PORT_EVENT[key] = fvp continue + # Update the latest event to the cache PortChangeEvent.thread_local_data.PORT_EVENT[key] = fvp if fvp['op'] == swsscommon.SET_COMMAND: From 6eea7d849468d1cbf761a5cc2174db1e303bc733 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Wed, 21 Jun 2023 20:15:38 -0700 Subject: [PATCH 03/22] Update comments --- sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py index e6642775f..7c1789ab8 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py @@ -13,11 +13,11 @@ class PortChangeEvent: - # thread_local_data will be used to hold class-level local-safe variable - # PORT_EVENT (i.e. global variable to the local thread). - # thread_local_data.PORT_EVENT dict will be initialized in - # subscribe_port_update_event, and used to store the latest port change - # event for each key, to avoid duplicate event processing. + # thread_local_data will be used to hold class-scope thread-safe variables + # (i.e. global variable to the local thread), such as + # thread_local_data.PORT_EVENT. thread_local_data.PORT_EVENT dict will be + # initialized in subscribe_port_update_event, and used to store the latest + # port change event for each key, to avoid duplicate event processing. thread_local_data = threading.local() PORT_ADD = 0 PORT_REMOVE = 1 @@ -127,6 +127,9 @@ def subscribe_port_update_event(namespaces, logger, port_tbl_map=DEFAULT_PORT_TB Format : { :
, , , .. } where only field update will be received """ + # PORT_EVENT dict stores the latest port change event for each key, to avoid + # duplicate event processing. key in PORT_EVENT dict would be a combination + # of (key of redis DB entry, port_tbl.db_name, port_tbl.table_name) PortChangeEvent.thread_local_data.PORT_EVENT = {} sel = swsscommon.Select() asic_context = {} From 04fa51ed70b3775307a1e2559a317da217b18e98 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Thu, 22 Jun 2023 13:09:47 -0700 Subject: [PATCH 04/22] Add return value to handle_port_update_event and update comments --- sonic-xcvrd/xcvrd/xcvrd.py | 33 ++++++++++++------- .../xcvrd/xcvrd_utilities/port_mapping.py | 13 ++++++-- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 8d1cdc296..d82be845e 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -897,8 +897,16 @@ def __init__(self, namespaces, port_mapping, main_thread_stop_event): self.exc = None self.task_stopping_event = threading.Event() self.main_thread_stop_event = main_thread_stop_event + # port_dict holds data per port entry with logical_port_name as key, it + # maintains local copy of the following DB fields: + # CONFIG_DB PORT_TABLE 'index' + # CONFIG_DB PORT_TABLE 'channel' + # STATE_DB PORT_TABLE 'host_tx_ready' + # STATE_DB TRANSCEIVER_INFO 'type' + # Its port entry will get deleted upon CONFIG_DB PORT_TABLE DEL. + # Port entry's 'type' field will get deleted upon STATE_DB TRANSCEIVER_INFO DEL. self.port_dict = {} - # port_dict snapshot captured in the previous while loop + # port_dict snapshot captured in the previous event update loop self.port_dict_prev = {} self.port_mapping = copy.deepcopy(port_mapping) self.xcvr_table_helper = XcvrTableHelper(namespaces) @@ -1020,9 +1028,8 @@ def calculate_tx_disable_delta_array(self, cur_tx_disable_array, tx_disable_flag Returns: list: A boolean array where each entry indicates whether there's a - change for the corresponding - channel in the TX disable array. True means there's a change, - False means no change. + change for the corresponding channel in the TX disable array. + True means there's a change, False means no change. """ delta_array = [] for i, cur_flag in enumerate(cur_tx_disable_array): @@ -1083,7 +1090,7 @@ def task_worker(self): TRANSCEIVER_INFO table is used here. On the other hand, sff_mgr also listens to CONFIG_DB PORT_TABLE for info - such as 'index'/etc. + such as 'index', 'channel'/etc. Platform can decide whether to enable sff_mgr via platform enable_sff_mgr flag. If enable_sff_mgr is False, sff_mgr will not run. @@ -1106,19 +1113,16 @@ def task_worker(self): # PortConfigDone events, as xcvrd main thread waits on them before # spawrning this thread. while not self.task_stopping_event.is_set(): - # Take a snapshot for port_dict, this will be used to calculate diff - # later in the while loop to determine if there's really a value - # change on the notified Redis update. - self.port_dict_prev = copy.deepcopy(self.port_dict) - # Internally, handle_port_update_event will block for up to # SELECT_TIMEOUT_MSECS until a message is received(in select # function). A message is received when there is a Redis SET/DEL # operation in the DB tables. Upon process restart, messages will be # replayed for all fields, no need to explictly query the DB tables # here. - port_mapping.handle_port_update_event(sel, asic_context, self.task_stopping_event, self, - self.on_port_update_event) + if not port_mapping.handle_port_update_event( + sel, asic_context, self.task_stopping_event, self, self.on_port_update_event): + # In the case of no real update, go back to the beginning of the loop + continue for lport in list(self.port_dict.keys()): if self.task_stopping_event.is_set(): @@ -1221,6 +1225,11 @@ def task_worker(self): self.log_error("{}: Failed to {} TX with channel mask: {}".format( lport, "disable" if target_tx_disable_flag else "enable", bin(mask))) + # Take a snapshot for port_dict, this will be used to calculate diff + # later in the while loop to determine if there's really a value + # change on the fields regarding to xcvr insertion and host_tx_ready. + self.port_dict_prev = copy.deepcopy(self.port_dict) + # Thread wrapper class for CMIS transceiver management diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py index 7c1789ab8..03c593733 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_mapping.py @@ -157,14 +157,18 @@ def handle_port_update_event(sel, asic_context, stop_event, logger, port_change_ """ Select PORT update events, notify the observers upon a port update in CONFIG_DB or a XCVR insertion/removal in STATE_DB + + Returns: + bool: True if there's at least one update event; False if there's no update event. """ + has_event = False if not stop_event.is_set(): (state, _) = sel.select(SELECT_TIMEOUT_MSECS) if state == swsscommon.Select.TIMEOUT: - return + return has_event if state != swsscommon.Select.OBJECT: logger.log_warning('sel.select() did not return swsscommon.Select.OBJECT') - return + return has_event port_event_cache = {} for port_tbl in asic_context.keys(): @@ -229,7 +233,10 @@ def handle_port_update_event(sel, asic_context, stop_event, logger, port_change_ logger.log_warning("*** {} handle_port_update_event() fvp {}".format( key, fvp)) if port_change_event is not None: - port_change_event_handler(port_change_event) + has_event = True + port_change_event_handler(port_change_event) + + return has_event def handle_port_config_change(sel, asic_context, stop_event, port_mapping, logger, port_change_event_handler): From 82ce3f39a7c71593b9dffdcdf63e77f22bea2527 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Thu, 22 Jun 2023 13:40:28 -0700 Subject: [PATCH 05/22] Update comments --- sonic-xcvrd/xcvrd/xcvrd.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index d82be845e..69b97cd12 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -1144,9 +1144,11 @@ def task_worker(self): if not (xcvr_type.startswith('QSFP28') or xcvr_type.startswith('QSFP+')): continue - # Handle the case when Xcvrd was NOT running when 'host_tx_ready' - # was updated or this is the first run so reconcile + # Handle the case that host_tx_ready value in the local cache hasn't + # been updated via PortChangeEvent: if 'host_tx_ready' not in data: + # Fetch host_tx_ready status from STATE_DB (if not present + # in DB, treat it as false), and update self.port_dict data['host_tx_ready'] = self.get_host_tx_status(lport) # It's a xcvr insertion case if TRANSCEIVER_INFO 'type' doesn't exist From 43ad2e1e465f250d30f2740b0d7f55145b47aa59 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Thu, 22 Jun 2023 17:59:25 -0700 Subject: [PATCH 06/22] Rely on asic_id from event update instead from static port_mapping --- sonic-xcvrd/tests/test_xcvrd.py | 23 +++++++++-------------- sonic-xcvrd/xcvrd/xcvrd.py | 27 +++++++++++++++++++-------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index f475d7fa4..0790d19f0 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -46,9 +46,8 @@ class TestXcvrdThreadException(object): @patch('xcvrd.xcvrd.platform_chassis', MagicMock()) @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', MagicMock(side_effect = NotImplementedError)) def test_SffManagerTask_task_run_with_exception(self): - port_mapping = PortMapping() stop_event = threading.Event() - sff_mgr = SffManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + sff_mgr = SffManagerTask(DEFAULT_NAMESPACE, stop_event) exception_received = None trace = None try: @@ -622,9 +621,8 @@ def test_DaemonXcvrd_run(self, mock_task_stop1, mock_task_stop2, mock_task_run1, assert mock_init.call_count == 1 def test_SffManagerTask_handle_port_change_event(self): - port_mapping = PortMapping() stop_event = threading.Event() - task = SffManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event) + task = SffManagerTask(DEFAULT_NAMESPACE, stop_event) port_change_event = PortChangeEvent('PortConfigDone', -1, 0, PortChangeEvent.PORT_SET) task.on_port_update_event(port_change_event) @@ -666,20 +664,16 @@ def test_SffManagerTask_get_host_tx_status(self, mock_get_state_port_tbl): mock_get_state_port_tbl.return_value.get.return_value = (True, {'host_tx_ready': 'true'}) sff_manager_task = SffManagerTask(namespaces=DEFAULT_NAMESPACE, - port_mapping=PortMapping(), main_thread_stop_event=threading.Event()) - sff_manager_task.port_mapping.get_asic_id_for_logical_port = MagicMock(return_value=1) - lport = 'Ethernet0' - assert sff_manager_task.get_host_tx_status(lport) == 'true' - sff_manager_task.port_mapping.get_asic_id_for_logical_port.assert_called_once_with(lport) - mock_get_state_port_tbl.assert_called_once_with(1) + assert sff_manager_task.get_host_tx_status(lport, 0) == 'true' + mock_get_state_port_tbl.assert_called_once_with(0) mock_get_state_port_tbl.return_value.get.assert_called_once_with(lport) @patch('xcvrd.xcvrd.platform_chassis') @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', MagicMock(return_value=(None, None))) - @patch('xcvrd.xcvrd_utilities.port_mapping.handle_port_update_event', MagicMock()) + @patch('xcvrd.xcvrd_utilities.port_mapping.handle_port_update_event', MagicMock(return_value=True)) def test_SffManagerTask_task_worker(self, mock_chassis): mock_xcvr_api = MagicMock() mock_xcvr_api.tx_disable_channel = MagicMock(return_value=True) @@ -695,12 +689,13 @@ def test_SffManagerTask_task_worker(self, mock_chassis): mock_chassis.get_sfp = MagicMock(return_value=mock_sfp) task = SffManagerTask(namespaces=DEFAULT_NAMESPACE, - port_mapping=PortMapping(), main_thread_stop_event=threading.Event()) # TX enable case: - port_dict = {'type': 'QSFP28', 'channel': '0'} - port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, port_dict) + port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, { + 'type': 'QSFP28', + 'channel': '0' + }) task.on_port_update_event(port_change_event) assert len(task.port_dict) == 1 task.get_host_tx_status = MagicMock(return_value='true') diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 69b97cd12..8d9a79335 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -891,7 +891,7 @@ class SffManagerTask(threading.Thread): # Default number of channels for QSFP28/QSFP+ transceiver DEFAULT_NUM_CHANNELS = 4 - def __init__(self, namespaces, port_mapping, main_thread_stop_event): + def __init__(self, namespaces, main_thread_stop_event): threading.Thread.__init__(self) self.name = "SffManagerTask" self.exc = None @@ -903,12 +903,14 @@ def __init__(self, namespaces, port_mapping, main_thread_stop_event): # CONFIG_DB PORT_TABLE 'channel' # STATE_DB PORT_TABLE 'host_tx_ready' # STATE_DB TRANSCEIVER_INFO 'type' + # plus 'asic_id' field which is used to identify the ASIC where the + # port is located. ('asic_id' always comes with every event in + # handle_port_update_event function) # Its port entry will get deleted upon CONFIG_DB PORT_TABLE DEL. # Port entry's 'type' field will get deleted upon STATE_DB TRANSCEIVER_INFO DEL. self.port_dict = {} # port_dict snapshot captured in the previous event update loop self.port_dict_prev = {} - self.port_mapping = copy.deepcopy(port_mapping) self.xcvr_table_helper = XcvrTableHelper(namespaces) self.namespaces = namespaces @@ -928,6 +930,7 @@ def on_port_update_event(self, port_change_event): lport = port_change_event.port_name pport = port_change_event.port_index + asic_id = port_change_event.asic_id # Skip if it's not a physical port if not lport.startswith('Ethernet'): @@ -962,6 +965,7 @@ def on_port_update_event(self, port_change_event): # insertion/removal event. if 'type' in port_change_event.port_dict: self.port_dict[lport]['type'] = port_change_event.port_dict['type'] + self.port_dict[lport]['asic_id'] = asic_id # CONFIG_DB PORT_TABLE DEL case: elif port_change_event.db_name and \ port_change_event.db_name == 'CONFIG_DB': @@ -977,10 +981,9 @@ def on_port_update_event(self, port_change_event): if lport in self.port_dict and 'type' in self.port_dict[lport]: del self.port_dict[lport]['type'] - def get_host_tx_status(self, lport): + def get_host_tx_status(self, lport, asic_index): host_tx_ready = 'false' - asic_index = self.port_mapping.get_asic_id_for_logical_port(lport) state_port_tbl = self.xcvr_table_helper.get_state_port_tbl(asic_index) found, port_info = state_port_tbl.get(lport) @@ -1093,7 +1096,8 @@ def task_worker(self): such as 'index', 'channel'/etc. Platform can decide whether to enable sff_mgr via platform - enable_sff_mgr flag. If enable_sff_mgr is False, sff_mgr will not run. + enable_sff_mgr flag. If enable_sff_mgr is False or not present, sff_mgr + will not run. By default, it's disabled. There is a behavior change (and requirement) for the platforms that enable this sff_mgr feature: platform needs to keep TX in disabled state @@ -1149,7 +1153,9 @@ def task_worker(self): if 'host_tx_ready' not in data: # Fetch host_tx_ready status from STATE_DB (if not present # in DB, treat it as false), and update self.port_dict - data['host_tx_ready'] = self.get_host_tx_status(lport) + data['host_tx_ready'] = self.get_host_tx_status(lport, data['asic_id']) + self.log_notice("{}: fetched DB and updated host_tx_ready={} locally".format( + lport, data['host_tx_ready'])) # It's a xcvr insertion case if TRANSCEIVER_INFO 'type' doesn't exist # in previous port_dict sanpshot @@ -1160,7 +1166,12 @@ def task_worker(self): 'host_tx_ready' not in self.port_dict_prev[lport] or self.port_dict_prev[lport]['host_tx_ready'] != data['host_tx_ready']): host_tx_ready_changed = True - # Only proceed on xcvr insertion case or the case of host_tx_ready getting changed + # Only proceed on xcvr insertion case or the case of + # host_tx_ready getting changed. + # Even though handle_port_update_event() filters out duplicate + # events, we still need to have this below extra check to ignore + # the event that is not related to insertrion or host_tx_ready + # change, such as CONFIG_DB related change. if (not xcvr_inserted) and (not host_tx_ready_changed): continue self.log_notice("{}: xcvr_inserted={}, host_tx_ready_changed={}".format( @@ -2878,7 +2889,7 @@ def run(self): # Start the SFF manager sff_manager = None if self.enable_sff_mgr: - sff_manager = SffManagerTask(self.namespaces, port_mapping_data, self.stop_event) + sff_manager = SffManagerTask(self.namespaces, self.stop_event) sff_manager.start() self.threads.append(sff_manager) else: From 94a0c9d4d16d21222035cea9948a979ec3e3580d Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Thu, 22 Jun 2023 18:03:41 -0700 Subject: [PATCH 07/22] Update comment --- sonic-xcvrd/xcvrd/xcvrd.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 8d9a79335..4db0dbce9 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -903,9 +903,8 @@ def __init__(self, namespaces, main_thread_stop_event): # CONFIG_DB PORT_TABLE 'channel' # STATE_DB PORT_TABLE 'host_tx_ready' # STATE_DB TRANSCEIVER_INFO 'type' - # plus 'asic_id' field which is used to identify the ASIC where the - # port is located. ('asic_id' always comes with every event in - # handle_port_update_event function) + # plus 'asic_id' from PortChangeEvent.asic_id (asic_id always gets + # filled in handle_port_update_event function based on asic_context) # Its port entry will get deleted upon CONFIG_DB PORT_TABLE DEL. # Port entry's 'type' field will get deleted upon STATE_DB TRANSCEIVER_INFO DEL. self.port_dict = {} From c93f851664949639ed5b25483c6e08e9bfd2c542 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Mon, 26 Jun 2023 12:37:14 -0700 Subject: [PATCH 08/22] Move sff_mgr to a new file --- sonic-xcvrd/tests/test_xcvrd.py | 25 +- sonic-xcvrd/xcvrd/sff_mgr.py | 393 ++++++++++++++++ sonic-xcvrd/xcvrd/xcvrd.py | 430 +----------------- .../xcvrd_utilities/xcvr_table_helper.py | 59 +++ 4 files changed, 471 insertions(+), 436 deletions(-) create mode 100644 sonic-xcvrd/xcvrd/sff_mgr.py create mode 100644 sonic-xcvrd/xcvrd/xcvrd_utilities/xcvr_table_helper.py diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index 0790d19f0..ee3bbd05f 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -2,6 +2,8 @@ from xcvrd.xcvrd_utilities.port_mapping import * from xcvrd.xcvrd_utilities.sfp_status_helper import * from xcvrd.xcvrd import * +from xcvrd.sff_mgr import * +from xcvrd.xcvrd_utilities.xcvr_table_helper import * import pytest import copy import os @@ -43,11 +45,11 @@ class TestXcvrdThreadException(object): - @patch('xcvrd.xcvrd.platform_chassis', MagicMock()) - @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', MagicMock(side_effect = NotImplementedError)) + @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', + MagicMock(side_effect = NotImplementedError)) def test_SffManagerTask_task_run_with_exception(self): stop_event = threading.Event() - sff_mgr = SffManagerTask(DEFAULT_NAMESPACE, stop_event) + sff_mgr = SffManagerTask(DEFAULT_NAMESPACE, stop_event, MagicMock(), helper_logger) exception_received = None trace = None try: @@ -60,7 +62,7 @@ def test_SffManagerTask_task_run_with_exception(self): assert not sff_mgr.is_alive() assert(type(exception_received) == NotImplementedError) assert("NotImplementedError" in str(trace) and "effect" in str(trace)) - assert("sonic-xcvrd/xcvrd/xcvrd.py" in str(trace)) + assert("sonic-xcvrd/xcvrd/sff_mgr.py" in str(trace)) assert("subscribe_port_update_event" in str(trace)) @patch('xcvrd.xcvrd.platform_chassis', MagicMock()) @@ -622,7 +624,7 @@ def test_DaemonXcvrd_run(self, mock_task_stop1, mock_task_stop2, mock_task_run1, def test_SffManagerTask_handle_port_change_event(self): stop_event = threading.Event() - task = SffManagerTask(DEFAULT_NAMESPACE, stop_event) + task = SffManagerTask(DEFAULT_NAMESPACE, stop_event, MagicMock(), helper_logger) port_change_event = PortChangeEvent('PortConfigDone', -1, 0, PortChangeEvent.PORT_SET) task.on_port_update_event(port_change_event) @@ -663,8 +665,11 @@ def test_SffManagerTask_handle_port_change_event(self): def test_SffManagerTask_get_host_tx_status(self, mock_get_state_port_tbl): mock_get_state_port_tbl.return_value.get.return_value = (True, {'host_tx_ready': 'true'}) - sff_manager_task = SffManagerTask(namespaces=DEFAULT_NAMESPACE, - main_thread_stop_event=threading.Event()) + sff_manager_task = SffManagerTask(DEFAULT_NAMESPACE, + threading.Event(), + MagicMock(), + helper_logger) + lport = 'Ethernet0' assert sff_manager_task.get_host_tx_status(lport, 0) == 'true' mock_get_state_port_tbl.assert_called_once_with(0) @@ -688,8 +693,10 @@ def test_SffManagerTask_task_worker(self, mock_chassis): mock_chassis.get_all_sfps = MagicMock(return_value=[mock_sfp]) mock_chassis.get_sfp = MagicMock(return_value=mock_sfp) - task = SffManagerTask(namespaces=DEFAULT_NAMESPACE, - main_thread_stop_event=threading.Event()) + task = SffManagerTask(DEFAULT_NAMESPACE, + threading.Event(), + mock_chassis, + helper_logger) # TX enable case: port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, { diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py new file mode 100644 index 000000000..f9e1e4ecc --- /dev/null +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -0,0 +1,393 @@ +""" + SFF task manager + Deterministic link bring-up task manager for SFF compliant modules, running + as a thread inside xcvrd +""" + +try: + import copy + import sys + import threading + import traceback + + from swsscommon import swsscommon + + from .xcvrd_utilities import port_mapping + from .xcvrd_utilities.xcvr_table_helper import XcvrTableHelper +except ImportError as e: + raise ImportError(str(e) + " - required module not found") + + +# Thread wrapper class for SFF compliant transceiver management + + +class SffManagerTask(threading.Thread): + # Subscribe to below tables in Redis DB + PORT_TBL_MAP = [ + { + 'CONFIG_DB': swsscommon.CFG_PORT_TABLE_NAME + }, + { + 'STATE_DB': 'TRANSCEIVER_INFO', + 'FILTER': ['type'] + }, + { + 'STATE_DB': 'PORT_TABLE', + 'FILTER': ['host_tx_ready'] + }, + ] + # Default number of channels for QSFP28/QSFP+ transceiver + DEFAULT_NUM_CHANNELS = 4 + + def __init__(self, namespaces, main_thread_stop_event, platform_chassis, helper_logger): + threading.Thread.__init__(self) + self.name = "SffManagerTask" + self.exc = None + self.task_stopping_event = threading.Event() + self.main_thread_stop_event = main_thread_stop_event + self.helper_logger = helper_logger + self.platform_chassis = platform_chassis + # port_dict holds data per port entry with logical_port_name as key, it + # maintains local copy of the following DB fields: + # CONFIG_DB PORT_TABLE 'index' + # CONFIG_DB PORT_TABLE 'channel' + # STATE_DB PORT_TABLE 'host_tx_ready' + # STATE_DB TRANSCEIVER_INFO 'type' + # plus 'asic_id' from PortChangeEvent.asic_id (asic_id always gets + # filled in handle_port_update_event function based on asic_context) + # Its port entry will get deleted upon CONFIG_DB PORT_TABLE DEL. + # Port entry's 'type' field will get deleted upon STATE_DB TRANSCEIVER_INFO DEL. + self.port_dict = {} + # port_dict snapshot captured in the previous event update loop + self.port_dict_prev = {} + self.xcvr_table_helper = XcvrTableHelper(namespaces) + self.namespaces = namespaces + + def log_notice(self, message): + self.helper_logger.log_notice("SFF: {}".format(message)) + + def log_warning(self, message): + self.helper_logger.log_warning("SFF: {}".format(message)) + + def log_error(self, message): + self.helper_logger.log_error("SFF: {}".format(message)) + + def on_port_update_event(self, port_change_event): + if (port_change_event.event_type + not in [port_change_event.PORT_SET, port_change_event.PORT_DEL]): + return + + lport = port_change_event.port_name + pport = port_change_event.port_index + asic_id = port_change_event.asic_id + + # Skip if it's not a physical port + if not lport.startswith('Ethernet'): + return + + # Skip if the physical index is not available + if pport is None: + return + + if port_change_event.port_dict is None: + return + + if port_change_event.event_type == port_change_event.PORT_SET: + if lport not in self.port_dict: + self.port_dict[lport] = {} + if pport >= 0: + self.port_dict[lport]['index'] = pport + # This field comes from CONFIG_DB PORT_TABLE. This is the channel + # that blongs to this logical port, 0 means all channels. tx_disable + # API needs to know which channels to disable/enable for a + # particular physical port. + if 'channel' in port_change_event.port_dict: + self.port_dict[lport]['channel'] = port_change_event.port_dict['channel'] + # This field comes from STATE_DB PORT_TABLE + if 'host_tx_ready' in port_change_event.port_dict: + self.port_dict[lport]['host_tx_ready'] = \ + port_change_event.port_dict['host_tx_ready'] + # This field comes from STATE_DB TRANSCEIVER_INFO table. + # TRANSCEIVER_INFO has the same life cycle as a transceiver, if + # transceiver is inserted/removed, TRANSCEIVER_INFO is also + # created/deleted. Thus this filed can used to determine + # insertion/removal event. + if 'type' in port_change_event.port_dict: + self.port_dict[lport]['type'] = port_change_event.port_dict['type'] + self.port_dict[lport]['asic_id'] = asic_id + # CONFIG_DB PORT_TABLE DEL case: + elif port_change_event.db_name and \ + port_change_event.db_name == 'CONFIG_DB': + # Only when port is removed from CONFIG, we consider this entry as deleted. + if lport in self.port_dict: + del self.port_dict[lport] + # STATE_DB TRANSCEIVER_INFO DEL case: + elif port_change_event.table_name and \ + port_change_event.table_name == 'TRANSCEIVER_INFO': + # TRANSCEIVER_INFO DEL corresponds to transceiver removal (not + # port/interface removal), so we only remove 'type' field to help + # determine xcvr removal/insertion event + if lport in self.port_dict and 'type' in self.port_dict[lport]: + del self.port_dict[lport]['type'] + + def get_host_tx_status(self, lport, asic_index): + host_tx_ready = 'false' + + state_port_tbl = self.xcvr_table_helper.get_state_port_tbl(asic_index) + + found, port_info = state_port_tbl.get(lport) + if found and 'host_tx_ready' in dict(port_info): + host_tx_ready = dict(port_info)['host_tx_ready'] + return host_tx_ready + + def run(self): + if self.platform_chassis is None: + self.log_notice("Platform chassis is not available, stopping...") + return + + try: + self.task_worker() + except Exception as e: + self.helper_logger.log_error("Exception occured at {} thread due to {}".format( + threading.current_thread().getName(), repr(e))) + exc_type, exc_value, exc_traceback = sys.exc_info() + msg = traceback.format_exception(exc_type, exc_value, exc_traceback) + for tb_line in msg: + for tb_line_split in tb_line.splitlines(): + self.helper_logger.log_error(tb_line_split) + self.exc = e + self.main_thread_stop_event.set() + + def join(self): + self.task_stopping_event.set() + threading.Thread.join(self) + if self.exc: + raise self.exc + + def calculate_tx_disable_delta_array(self, cur_tx_disable_array, tx_disable_flag, channel): + """ + Calculate the delta between the current transmitter (TX) disable array + and a new TX disable flag for a specific channel. The function returns a + new array (delta_array) where each entry indicates whether there's a + difference for each corresponding channel in the TX disable array. + + Args: + cur_tx_disable_array (list): Current array of TX disable flags for all channels. + tx_disable_flag (bool): The new TX disable flag that needs to be compared + with the current flags. + channel (int): The specific channel that needs to be checked. If channel is 0, all + channels are checked. + + Returns: + list: A boolean array where each entry indicates whether there's a + change for the corresponding channel in the TX disable array. + True means there's a change, False means no change. + """ + delta_array = [] + for i, cur_flag in enumerate(cur_tx_disable_array): + is_different = (tx_disable_flag != cur_flag) if channel in [i + 1, 0] else False + delta_array.append(is_different) + return delta_array + + def convert_bool_array_to_bit_mask(self, bool_array): + """ + Convert a boolean array into a bitmask. If a value in the boolean array + is True, the corresponding bit in the bitmask is set to 1, otherwise + it's set to 0. The function starts from the least significant bit for + the first item in the boolean array. + + Args: + bool_array (list): An array of boolean values. + + Returns: + int: A bitmask corresponding to the input boolean array. + """ + mask = 0 + for i, flag in enumerate(bool_array): + mask += (1 << i if flag else 0) + return mask + + def task_worker(self): + ''' + The goal of sff_mgr is to make sure SFF compliant modules are brought up + in a deterministc way, meaning TX is enabled only after host_tx_ready + becomes True, and TX will be disabled when host_tx_ready becomes False. + This will help eliminate link stability issue and potential interface + flap, also turning off TX reduces the power consumption and avoid any + lab hazard for admin shut interface. + + sff_mgr will only proceed at two events: transceiver insertion event + (including bootup and hot plugin) and host_tx_ready change event, all + other cases are ignored. If either of these two events is detected, + sff_mgr will determine the target tx_disable value based on + host_tx_ready, and also check the current HW value of tx_disable/enable, + if it's already in target tx_disable/enable value, no need to take + action, otherwise, sff_mgr will set the target tx_disable/enable value + to HW. + + To detect these two events, sff_mgr listens to below DB tables, and + mantains a local copy of those DB values, stored in self.port_dict. In + each round of task_worker while loop, sff_mgr will calculate the delta + between current self.port_dict and previous self.port_dict (i.e. + self.port_dict_prev), and determine whether there is a change event. + 1. STATE_DB PORT_TABLE's 'host_tx_ready' field for host_tx_ready + change event. + 2. STATE_DB TRANSCEIVER_INFO table's 'type' field for insesrtion + event. Since TRANSCEIVER_INFO table has the same life cycle of + transceiver module insertion/removal, its DB entry will get + created at xcvr insertion, and will get deleted at xcvr removal. + TRANSCEIVER_STATUS table can also be used to detect + insertion/removal event, but 'type' field of TRANSCEIVER_INFO + table can also be used to filter out unwanted xcvr type, thus + TRANSCEIVER_INFO table is used here. + + On the other hand, sff_mgr also listens to CONFIG_DB PORT_TABLE for info + such as 'index', 'channel'/etc. + + Platform can decide whether to enable sff_mgr via platform + enable_sff_mgr flag. If enable_sff_mgr is False or not present, sff_mgr + will not run. By default, it's disabled. + + There is a behavior change (and requirement) for the platforms that + enable this sff_mgr feature: platform needs to keep TX in disabled state + after module coming out-of-reset, in either module insertion or bootup + cases. This is to make sure the module is not transmitting with TX + enabled before host_tx_ready is True. No impact for the platforms in + current deployment (since they don't enable it explictly.) + + ''' + + # CONFIG updates, and STATE_DB for insertion/removal, and host_tx_ready change + sel, asic_context = port_mapping.subscribe_port_update_event(self.namespaces, self, + self.PORT_TBL_MAP) + + # This thread doesn't need to expilictly wait on PortInitDone and + # PortConfigDone events, as xcvrd main thread waits on them before + # spawrning this thread. + while not self.task_stopping_event.is_set(): + # Internally, handle_port_update_event will block for up to + # SELECT_TIMEOUT_MSECS until a message is received(in select + # function). A message is received when there is a Redis SET/DEL + # operation in the DB tables. Upon process restart, messages will be + # replayed for all fields, no need to explictly query the DB tables + # here. + if not port_mapping.handle_port_update_event( + sel, asic_context, self.task_stopping_event, self, self.on_port_update_event): + # In the case of no real update, go back to the beginning of the loop + continue + + for lport in list(self.port_dict.keys()): + if self.task_stopping_event.is_set(): + break + data = self.port_dict[lport] + pport = int(data.get('index', '-1')) + channel = int(data.get('channel', '0')) + xcvr_type = data.get('type', None) + xcvr_inserted = False + host_tx_ready_changed = False + if pport < 0 or channel < 0: + continue + + if xcvr_type is None: + # TRANSCEIVER_INFO table's 'type' is not ready, meaning xcvr is not present + continue + + # Procced only for 100G/40G + if not (xcvr_type.startswith('QSFP28') or xcvr_type.startswith('QSFP+')): + continue + + # Handle the case that host_tx_ready value in the local cache hasn't + # been updated via PortChangeEvent: + if 'host_tx_ready' not in data: + # Fetch host_tx_ready status from STATE_DB (if not present + # in DB, treat it as false), and update self.port_dict + data['host_tx_ready'] = self.get_host_tx_status(lport, data['asic_id']) + self.log_notice("{}: fetched DB and updated host_tx_ready={} locally".format( + lport, data['host_tx_ready'])) + + # It's a xcvr insertion case if TRANSCEIVER_INFO 'type' doesn't exist + # in previous port_dict sanpshot + if lport not in self.port_dict_prev or 'type' not in self.port_dict_prev[lport]: + xcvr_inserted = True + # Check if there's an diff between current and previous host_tx_ready + if (lport not in self.port_dict_prev or + 'host_tx_ready' not in self.port_dict_prev[lport] or + self.port_dict_prev[lport]['host_tx_ready'] != data['host_tx_ready']): + host_tx_ready_changed = True + # Only proceed on xcvr insertion case or the case of + # host_tx_ready getting changed. + # Even though handle_port_update_event() filters out duplicate + # events, we still need to have this below extra check to ignore + # the event that is not related to insertrion or host_tx_ready + # change, such as CONFIG_DB related change. + if (not xcvr_inserted) and (not host_tx_ready_changed): + continue + self.log_notice("{}: xcvr_inserted={}, host_tx_ready_changed={}".format( + lport, xcvr_inserted, host_tx_ready_changed)) + + # double-check the HW presence before moving forward + sfp = self.platform_chassis.get_sfp(pport) + if not sfp.get_presence(): + self.log_error("{}: module not present!".format(lport)) + del self.port_dict[lport] + continue + try: + # Skip if XcvrApi is not supported + api = sfp.get_xcvr_api() + if api is None: + self.log_error( + "{}: skipping sff_mgr since no xcvr api!".format(lport)) + continue + + # Skip if it's not a paged memory device + if api.is_flat_memory(): + self.log_notice( + "{}: skipping sff_mgr for flat memory xcvr".format(lport)) + continue + + # Skip if it's a copper cable + if api.is_copper(): + self.log_notice( + "{}: skipping sff_mgr for copper cable".format(lport)) + continue + + # Skip if tx_disable action is not supported for this xcvr + if not api.get_tx_disable_support(): + self.log_notice( + "{}: skipping sff_mgr due to tx_disable not supported".format( + lport)) + continue + except AttributeError: + # Skip if these essential routines are not available + continue + + target_tx_disable_flag = data['host_tx_ready'] == 'false' + # get_tx_disable API returns an array of bool, with tx_disable flag on each channel. + # True means tx disabled; False means tx enabled. + cur_tx_disable_array = api.get_tx_disable() + if cur_tx_disable_array is None: + self.log_error("{}: Failed to get current tx_disable value".format(lport)) + # If reading current tx_disable/enable value failed (could be due to + # read error), then set this variable to the opposite value of + # target_tx_disable_flag, to let detla array to be True on + # all the interested channels, to try best-effort TX disable/enable. + cur_tx_disable_array = [not target_tx_disable_flag] * self.DEFAULT_NUM_CHANNELS + # Get an array of bool, where it's True only on the channels that need change. + delta_array = self.calculate_tx_disable_delta_array(cur_tx_disable_array, + target_tx_disable_flag, channel) + mask = self.convert_bool_array_to_bit_mask(delta_array) + if mask == 0: + self.log_notice("{}: No change is needed for tx_disable value".format(lport)) + continue + if api.tx_disable_channel(mask, target_tx_disable_flag): + self.log_notice("{}: TX was {} with channel mask: {}".format( + lport, "disabled" if target_tx_disable_flag else "enabled", bin(mask))) + else: + self.log_error("{}: Failed to {} TX with channel mask: {}".format( + lport, "disable" if target_tx_disable_flag else "enable", bin(mask))) + + # Take a snapshot for port_dict, this will be used to calculate diff + # later in the while loop to determine if there's really a value + # change on the fields regarding to xcvr insertion and host_tx_ready. + self.port_dict_prev = copy.deepcopy(self.port_dict) + diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 4db0dbce9..72dac0483 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -27,6 +27,8 @@ from .xcvrd_utilities import sfp_status_helper from .xcvrd_utilities import port_mapping + from .sff_mgr import SffManagerTask + from .xcvrd_utilities.xcvr_table_helper import XcvrTableHelper except ImportError as e: raise ImportError(str(e) + " - required module not found") @@ -39,12 +41,6 @@ PLATFORM_SPECIFIC_MODULE_NAME = "sfputil" PLATFORM_SPECIFIC_CLASS_NAME = "SfpUtil" -TRANSCEIVER_INFO_TABLE = 'TRANSCEIVER_INFO' -TRANSCEIVER_DOM_SENSOR_TABLE = 'TRANSCEIVER_DOM_SENSOR' -TRANSCEIVER_DOM_THRESHOLD_TABLE = 'TRANSCEIVER_DOM_THRESHOLD' -TRANSCEIVER_STATUS_TABLE = 'TRANSCEIVER_STATUS' -TRANSCEIVER_PM_TABLE = 'TRANSCEIVER_PM' - TRANSCEIVER_STATUS_TABLE_SW_FIELDS = ["status", "error"] # Mgminit time required as per CMIS spec @@ -870,379 +866,6 @@ def is_fast_reboot_enabled(): # Helper classes =============================================================== # -# Thread wrapper class for SFF compliant transceiver management - - -class SffManagerTask(threading.Thread): - # Subscribe to below tables in Redis DB - PORT_TBL_MAP = [ - { - 'CONFIG_DB': swsscommon.CFG_PORT_TABLE_NAME - }, - { - 'STATE_DB': 'TRANSCEIVER_INFO', - 'FILTER': ['type'] - }, - { - 'STATE_DB': 'PORT_TABLE', - 'FILTER': ['host_tx_ready'] - }, - ] - # Default number of channels for QSFP28/QSFP+ transceiver - DEFAULT_NUM_CHANNELS = 4 - - def __init__(self, namespaces, main_thread_stop_event): - threading.Thread.__init__(self) - self.name = "SffManagerTask" - self.exc = None - self.task_stopping_event = threading.Event() - self.main_thread_stop_event = main_thread_stop_event - # port_dict holds data per port entry with logical_port_name as key, it - # maintains local copy of the following DB fields: - # CONFIG_DB PORT_TABLE 'index' - # CONFIG_DB PORT_TABLE 'channel' - # STATE_DB PORT_TABLE 'host_tx_ready' - # STATE_DB TRANSCEIVER_INFO 'type' - # plus 'asic_id' from PortChangeEvent.asic_id (asic_id always gets - # filled in handle_port_update_event function based on asic_context) - # Its port entry will get deleted upon CONFIG_DB PORT_TABLE DEL. - # Port entry's 'type' field will get deleted upon STATE_DB TRANSCEIVER_INFO DEL. - self.port_dict = {} - # port_dict snapshot captured in the previous event update loop - self.port_dict_prev = {} - self.xcvr_table_helper = XcvrTableHelper(namespaces) - self.namespaces = namespaces - - def log_notice(self, message): - helper_logger.log_notice("SFF: {}".format(message)) - - def log_warning(self, message): - helper_logger.log_warning("SFF: {}".format(message)) - - def log_error(self, message): - helper_logger.log_error("SFF: {}".format(message)) - - def on_port_update_event(self, port_change_event): - if (port_change_event.event_type - not in [port_change_event.PORT_SET, port_change_event.PORT_DEL]): - return - - lport = port_change_event.port_name - pport = port_change_event.port_index - asic_id = port_change_event.asic_id - - # Skip if it's not a physical port - if not lport.startswith('Ethernet'): - return - - # Skip if the physical index is not available - if pport is None: - return - - if port_change_event.port_dict is None: - return - - if port_change_event.event_type == port_change_event.PORT_SET: - if lport not in self.port_dict: - self.port_dict[lport] = {} - if pport >= 0: - self.port_dict[lport]['index'] = pport - # This field comes from CONFIG_DB PORT_TABLE. This is the channel - # that blongs to this logical port, 0 means all channels. tx_disable - # API needs to know which channels to disable/enable for a - # particular physical port. - if 'channel' in port_change_event.port_dict: - self.port_dict[lport]['channel'] = port_change_event.port_dict['channel'] - # This field comes from STATE_DB PORT_TABLE - if 'host_tx_ready' in port_change_event.port_dict: - self.port_dict[lport]['host_tx_ready'] = \ - port_change_event.port_dict['host_tx_ready'] - # This field comes from STATE_DB TRANSCEIVER_INFO table. - # TRANSCEIVER_INFO has the same life cycle as a transceiver, if - # transceiver is inserted/removed, TRANSCEIVER_INFO is also - # created/deleted. Thus this filed can used to determine - # insertion/removal event. - if 'type' in port_change_event.port_dict: - self.port_dict[lport]['type'] = port_change_event.port_dict['type'] - self.port_dict[lport]['asic_id'] = asic_id - # CONFIG_DB PORT_TABLE DEL case: - elif port_change_event.db_name and \ - port_change_event.db_name == 'CONFIG_DB': - # Only when port is removed from CONFIG, we consider this entry as deleted. - if lport in self.port_dict: - del self.port_dict[lport] - # STATE_DB TRANSCEIVER_INFO DEL case: - elif port_change_event.table_name and \ - port_change_event.table_name == 'TRANSCEIVER_INFO': - # TRANSCEIVER_INFO DEL corresponds to transceiver removal (not - # port/interface removal), so we only remove 'type' field to help - # determine xcvr removal/insertion event - if lport in self.port_dict and 'type' in self.port_dict[lport]: - del self.port_dict[lport]['type'] - - def get_host_tx_status(self, lport, asic_index): - host_tx_ready = 'false' - - state_port_tbl = self.xcvr_table_helper.get_state_port_tbl(asic_index) - - found, port_info = state_port_tbl.get(lport) - if found and 'host_tx_ready' in dict(port_info): - host_tx_ready = dict(port_info)['host_tx_ready'] - return host_tx_ready - - def run(self): - if platform_chassis is None: - self.log_notice("Platform chassis is not available, stopping...") - return - - try: - self.task_worker() - except Exception as e: - helper_logger.log_error("Exception occured at {} thread due to {}".format( - threading.current_thread().getName(), repr(e))) - exc_type, exc_value, exc_traceback = sys.exc_info() - msg = traceback.format_exception(exc_type, exc_value, exc_traceback) - for tb_line in msg: - for tb_line_split in tb_line.splitlines(): - helper_logger.log_error(tb_line_split) - self.exc = e - self.main_thread_stop_event.set() - - def join(self): - self.task_stopping_event.set() - threading.Thread.join(self) - if self.exc: - raise self.exc - - def calculate_tx_disable_delta_array(self, cur_tx_disable_array, tx_disable_flag, channel): - """ - Calculate the delta between the current transmitter (TX) disable array - and a new TX disable flag for a specific channel. The function returns a - new array (delta_array) where each entry indicates whether there's a - difference for each corresponding channel in the TX disable array. - - Args: - cur_tx_disable_array (list): Current array of TX disable flags for all channels. - tx_disable_flag (bool): The new TX disable flag that needs to be compared - with the current flags. - channel (int): The specific channel that needs to be checked. If channel is 0, all - channels are checked. - - Returns: - list: A boolean array where each entry indicates whether there's a - change for the corresponding channel in the TX disable array. - True means there's a change, False means no change. - """ - delta_array = [] - for i, cur_flag in enumerate(cur_tx_disable_array): - is_different = (tx_disable_flag != cur_flag) if channel in [i + 1, 0] else False - delta_array.append(is_different) - return delta_array - - def convert_bool_array_to_bit_mask(self, bool_array): - """ - Convert a boolean array into a bitmask. If a value in the boolean array - is True, the corresponding bit in the bitmask is set to 1, otherwise - it's set to 0. The function starts from the least significant bit for - the first item in the boolean array. - - Args: - bool_array (list): An array of boolean values. - - Returns: - int: A bitmask corresponding to the input boolean array. - """ - mask = 0 - for i, flag in enumerate(bool_array): - mask += (1 << i if flag else 0) - return mask - - def task_worker(self): - ''' - The goal of sff_mgr is to make sure SFF compliant modules are brought up - in a deterministc way, meaning TX is enabled only after host_tx_ready - becomes True, and TX will be disabled when host_tx_ready becomes False. - This will help eliminate link stability issue and potential interface - flap, also turning off TX reduces the power consumption and avoid any - lab hazard for admin shut interface. - - sff_mgr will only proceed at two events: transceiver insertion event - (including bootup and hot plugin) and host_tx_ready change event, all - other cases are ignored. If either of these two events is detected, - sff_mgr will determine the target tx_disable value based on - host_tx_ready, and also check the current HW value of tx_disable/enable, - if it's already in target tx_disable/enable value, no need to take - action, otherwise, sff_mgr will set the target tx_disable/enable value - to HW. - - To detect these two events, sff_mgr listens to below DB tables, and - mantains a local copy of those DB values, stored in self.port_dict. In - each round of task_worker while loop, sff_mgr will calculate the delta - between current self.port_dict and previous self.port_dict (i.e. - self.port_dict_prev), and determine whether there is a change event. - 1. STATE_DB PORT_TABLE's 'host_tx_ready' field for host_tx_ready - change event. - 2. STATE_DB TRANSCEIVER_INFO table's 'type' field for insesrtion - event. Since TRANSCEIVER_INFO table has the same life cycle of - transceiver module insertion/removal, its DB entry will get - created at xcvr insertion, and will get deleted at xcvr removal. - TRANSCEIVER_STATUS table can also be used to detect - insertion/removal event, but 'type' field of TRANSCEIVER_INFO - table can also be used to filter out unwanted xcvr type, thus - TRANSCEIVER_INFO table is used here. - - On the other hand, sff_mgr also listens to CONFIG_DB PORT_TABLE for info - such as 'index', 'channel'/etc. - - Platform can decide whether to enable sff_mgr via platform - enable_sff_mgr flag. If enable_sff_mgr is False or not present, sff_mgr - will not run. By default, it's disabled. - - There is a behavior change (and requirement) for the platforms that - enable this sff_mgr feature: platform needs to keep TX in disabled state - after module coming out-of-reset, in either module insertion or bootup - cases. This is to make sure the module is not transmitting with TX - enabled before host_tx_ready is True. No impact for the platforms in - current deployment (since they don't enable it explictly.) - - ''' - self.xcvr_table_helper = XcvrTableHelper(self.namespaces) - - # CONFIG updates, and STATE_DB for insertion/removal, and host_tx_ready change - sel, asic_context = port_mapping.subscribe_port_update_event(self.namespaces, self, - self.PORT_TBL_MAP) - - # This thread doesn't need to expilictly wait on PortInitDone and - # PortConfigDone events, as xcvrd main thread waits on them before - # spawrning this thread. - while not self.task_stopping_event.is_set(): - # Internally, handle_port_update_event will block for up to - # SELECT_TIMEOUT_MSECS until a message is received(in select - # function). A message is received when there is a Redis SET/DEL - # operation in the DB tables. Upon process restart, messages will be - # replayed for all fields, no need to explictly query the DB tables - # here. - if not port_mapping.handle_port_update_event( - sel, asic_context, self.task_stopping_event, self, self.on_port_update_event): - # In the case of no real update, go back to the beginning of the loop - continue - - for lport in list(self.port_dict.keys()): - if self.task_stopping_event.is_set(): - break - data = self.port_dict[lport] - pport = int(data.get('index', '-1')) - channel = int(data.get('channel', '0')) - xcvr_type = data.get('type', None) - xcvr_inserted = False - host_tx_ready_changed = False - if pport < 0 or channel < 0: - continue - - if xcvr_type is None: - # TRANSCEIVER_INFO table's 'type' is not ready, meaning xcvr is not present - continue - - # Procced only for 100G/40G - if not (xcvr_type.startswith('QSFP28') or xcvr_type.startswith('QSFP+')): - continue - - # Handle the case that host_tx_ready value in the local cache hasn't - # been updated via PortChangeEvent: - if 'host_tx_ready' not in data: - # Fetch host_tx_ready status from STATE_DB (if not present - # in DB, treat it as false), and update self.port_dict - data['host_tx_ready'] = self.get_host_tx_status(lport, data['asic_id']) - self.log_notice("{}: fetched DB and updated host_tx_ready={} locally".format( - lport, data['host_tx_ready'])) - - # It's a xcvr insertion case if TRANSCEIVER_INFO 'type' doesn't exist - # in previous port_dict sanpshot - if lport not in self.port_dict_prev or 'type' not in self.port_dict_prev[lport]: - xcvr_inserted = True - # Check if there's an diff between current and previous host_tx_ready - if (lport not in self.port_dict_prev or - 'host_tx_ready' not in self.port_dict_prev[lport] or - self.port_dict_prev[lport]['host_tx_ready'] != data['host_tx_ready']): - host_tx_ready_changed = True - # Only proceed on xcvr insertion case or the case of - # host_tx_ready getting changed. - # Even though handle_port_update_event() filters out duplicate - # events, we still need to have this below extra check to ignore - # the event that is not related to insertrion or host_tx_ready - # change, such as CONFIG_DB related change. - if (not xcvr_inserted) and (not host_tx_ready_changed): - continue - self.log_notice("{}: xcvr_inserted={}, host_tx_ready_changed={}".format( - lport, xcvr_inserted, host_tx_ready_changed)) - - # double-check the HW presence before moving forward - sfp = platform_chassis.get_sfp(pport) - if not sfp.get_presence(): - self.log_error("{}: module not present!".format(lport)) - del self.port_dict[lport] - continue - try: - # Skip if XcvrApi is not supported - api = sfp.get_xcvr_api() - if api is None: - self.log_error( - "{}: skipping sff_mgr since no xcvr api!".format(lport)) - continue - - # Skip if it's not a paged memory device - if api.is_flat_memory(): - self.log_notice( - "{}: skipping sff_mgr for flat memory xcvr".format(lport)) - continue - - # Skip if it's a copper cable - if api.is_copper(): - self.log_notice( - "{}: skipping sff_mgr for copper cable".format(lport)) - continue - - # Skip if tx_disable action is not supported for this xcvr - if not api.get_tx_disable_support(): - self.log_notice( - "{}: skipping sff_mgr due to tx_disable not supported".format( - lport)) - continue - except AttributeError: - # Skip if these essential routines are not available - continue - - target_tx_disable_flag = data['host_tx_ready'] == 'false' - # get_tx_disable API returns an array of bool, with tx_disable flag on each channel. - # True means tx disabled; False means tx enabled. - cur_tx_disable_array = api.get_tx_disable() - if cur_tx_disable_array is None: - self.log_error("{}: Failed to get current tx_disable value".format(lport)) - # If reading current tx_disable/enable value failed (could be due to - # read error), then set this variable to the opposite value of - # target_tx_disable_flag, to let detla array to be True on - # all the interested channels, to try best-effort TX disable/enable. - cur_tx_disable_array = [not target_tx_disable_flag] * self.DEFAULT_NUM_CHANNELS - # Get an array of bool, where it's True only on the channels that need change. - delta_array = self.calculate_tx_disable_delta_array(cur_tx_disable_array, - target_tx_disable_flag, channel) - mask = self.convert_bool_array_to_bit_mask(delta_array) - if mask == 0: - self.log_notice("{}: No change is needed for tx_disable value".format(lport)) - continue - if api.tx_disable_channel(mask, target_tx_disable_flag): - self.log_notice("{}: TX was {} with channel mask: {}".format( - lport, "disabled" if target_tx_disable_flag else "enabled", bin(mask))) - else: - self.log_error("{}: Failed to {} TX with channel mask: {}".format( - lport, "disable" if target_tx_disable_flag else "enable", bin(mask))) - - # Take a snapshot for port_dict, this will be used to calculate diff - # later in the while loop to determine if there's really a value - # change on the fields regarding to xcvr insertion and host_tx_ready. - self.port_dict_prev = copy.deepcopy(self.port_dict) - - # Thread wrapper class for CMIS transceiver management class CmisManagerTask(threading.Thread): @@ -2888,7 +2511,7 @@ def run(self): # Start the SFF manager sff_manager = None if self.enable_sff_mgr: - sff_manager = SffManagerTask(self.namespaces, self.stop_event) + sff_manager = SffManagerTask(self.namespaces, self.stop_event, platform_chassis, helper_logger) sff_manager.start() self.threads.append(sff_manager) else: @@ -2961,53 +2584,6 @@ def run(self): sys.exit(SFP_SYSTEM_ERROR) -class XcvrTableHelper: - def __init__(self, namespaces): - self.int_tbl, self.dom_tbl, self.dom_threshold_tbl, self.status_tbl, self.app_port_tbl, \ - self.cfg_port_tbl, self.state_port_tbl, self.pm_tbl = {}, {}, {}, {}, {}, {}, {}, {} - self.state_db = {} - self.cfg_db = {} - for namespace in namespaces: - asic_id = multi_asic.get_asic_index_from_namespace(namespace) - self.state_db[asic_id] = daemon_base.db_connect("STATE_DB", namespace) - self.int_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_INFO_TABLE) - self.dom_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_DOM_SENSOR_TABLE) - self.dom_threshold_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_DOM_THRESHOLD_TABLE) - self.status_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_STATUS_TABLE) - self.pm_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_PM_TABLE) - self.state_port_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], swsscommon.STATE_PORT_TABLE_NAME) - appl_db = daemon_base.db_connect("APPL_DB", namespace) - self.app_port_tbl[asic_id] = swsscommon.ProducerStateTable(appl_db, swsscommon.APP_PORT_TABLE_NAME) - self.cfg_db[asic_id] = daemon_base.db_connect("CONFIG_DB", namespace) - self.cfg_port_tbl[asic_id] = swsscommon.Table(self.cfg_db[asic_id], swsscommon.CFG_PORT_TABLE_NAME) - - def get_intf_tbl(self, asic_id): - return self.int_tbl[asic_id] - - def get_dom_tbl(self, asic_id): - return self.dom_tbl[asic_id] - - def get_dom_threshold_tbl(self, asic_id): - return self.dom_threshold_tbl[asic_id] - - def get_status_tbl(self, asic_id): - return self.status_tbl[asic_id] - - def get_pm_tbl(self, asic_id): - return self.pm_tbl[asic_id] - - def get_app_port_tbl(self, asic_id): - return self.app_port_tbl[asic_id] - - def get_state_db(self, asic_id): - return self.state_db[asic_id] - - def get_cfg_port_tbl(self, asic_id): - return self.cfg_port_tbl[asic_id] - - def get_state_port_tbl(self, asic_id): - return self.state_port_tbl[asic_id] - # # Main ========================================================================= # diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/xcvr_table_helper.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/xcvr_table_helper.py new file mode 100644 index 000000000..9894d9002 --- /dev/null +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/xcvr_table_helper.py @@ -0,0 +1,59 @@ +try: + from sonic_py_common import daemon_base + from sonic_py_common import multi_asic + from swsscommon import swsscommon +except ImportError as e: + raise ImportError(str(e) + " - required module not found") + +TRANSCEIVER_INFO_TABLE = 'TRANSCEIVER_INFO' +TRANSCEIVER_DOM_SENSOR_TABLE = 'TRANSCEIVER_DOM_SENSOR' +TRANSCEIVER_DOM_THRESHOLD_TABLE = 'TRANSCEIVER_DOM_THRESHOLD' +TRANSCEIVER_STATUS_TABLE = 'TRANSCEIVER_STATUS' +TRANSCEIVER_PM_TABLE = 'TRANSCEIVER_PM' + +class XcvrTableHelper: + def __init__(self, namespaces): + self.int_tbl, self.dom_tbl, self.dom_threshold_tbl, self.status_tbl, self.app_port_tbl, \ + self.cfg_port_tbl, self.state_port_tbl, self.pm_tbl = {}, {}, {}, {}, {}, {}, {}, {} + self.state_db = {} + self.cfg_db = {} + for namespace in namespaces: + asic_id = multi_asic.get_asic_index_from_namespace(namespace) + self.state_db[asic_id] = daemon_base.db_connect("STATE_DB", namespace) + self.int_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_INFO_TABLE) + self.dom_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_DOM_SENSOR_TABLE) + self.dom_threshold_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_DOM_THRESHOLD_TABLE) + self.status_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_STATUS_TABLE) + self.pm_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], TRANSCEIVER_PM_TABLE) + self.state_port_tbl[asic_id] = swsscommon.Table(self.state_db[asic_id], swsscommon.STATE_PORT_TABLE_NAME) + appl_db = daemon_base.db_connect("APPL_DB", namespace) + self.app_port_tbl[asic_id] = swsscommon.ProducerStateTable(appl_db, swsscommon.APP_PORT_TABLE_NAME) + self.cfg_db[asic_id] = daemon_base.db_connect("CONFIG_DB", namespace) + self.cfg_port_tbl[asic_id] = swsscommon.Table(self.cfg_db[asic_id], swsscommon.CFG_PORT_TABLE_NAME) + + def get_intf_tbl(self, asic_id): + return self.int_tbl[asic_id] + + def get_dom_tbl(self, asic_id): + return self.dom_tbl[asic_id] + + def get_dom_threshold_tbl(self, asic_id): + return self.dom_threshold_tbl[asic_id] + + def get_status_tbl(self, asic_id): + return self.status_tbl[asic_id] + + def get_pm_tbl(self, asic_id): + return self.pm_tbl[asic_id] + + def get_app_port_tbl(self, asic_id): + return self.app_port_tbl[asic_id] + + def get_state_db(self, asic_id): + return self.state_db[asic_id] + + def get_cfg_port_tbl(self, asic_id): + return self.cfg_port_tbl[asic_id] + + def get_state_port_tbl(self, asic_id): + return self.state_port_tbl[asic_id] From bad4fd8395cb3e9c3d3a9eef37de6bc3c1b1294e Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Mon, 26 Jun 2023 12:40:33 -0700 Subject: [PATCH 09/22] Revert space change --- sonic-xcvrd/tests/test_xcvrd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index ee3bbd05f..55746c4ca 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -1055,7 +1055,7 @@ def test_CmisManagerTask_task_worker(self, mock_chassis): assert task.isPortConfigDone port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, - {'speed': '400000', 'lanes': '1,2,3,4,5,6,7,8'}) + {'speed':'400000', 'lanes':'1,2,3,4,5,6,7,8'}) task.on_port_update_event(port_change_event) assert len(task.port_dict) == 1 From 4821c8148a8bc9220e98ce958b862fbee8261bf9 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Thu, 29 Jun 2023 18:13:24 -0700 Subject: [PATCH 10/22] Update comments --- sonic-xcvrd/xcvrd/sff_mgr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py index f9e1e4ecc..17d8b1b3c 100644 --- a/sonic-xcvrd/xcvrd/sff_mgr.py +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -125,8 +125,8 @@ def on_port_update_event(self, port_change_event): elif port_change_event.table_name and \ port_change_event.table_name == 'TRANSCEIVER_INFO': # TRANSCEIVER_INFO DEL corresponds to transceiver removal (not - # port/interface removal), so we only remove 'type' field to help - # determine xcvr removal/insertion event + # port/interface removal), in this case, remove 'type' field from + # self.port_dict if lport in self.port_dict and 'type' in self.port_dict[lport]: del self.port_dict[lport]['type'] From dbf2ea5dac54dccc2290a09959e83c6b35ad671b Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Thu, 13 Jul 2023 17:30:30 -0700 Subject: [PATCH 11/22] Add admin_status sanity check --- sonic-xcvrd/tests/test_xcvrd.py | 16 +++++ sonic-xcvrd/xcvrd/sff_mgr.py | 106 ++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 46 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index 55746c4ca..443a1f44c 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -675,6 +675,20 @@ def test_SffManagerTask_get_host_tx_status(self, mock_get_state_port_tbl): mock_get_state_port_tbl.assert_called_once_with(0) mock_get_state_port_tbl.return_value.get.assert_called_once_with(lport) + @patch.object(XcvrTableHelper, 'get_cfg_port_tbl', return_value=MagicMock()) + def test_SffManagerTask_get_admin_status(self, mock_get_cfg_port_tbl): + mock_get_cfg_port_tbl.return_value.get.return_value = (True, {'admin_status': 'up'}) + + sff_manager_task = SffManagerTask(DEFAULT_NAMESPACE, + threading.Event(), + MagicMock(), + helper_logger) + + lport = 'Ethernet0' + assert sff_manager_task.get_admin_status(lport, 0) == 'up' + mock_get_cfg_port_tbl.assert_called_once_with(0) + mock_get_cfg_port_tbl.return_value.get.assert_called_once_with(lport) + @patch('xcvrd.xcvrd.platform_chassis') @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', MagicMock(return_value=(None, None))) @@ -706,11 +720,13 @@ def test_SffManagerTask_task_worker(self, mock_chassis): task.on_port_update_event(port_change_event) assert len(task.port_dict) == 1 task.get_host_tx_status = MagicMock(return_value='true') + task.get_admin_status = MagicMock(return_value='up') mock_xcvr_api.get_tx_disable = MagicMock(return_value=[True, True, True, True]) task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True]) task.task_worker() assert mock_xcvr_api.tx_disable_channel.call_count == 1 assert task.get_host_tx_status.call_count == 1 + assert task.get_admin_status.call_count == 1 # TX disable case: port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py index 17d8b1b3c..5b4e179ad 100644 --- a/sonic-xcvrd/xcvrd/sff_mgr.py +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -49,8 +49,7 @@ def __init__(self, namespaces, main_thread_stop_event, platform_chassis, helper_ self.platform_chassis = platform_chassis # port_dict holds data per port entry with logical_port_name as key, it # maintains local copy of the following DB fields: - # CONFIG_DB PORT_TABLE 'index' - # CONFIG_DB PORT_TABLE 'channel' + # CONFIG_DB PORT_TABLE 'index', 'channel', 'admin_status' # STATE_DB PORT_TABLE 'host_tx_ready' # STATE_DB TRANSCEIVER_INFO 'type' # plus 'asic_id' from PortChangeEvent.asic_id (asic_id always gets @@ -107,6 +106,15 @@ def on_port_update_event(self, port_change_event): if 'host_tx_ready' in port_change_event.port_dict: self.port_dict[lport]['host_tx_ready'] = \ port_change_event.port_dict['host_tx_ready'] + # This field comes from CONFIG_DB PORT_TABLE + if 'admin_status' in port_change_event.port_dict and \ + port_change_event.db_name and \ + port_change_event.db_name == 'CONFIG_DB': + # Only consider admin_status from CONFIG_DB. + # Ignore admin_status from STATE_DB, which may have + # different value. + self.port_dict[lport]['admin_status'] = \ + port_change_event.port_dict['admin_status'] # This field comes from STATE_DB TRANSCEIVER_INFO table. # TRANSCEIVER_INFO has the same life cycle as a transceiver, if # transceiver is inserted/removed, TRANSCEIVER_INFO is also @@ -140,6 +148,16 @@ def get_host_tx_status(self, lport, asic_index): host_tx_ready = dict(port_info)['host_tx_ready'] return host_tx_ready + def get_admin_status(self, lport, asic_index): + admin_status = 'down' + + cfg_port_tbl = self.xcvr_table_helper.get_cfg_port_tbl(asic_index) + + found, port_info = cfg_port_tbl.get(lport) + if found and 'admin_status' in dict(port_info): + admin_status = dict(port_info)['admin_status'] + return admin_status + def run(self): if self.platform_chassis is None: self.log_notice("Platform chassis is not available, stopping...") @@ -216,39 +234,11 @@ def task_worker(self): flap, also turning off TX reduces the power consumption and avoid any lab hazard for admin shut interface. - sff_mgr will only proceed at two events: transceiver insertion event - (including bootup and hot plugin) and host_tx_ready change event, all - other cases are ignored. If either of these two events is detected, - sff_mgr will determine the target tx_disable value based on - host_tx_ready, and also check the current HW value of tx_disable/enable, - if it's already in target tx_disable/enable value, no need to take - action, otherwise, sff_mgr will set the target tx_disable/enable value - to HW. - - To detect these two events, sff_mgr listens to below DB tables, and - mantains a local copy of those DB values, stored in self.port_dict. In - each round of task_worker while loop, sff_mgr will calculate the delta - between current self.port_dict and previous self.port_dict (i.e. - self.port_dict_prev), and determine whether there is a change event. - 1. STATE_DB PORT_TABLE's 'host_tx_ready' field for host_tx_ready - change event. - 2. STATE_DB TRANSCEIVER_INFO table's 'type' field for insesrtion - event. Since TRANSCEIVER_INFO table has the same life cycle of - transceiver module insertion/removal, its DB entry will get - created at xcvr insertion, and will get deleted at xcvr removal. - TRANSCEIVER_STATUS table can also be used to detect - insertion/removal event, but 'type' field of TRANSCEIVER_INFO - table can also be used to filter out unwanted xcvr type, thus - TRANSCEIVER_INFO table is used here. - - On the other hand, sff_mgr also listens to CONFIG_DB PORT_TABLE for info - such as 'index', 'channel'/etc. - Platform can decide whether to enable sff_mgr via platform enable_sff_mgr flag. If enable_sff_mgr is False or not present, sff_mgr will not run. By default, it's disabled. - There is a behavior change (and requirement) for the platforms that + There is a pre-requisite for the platforms that enable this sff_mgr feature: platform needs to keep TX in disabled state after module coming out-of-reset, in either module insertion or bootup cases. This is to make sure the module is not transmitting with TX @@ -285,6 +275,7 @@ def task_worker(self): xcvr_type = data.get('type', None) xcvr_inserted = False host_tx_ready_changed = False + admin_status_changed = False if pport < 0 or channel < 0: continue @@ -304,26 +295,48 @@ def task_worker(self): data['host_tx_ready'] = self.get_host_tx_status(lport, data['asic_id']) self.log_notice("{}: fetched DB and updated host_tx_ready={} locally".format( lport, data['host_tx_ready'])) - + # Handle the case that admin_status value in the local cache hasn't + # been updated via PortChangeEvent: + if 'admin_status' not in data: + # Fetch admin_status from CONFIG_DB (if not present in DB, + # treat it as false), and update self.port_dict + data['admin_status'] = self.get_admin_status(lport, data['asic_id']) + self.log_notice("{}: fetched DB and updated admin_status={} locally".format( + lport, data['admin_status'])) + + # Check if there's a diff between current and previous 'type' # It's a xcvr insertion case if TRANSCEIVER_INFO 'type' doesn't exist # in previous port_dict sanpshot if lport not in self.port_dict_prev or 'type' not in self.port_dict_prev[lport]: xcvr_inserted = True - # Check if there's an diff between current and previous host_tx_ready + # Check if there's a diff between current and previous host_tx_ready if (lport not in self.port_dict_prev or 'host_tx_ready' not in self.port_dict_prev[lport] or self.port_dict_prev[lport]['host_tx_ready'] != data['host_tx_ready']): host_tx_ready_changed = True - # Only proceed on xcvr insertion case or the case of - # host_tx_ready getting changed. - # Even though handle_port_update_event() filters out duplicate - # events, we still need to have this below extra check to ignore - # the event that is not related to insertrion or host_tx_ready - # change, such as CONFIG_DB related change. - if (not xcvr_inserted) and (not host_tx_ready_changed): + # Check if there's a diff between current and previous admin_status + if (lport not in self.port_dict_prev or + 'admin_status' not in self.port_dict_prev[lport] or + self.port_dict_prev[lport]['admin_status'] != data['admin_status']): + admin_status_changed = True + # Skip if neither of below cases happens: + # 1) xcvr insertion + # 2) host_tx_ready getting changed + # 3) admin_status getting changed + # In addition to handle_port_update_event()'s internal filter, + # this check serves as additional filter to ignore irrelevant + # event, such as CONFIG_DB change other than admin_status field. + if ((not xcvr_inserted) and + (not host_tx_ready_changed) and + (not admin_status_changed)): continue - self.log_notice("{}: xcvr_inserted={}, host_tx_ready_changed={}".format( - lport, xcvr_inserted, host_tx_ready_changed)) + self.log_notice(("{}: xcvr=present(inserted={}), " + "host_tx_ready={}(changed={}), " + "admin_status={}(changed={})").format( + lport, + xcvr_inserted, + data['host_tx_ready'], host_tx_ready_changed, + data['admin_status'], admin_status_changed)) # double-check the HW presence before moving forward sfp = self.platform_chassis.get_sfp(pport) @@ -361,7 +374,9 @@ def task_worker(self): # Skip if these essential routines are not available continue - target_tx_disable_flag = data['host_tx_ready'] == 'false' + # Only turn on TX if both host_tx_ready is true and admin_status is up + target_tx_disable_flag = not (data['host_tx_ready'] == 'true' + and data['admin_status'] == 'up') # get_tx_disable API returns an array of bool, with tx_disable flag on each channel. # True means tx disabled; False means tx enabled. cur_tx_disable_array = api.get_tx_disable() @@ -388,6 +403,5 @@ def task_worker(self): # Take a snapshot for port_dict, this will be used to calculate diff # later in the while loop to determine if there's really a value - # change on the fields regarding to xcvr insertion and host_tx_ready. - self.port_dict_prev = copy.deepcopy(self.port_dict) - + # change on the fields related to the events we care about. + self.port_dict_prev = copy.deepcopy(self.port_dict) \ No newline at end of file From a66831ef0c68d5c11465b8bafe882eb96c6d25f9 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Tue, 5 Sep 2023 17:33:55 -0700 Subject: [PATCH 12/22] Revert unnecessary space change on xcvrd.py --- sonic-xcvrd/xcvrd/xcvrd.py | 64 +++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 72dac0483..2e8971608 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -1097,7 +1097,7 @@ def get_cmis_media_lanes_mask(self, api, appl, lport, subport): self.log_error("Invalid input to get media lane mask - appl {} media_lane_count {} " "lport {} subport {}!".format(appl, media_lane_count, lport, subport)) return media_lanes_mask - + media_lane_start_bit = (media_lane_count * (0 if subport == 0 else subport - 1)) if media_lane_assignment_option & (1 << media_lane_start_bit): media_lanes_mask = ((1 << media_lane_count) - 1) << media_lane_start_bit @@ -1319,9 +1319,9 @@ def get_port_admin_status(self, lport): def configure_tx_output_power(self, api, lport, tx_power): min_p, max_p = api.get_supported_power_config() if tx_power < min_p: - self.log_error("{} configured tx power {} < minimum power {} supported".format(lport, tx_power, min_p)) + self.log_error("{} configured tx power {} < minimum power {} supported".format(lport, tx_power, min_p)) if tx_power > max_p: - self.log_error("{} configured tx power {} > maximum power {} supported".format(lport, tx_power, max_p)) + self.log_error("{} configured tx power {} > maximum power {} supported".format(lport, tx_power, max_p)) return api.set_tx_power(tx_power) def configure_laser_frequency(self, api, lport, freq, grid=75): @@ -1395,10 +1395,10 @@ def task_worker(self): # Handle the case when Xcvrd was NOT running when 'host_tx_ready' or 'admin_status' # was updated or this is the first run so reconcile the above two attributes if 'host_tx_ready' not in self.port_dict[lport]: - self.port_dict[lport]['host_tx_ready'] = self.get_host_tx_status(lport) + self.port_dict[lport]['host_tx_ready'] = self.get_host_tx_status(lport) if 'admin_status' not in self.port_dict[lport]: - self.port_dict[lport]['admin_status'] = self.get_port_admin_status(lport) + self.port_dict[lport]['admin_status'] = self.get_port_admin_status(lport) pport = int(info.get('index', "-1")) speed = int(info.get('speed', "0")) @@ -1438,10 +1438,10 @@ def task_worker(self): continue if api.is_coherent_module(): - if 'tx_power' not in self.port_dict[lport]: - self.port_dict[lport]['tx_power'] = self.get_configured_tx_power_from_db(lport) - if 'laser_freq' not in self.port_dict[lport]: - self.port_dict[lport]['laser_freq'] = self.get_configured_laser_freq_from_db(lport) + if 'tx_power' not in self.port_dict[lport]: + self.port_dict[lport]['tx_power'] = self.get_configured_tx_power_from_db(lport) + if 'laser_freq' not in self.port_dict[lport]: + self.port_dict[lport]['laser_freq'] = self.get_configured_laser_freq_from_db(lport) except AttributeError: # Skip if these essential routines are not available self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY @@ -1493,7 +1493,7 @@ def task_worker(self): continue host_lanes_mask = self.port_dict[lport]['host_lanes_mask'] self.log_notice("{}: Setting host_lanemask=0x{:x}".format(lport, host_lanes_mask)) - + self.port_dict[lport]['media_lane_count'] = int(api.get_media_lane_count(appl)) self.port_dict[lport]['media_lane_assignment_options'] = int(api.get_media_lane_assignment_option(appl)) media_lane_count = self.port_dict[lport]['media_lane_count'] @@ -1511,30 +1511,30 @@ def task_worker(self): if self.port_dict[lport]['host_tx_ready'] != 'true' or \ self.port_dict[lport]['admin_status'] != 'up': - self.log_notice("{} Forcing Tx laser OFF".format(lport)) - # Force DataPath re-init - api.tx_disable_channel(media_lanes_mask, True) - self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY - continue + self.log_notice("{} Forcing Tx laser OFF".format(lport)) + # Force DataPath re-init + api.tx_disable_channel(media_lanes_mask, True) + self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY + continue # Configure the target output power if ZR module if api.is_coherent_module(): - tx_power = self.port_dict[lport]['tx_power'] - # Prevent configuring same tx power multiple times - if 0 != tx_power and tx_power != api.get_tx_config_power(): - if 1 != self.configure_tx_output_power(api, lport, tx_power): - self.log_error("{} failed to configure Tx power = {}".format(lport, tx_power)) - else: - self.log_notice("{} Successfully configured Tx power = {}".format(lport, tx_power)) + tx_power = self.port_dict[lport]['tx_power'] + # Prevent configuring same tx power multiple times + if 0 != tx_power and tx_power != api.get_tx_config_power(): + if 1 != self.configure_tx_output_power(api, lport, tx_power): + self.log_error("{} failed to configure Tx power = {}".format(lport, tx_power)) + else: + self.log_notice("{} Successfully configured Tx power = {}".format(lport, tx_power)) need_update = self.is_cmis_application_update_required(api, appl, host_lanes_mask) # For ZR module, Datapath needes to be re-initlialized on new channel selection if api.is_coherent_module(): - freq = self.port_dict[lport]['laser_freq'] - # If user requested frequency is NOT the same as configured on the module - # force datapath re-initialization - if 0 != freq and freq != api.get_laser_config_freq(): - need_update = True + freq = self.port_dict[lport]['laser_freq'] + # If user requested frequency is NOT the same as configured on the module + # force datapath re-initialization + if 0 != freq and freq != api.get_laser_config_freq(): + need_update = True if not need_update: # No application updates @@ -1577,13 +1577,13 @@ def task_worker(self): continue if api.is_coherent_module(): - # For ZR module, configure the laser frequency when Datapath is in Deactivated state - freq = self.port_dict[lport]['laser_freq'] - if 0 != freq: + # For ZR module, configure the laser frequency when Datapath is in Deactivated state + freq = self.port_dict[lport]['laser_freq'] + if 0 != freq: if 1 != self.configure_laser_frequency(api, lport, freq): - self.log_error("{} failed to configure laser frequency {} GHz".format(lport, freq)) + self.log_error("{} failed to configure laser frequency {} GHz".format(lport, freq)) else: - self.log_notice("{} configured laser frequency {} GHz".format(lport, freq)) + self.log_notice("{} configured laser frequency {} GHz".format(lport, freq)) # D.1.3 Software Configuration and Initialization if not api.set_application(host_lanes_mask, appl): From 689e46b195d097a7372a929d8107f68a3e156970 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Mon, 23 Oct 2023 18:45:54 -0700 Subject: [PATCH 13/22] Add NotImplementedError catching --- sonic-xcvrd/xcvrd/sff_mgr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py index 5b4e179ad..2f32da70e 100644 --- a/sonic-xcvrd/xcvrd/sff_mgr.py +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -370,7 +370,7 @@ def task_worker(self): "{}: skipping sff_mgr due to tx_disable not supported".format( lport)) continue - except AttributeError: + except (AttributeError, NotImplementedError): # Skip if these essential routines are not available continue From a9d6fecf58841ba8276d0cfdd6e34c71b975f206 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Wed, 20 Dec 2023 00:17:41 -0800 Subject: [PATCH 14/22] Address comments and update subport handling --- sonic-xcvrd/tests/test_xcvrd.py | 15 +- sonic-xcvrd/xcvrd/sff_mgr.py | 254 ++++++++++++++++++-------------- sonic-xcvrd/xcvrd/xcvrd.py | 25 +--- 3 files changed, 156 insertions(+), 138 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index 66471d719..cacb7abc3 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -867,6 +867,7 @@ def test_DaemonXcvrd_wait_for_port_config_done(self, mock_select, mock_sub_table def test_DaemonXcvrd_run(self, mock_task_stop1, mock_task_stop2, mock_task_run1, mock_task_run2, mock_deinit, mock_init): mock_init.return_value = (PortMapping(), set()) xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER) + xcvrd.load_feature_flags = MagicMock() xcvrd.stop_event.wait = MagicMock() xcvrd.run() assert mock_task_stop1.call_count == 1 @@ -900,7 +901,7 @@ def test_SffManagerTask_handle_port_change_event(self): task.on_port_update_event(port_change_event) assert len(task.port_dict) == 0 - port_dict = {'type': 'QSFP28', 'channel': '0', 'host_tx_ready': 'false'} + port_dict = {'type': 'QSFP28', 'subport': '0', 'host_tx_ready': 'false'} port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, port_dict) task.on_port_update_event(port_change_event) assert len(task.port_dict) == 1 @@ -917,7 +918,7 @@ def test_SffManagerTask_handle_port_change_event(self): @patch.object(XcvrTableHelper, 'get_state_port_tbl', return_value=MagicMock()) def test_SffManagerTask_get_host_tx_status(self, mock_get_state_port_tbl): - mock_get_state_port_tbl.return_value.get.return_value = (True, {'host_tx_ready': 'true'}) + mock_get_state_port_tbl.return_value.hget.return_value = (True, 'true') sff_manager_task = SffManagerTask(DEFAULT_NAMESPACE, threading.Event(), @@ -927,11 +928,11 @@ def test_SffManagerTask_get_host_tx_status(self, mock_get_state_port_tbl): lport = 'Ethernet0' assert sff_manager_task.get_host_tx_status(lport, 0) == 'true' mock_get_state_port_tbl.assert_called_once_with(0) - mock_get_state_port_tbl.return_value.get.assert_called_once_with(lport) + mock_get_state_port_tbl.return_value.hget.assert_called_once_with(lport, 'host_tx_ready') @patch.object(XcvrTableHelper, 'get_cfg_port_tbl', return_value=MagicMock()) def test_SffManagerTask_get_admin_status(self, mock_get_cfg_port_tbl): - mock_get_cfg_port_tbl.return_value.get.return_value = (True, {'admin_status': 'up'}) + mock_get_cfg_port_tbl.return_value.hget.return_value = (True, 'up') sff_manager_task = SffManagerTask(DEFAULT_NAMESPACE, threading.Event(), @@ -941,7 +942,7 @@ def test_SffManagerTask_get_admin_status(self, mock_get_cfg_port_tbl): lport = 'Ethernet0' assert sff_manager_task.get_admin_status(lport, 0) == 'up' mock_get_cfg_port_tbl.assert_called_once_with(0) - mock_get_cfg_port_tbl.return_value.get.assert_called_once_with(lport) + mock_get_cfg_port_tbl.return_value.hget.assert_called_once_with(lport, 'admin_status') @patch('xcvrd.xcvrd.platform_chassis') @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', @@ -969,7 +970,7 @@ def test_SffManagerTask_task_worker(self, mock_chassis): # TX enable case: port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, { 'type': 'QSFP28', - 'channel': '0' + 'subport': '0' }) task.on_port_update_event(port_change_event) assert len(task.port_dict) == 1 @@ -2070,7 +2071,7 @@ def test_DaemonXcvrd_load_feature_flags(self, mock_json_load, mock_get_path, moc mock_json_load.assert_called_once() assert xcvrd.enable_sff_mgr == True - @patch('sonic_py_common.device_info.get_platform', MagicMock(return_value='x86_64-sonic-linux')) + @patch('sonic_py_common.device_info.get_path_to_platform_dir', MagicMock(return_value='/usr/share/sonic/platform')) @patch('os.path.isfile', MagicMock(return_value=True)) def test_DaemonXcvrd_get_platform_pmon_ctrl_file_path(self): xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER) diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py index 2f32da70e..2879857f0 100644 --- a/sonic-xcvrd/xcvrd/sff_mgr.py +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -22,6 +22,32 @@ class SffManagerTask(threading.Thread): + + # CONFIG_DB port table fields: + + ADMIN_STATUS = 'admin_status' + # This is the subport index for this logical port, starting from 1, 0 means + # all lanes are taken. + SUBPORT = 'subport' + # This is a comma separated list of lane numbers + LANES_LIST = 'lanes' + + + # STATE_DB TRANSCEIVER_INFO table fields: + + # This filed can used to determine insertion/removal event. Since + # TRANSCEIVER_INFO has the same life cycle as a transceiver, if transceiver + # is inserted/removed, TRANSCEIVER_INFO is also created/deleted. + XCVR_TYPE = 'type' + + + # STATE_DB PORT_TABLE fields: + + HOST_TX_READY = 'host_tx_ready' + + # Default number of lanes per physical port for QSFP28/QSFP+ transceiver + DEFAULT_NUM_LANES_PER_PPORT = 4 + # Subscribe to below tables in Redis DB PORT_TBL_MAP = [ { @@ -29,15 +55,13 @@ class SffManagerTask(threading.Thread): }, { 'STATE_DB': 'TRANSCEIVER_INFO', - 'FILTER': ['type'] + 'FILTER': [XCVR_TYPE] }, { 'STATE_DB': 'PORT_TABLE', - 'FILTER': ['host_tx_ready'] + 'FILTER': [HOST_TX_READY] }, ] - # Default number of channels for QSFP28/QSFP+ transceiver - DEFAULT_NUM_CHANNELS = 4 def __init__(self, namespaces, main_thread_stop_event, platform_chassis, helper_logger): threading.Thread.__init__(self) @@ -47,15 +71,9 @@ def __init__(self, namespaces, main_thread_stop_event, platform_chassis, helper_ self.main_thread_stop_event = main_thread_stop_event self.helper_logger = helper_logger self.platform_chassis = platform_chassis - # port_dict holds data per port entry with logical_port_name as key, it - # maintains local copy of the following DB fields: - # CONFIG_DB PORT_TABLE 'index', 'channel', 'admin_status' - # STATE_DB PORT_TABLE 'host_tx_ready' - # STATE_DB TRANSCEIVER_INFO 'type' - # plus 'asic_id' from PortChangeEvent.asic_id (asic_id always gets - # filled in handle_port_update_event function based on asic_context) + # port_dict holds data obtained from on_port_update_event per port entry + # with logical_port_name as key. # Its port entry will get deleted upon CONFIG_DB PORT_TABLE DEL. - # Port entry's 'type' field will get deleted upon STATE_DB TRANSCEIVER_INFO DEL. self.port_dict = {} # port_dict snapshot captured in the previous event update loop self.port_dict_prev = {} @@ -71,6 +89,34 @@ def log_warning(self, message): def log_error(self, message): self.helper_logger.log_error("SFF: {}".format(message)) + def get_active_lanes_for_lport(self, lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport): + """ + Get the active lanes for a logical port based on the subport index. + + Args: + lport (str): Logical port name. + subport_idx (int): Subport index. + num_lanes_per_lport (int): Number of lanes per logical port. + num_lanes_per_pport (int): Number of lanes per physical port. + + Returns: + list: A list of boolean values, where True means the corresponding + lane is active. + """ + if subport_idx < 0 or subport_idx > self.DEFAULT_NUM_LANES_PER_PPORT: + self.log_error("{}: Invalid subport index {}".format(lport, subport_idx)) + return None + + if subport_idx == 0: + lanes = [True] * num_lanes_per_pport + else: + lanes = [False] * num_lanes_per_pport + + start = (subport_idx - 1) * num_lanes_per_lport + end = subport_idx * num_lanes_per_lport + lanes[start:end] = [True] * (end - start) + return lanes + def on_port_update_event(self, port_change_event): if (port_change_event.event_type not in [port_change_event.PORT_SET, port_change_event.PORT_DEL]): @@ -96,32 +142,32 @@ def on_port_update_event(self, port_change_event): self.port_dict[lport] = {} if pport >= 0: self.port_dict[lport]['index'] = pport - # This field comes from CONFIG_DB PORT_TABLE. This is the channel - # that blongs to this logical port, 0 means all channels. tx_disable - # API needs to know which channels to disable/enable for a - # particular physical port. - if 'channel' in port_change_event.port_dict: - self.port_dict[lport]['channel'] = port_change_event.port_dict['channel'] - # This field comes from STATE_DB PORT_TABLE - if 'host_tx_ready' in port_change_event.port_dict: - self.port_dict[lport]['host_tx_ready'] = \ - port_change_event.port_dict['host_tx_ready'] - # This field comes from CONFIG_DB PORT_TABLE - if 'admin_status' in port_change_event.port_dict and \ + + if self.SUBPORT in port_change_event.port_dict and \ + self.LANES_LIST in port_change_event.port_dict: + subport_idx = int(port_change_event.port_dict[self.SUBPORT]) + lanes_list = port_change_event.port_dict[self.LANES_LIST].split(',') + active_lanes = self.get_active_lanes_for_lport(lport, subport_idx, + len(lanes_list), + self.DEFAULT_NUM_LANES_PER_PPORT) + if active_lanes is not None: + self.port_dict[lport]['active_lanes'] = active_lanes + + if self.HOST_TX_READY in port_change_event.port_dict: + self.port_dict[lport][self.HOST_TX_READY] = \ + port_change_event.port_dict[self.HOST_TX_READY] + + if self.ADMIN_STATUS in port_change_event.port_dict and \ port_change_event.db_name and \ port_change_event.db_name == 'CONFIG_DB': # Only consider admin_status from CONFIG_DB. # Ignore admin_status from STATE_DB, which may have # different value. - self.port_dict[lport]['admin_status'] = \ - port_change_event.port_dict['admin_status'] - # This field comes from STATE_DB TRANSCEIVER_INFO table. - # TRANSCEIVER_INFO has the same life cycle as a transceiver, if - # transceiver is inserted/removed, TRANSCEIVER_INFO is also - # created/deleted. Thus this filed can used to determine - # insertion/removal event. - if 'type' in port_change_event.port_dict: - self.port_dict[lport]['type'] = port_change_event.port_dict['type'] + self.port_dict[lport][self.ADMIN_STATUS] = \ + port_change_event.port_dict[self.ADMIN_STATUS] + + if self.XCVR_TYPE in port_change_event.port_dict: + self.port_dict[lport][self.XCVR_TYPE] = port_change_event.port_dict[self.XCVR_TYPE] self.port_dict[lport]['asic_id'] = asic_id # CONFIG_DB PORT_TABLE DEL case: elif port_change_event.db_name and \ @@ -133,19 +179,19 @@ def on_port_update_event(self, port_change_event): elif port_change_event.table_name and \ port_change_event.table_name == 'TRANSCEIVER_INFO': # TRANSCEIVER_INFO DEL corresponds to transceiver removal (not - # port/interface removal), in this case, remove 'type' field from + # port/interface removal), in this case, remove XCVR_TYPE field from # self.port_dict - if lport in self.port_dict and 'type' in self.port_dict[lport]: - del self.port_dict[lport]['type'] + if lport in self.port_dict and self.XCVR_TYPE in self.port_dict[lport]: + del self.port_dict[lport][self.XCVR_TYPE] def get_host_tx_status(self, lport, asic_index): host_tx_ready = 'false' state_port_tbl = self.xcvr_table_helper.get_state_port_tbl(asic_index) - found, port_info = state_port_tbl.get(lport) - if found and 'host_tx_ready' in dict(port_info): - host_tx_ready = dict(port_info)['host_tx_ready'] + found, value = state_port_tbl.hget(lport, self.HOST_TX_READY) + if found: + host_tx_ready = value return host_tx_ready def get_admin_status(self, lport, asic_index): @@ -153,9 +199,9 @@ def get_admin_status(self, lport, asic_index): cfg_port_tbl = self.xcvr_table_helper.get_cfg_port_tbl(asic_index) - found, port_info = cfg_port_tbl.get(lport) - if found and 'admin_status' in dict(port_info): - admin_status = dict(port_info)['admin_status'] + found, value = cfg_port_tbl.hget(lport, self.ADMIN_STATUS) + if found: + admin_status = value return admin_status def run(self): @@ -182,28 +228,24 @@ def join(self): if self.exc: raise self.exc - def calculate_tx_disable_delta_array(self, cur_tx_disable_array, tx_disable_flag, channel): + def calculate_tx_disable_delta_array(self, cur_tx_disable_array, tx_disable_flag, active_lanes): """ - Calculate the delta between the current transmitter (TX) disable array - and a new TX disable flag for a specific channel. The function returns a - new array (delta_array) where each entry indicates whether there's a - difference for each corresponding channel in the TX disable array. + Calculate the delta array between current tx_disable array and the target tx_disable flag. Args: - cur_tx_disable_array (list): Current array of TX disable flags for all channels. - tx_disable_flag (bool): The new TX disable flag that needs to be compared - with the current flags. - channel (int): The specific channel that needs to be checked. If channel is 0, all - channels are checked. + cur_tx_disable_array (list): An array of boolean values, where True means the + corresponding lane is disabled. + tx_disable_flag (bool): The target tx_disable flag. + active_lanes (list): An array of boolean values, where True means the + corresponding lane is active for this logical port. Returns: - list: A boolean array where each entry indicates whether there's a - change for the corresponding channel in the TX disable array. - True means there's a change, False means no change. + list: A list of boolean values, where True means the corresponding + lane needs to be changed. """ delta_array = [] - for i, cur_flag in enumerate(cur_tx_disable_array): - is_different = (tx_disable_flag != cur_flag) if channel in [i + 1, 0] else False + for active, cur_flag in zip(active_lanes, cur_tx_disable_array): + is_different = (tx_disable_flag != cur_flag) if active else False delta_array.append(is_different) return delta_array @@ -227,24 +269,20 @@ def convert_bool_array_to_bit_mask(self, bool_array): def task_worker(self): ''' - The goal of sff_mgr is to make sure SFF compliant modules are brought up - in a deterministc way, meaning TX is enabled only after host_tx_ready - becomes True, and TX will be disabled when host_tx_ready becomes False. - This will help eliminate link stability issue and potential interface - flap, also turning off TX reduces the power consumption and avoid any - lab hazard for admin shut interface. - - Platform can decide whether to enable sff_mgr via platform - enable_sff_mgr flag. If enable_sff_mgr is False or not present, sff_mgr - will not run. By default, it's disabled. - - There is a pre-requisite for the platforms that - enable this sff_mgr feature: platform needs to keep TX in disabled state - after module coming out-of-reset, in either module insertion or bootup - cases. This is to make sure the module is not transmitting with TX - enabled before host_tx_ready is True. No impact for the platforms in - current deployment (since they don't enable it explictly.) - + The main goal of sff_mgr is to make sure SFF compliant modules are + brought up in a deterministc way, meaning TX is enabled only after + host_tx_ready becomes True, and TX will be disabled when host_tx_ready + becomes False. This will help eliminate link stability issue and + potential interface flap, also turning off TX reduces the power + consumption and avoid any lab hazard for admin shut interface. + + Platform can decide whether to enable sff_mgr. By default, it's disabled. + + Pre-requisite for platform to enable sff_mgr: + platform needs to keep TX in disabled state after module coming + out-of-reset, in either module insertion or bootup cases. This is to + make sure the module is not transmitting with TX enabled before + host_tx_ready is True. ''' # CONFIG updates, and STATE_DB for insertion/removal, and host_tx_ready change @@ -266,58 +304,58 @@ def task_worker(self): # In the case of no real update, go back to the beginning of the loop continue - for lport in list(self.port_dict.keys()): + for lport in self.port_dict: if self.task_stopping_event.is_set(): break data = self.port_dict[lport] pport = int(data.get('index', '-1')) - channel = int(data.get('channel', '0')) - xcvr_type = data.get('type', None) + active_lanes = data.get('active_lanes', [True] * self.DEFAULT_NUM_LANES_PER_PPORT) + xcvr_type = data.get(self.XCVR_TYPE, None) xcvr_inserted = False host_tx_ready_changed = False admin_status_changed = False - if pport < 0 or channel < 0: + if pport < 0: continue if xcvr_type is None: - # TRANSCEIVER_INFO table's 'type' is not ready, meaning xcvr is not present + # TRANSCEIVER_INFO table's XCVR_TYPE is not ready, meaning xcvr is not present continue - # Procced only for 100G/40G + # Procced only for QSFP28/QSFP+ transceiver if not (xcvr_type.startswith('QSFP28') or xcvr_type.startswith('QSFP+')): continue # Handle the case that host_tx_ready value in the local cache hasn't # been updated via PortChangeEvent: - if 'host_tx_ready' not in data: + if self.HOST_TX_READY not in data: # Fetch host_tx_ready status from STATE_DB (if not present # in DB, treat it as false), and update self.port_dict - data['host_tx_ready'] = self.get_host_tx_status(lport, data['asic_id']) + data[self.HOST_TX_READY] = self.get_host_tx_status(lport, data['asic_id']) self.log_notice("{}: fetched DB and updated host_tx_ready={} locally".format( - lport, data['host_tx_ready'])) + lport, data[self.HOST_TX_READY])) # Handle the case that admin_status value in the local cache hasn't # been updated via PortChangeEvent: - if 'admin_status' not in data: + if self.ADMIN_STATUS not in data: # Fetch admin_status from CONFIG_DB (if not present in DB, # treat it as false), and update self.port_dict - data['admin_status'] = self.get_admin_status(lport, data['asic_id']) + data[self.ADMIN_STATUS] = self.get_admin_status(lport, data['asic_id']) self.log_notice("{}: fetched DB and updated admin_status={} locally".format( - lport, data['admin_status'])) + lport, data[self.ADMIN_STATUS])) - # Check if there's a diff between current and previous 'type' - # It's a xcvr insertion case if TRANSCEIVER_INFO 'type' doesn't exist - # in previous port_dict sanpshot - if lport not in self.port_dict_prev or 'type' not in self.port_dict_prev[lport]: + # Check if there's a diff between current and previous XCVR_TYPE + # It's a xcvr insertion case if TRANSCEIVER_INFO XCVR_TYPE doesn't exist + # in previous port_dict snapshot + if lport not in self.port_dict_prev or self.XCVR_TYPE not in self.port_dict_prev[lport]: xcvr_inserted = True # Check if there's a diff between current and previous host_tx_ready if (lport not in self.port_dict_prev or - 'host_tx_ready' not in self.port_dict_prev[lport] or - self.port_dict_prev[lport]['host_tx_ready'] != data['host_tx_ready']): + self.HOST_TX_READY not in self.port_dict_prev[lport] or + self.port_dict_prev[lport][self.HOST_TX_READY] != data[self.HOST_TX_READY]): host_tx_ready_changed = True # Check if there's a diff between current and previous admin_status if (lport not in self.port_dict_prev or - 'admin_status' not in self.port_dict_prev[lport] or - self.port_dict_prev[lport]['admin_status'] != data['admin_status']): + self.ADMIN_STATUS not in self.port_dict_prev[lport] or + self.port_dict_prev[lport][self.ADMIN_STATUS] != data[self.ADMIN_STATUS]): admin_status_changed = True # Skip if neither of below cases happens: # 1) xcvr insertion @@ -335,14 +373,14 @@ def task_worker(self): "admin_status={}(changed={})").format( lport, xcvr_inserted, - data['host_tx_ready'], host_tx_ready_changed, - data['admin_status'], admin_status_changed)) + data[self.HOST_TX_READY], host_tx_ready_changed, + data[self.ADMIN_STATUS], admin_status_changed)) # double-check the HW presence before moving forward sfp = self.platform_chassis.get_sfp(pport) if not sfp.get_presence(): self.log_error("{}: module not present!".format(lport)) - del self.port_dict[lport] + del self.port_dict[lport][self.XCVR_TYPE] continue try: # Skip if XcvrApi is not supported @@ -375,9 +413,9 @@ def task_worker(self): continue # Only turn on TX if both host_tx_ready is true and admin_status is up - target_tx_disable_flag = not (data['host_tx_ready'] == 'true' - and data['admin_status'] == 'up') - # get_tx_disable API returns an array of bool, with tx_disable flag on each channel. + target_tx_disable_flag = not (data[self.HOST_TX_READY] == 'true' + and data[self.ADMIN_STATUS] == 'up') + # get_tx_disable API returns an array of bool, with tx_disable flag on each lane. # True means tx disabled; False means tx enabled. cur_tx_disable_array = api.get_tx_disable() if cur_tx_disable_array is None: @@ -385,23 +423,23 @@ def task_worker(self): # If reading current tx_disable/enable value failed (could be due to # read error), then set this variable to the opposite value of # target_tx_disable_flag, to let detla array to be True on - # all the interested channels, to try best-effort TX disable/enable. - cur_tx_disable_array = [not target_tx_disable_flag] * self.DEFAULT_NUM_CHANNELS - # Get an array of bool, where it's True only on the channels that need change. + # all the interested lanes, to try best-effort TX disable/enable. + cur_tx_disable_array = [not target_tx_disable_flag] * self.DEFAULT_NUM_LANES_PER_PPORT + # Get an array of bool, where it's True only on the lanes that need change. delta_array = self.calculate_tx_disable_delta_array(cur_tx_disable_array, - target_tx_disable_flag, channel) + target_tx_disable_flag, active_lanes) mask = self.convert_bool_array_to_bit_mask(delta_array) if mask == 0: self.log_notice("{}: No change is needed for tx_disable value".format(lport)) continue if api.tx_disable_channel(mask, target_tx_disable_flag): - self.log_notice("{}: TX was {} with channel mask: {}".format( + self.log_notice("{}: TX was {} with lanes mask: {}".format( lport, "disabled" if target_tx_disable_flag else "enabled", bin(mask))) else: - self.log_error("{}: Failed to {} TX with channel mask: {}".format( + self.log_error("{}: Failed to {} TX with lanes mask: {}".format( lport, "disable" if target_tx_disable_flag else "enable", bin(mask))) # Take a snapshot for port_dict, this will be used to calculate diff # later in the while loop to determine if there's really a value # change on the fields related to the events we care about. - self.port_dict_prev = copy.deepcopy(self.port_dict) \ No newline at end of file + self.port_dict_prev = copy.deepcopy(self.port_dict) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 482d2f832..e9e04f2b5 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -88,9 +88,6 @@ POWER_UNIT = 'dBm' BIAS_UNIT = 'mA' -USR_SHARE_SONIC_PATH = "/usr/share/sonic" -HOST_DEVICE_PATH = USR_SHARE_SONIC_PATH + "/device" -CONTAINER_PLATFORM_PATH = USR_SHARE_SONIC_PATH + "/platform" PLATFORM_PMON_CTRL_FILENAME = "pmon_daemon_control.json" ENABLE_SFF_MGR_FLAG_NAME = "enable_xcvrd_sff_mgr" @@ -2326,28 +2323,10 @@ def get_platform_pmon_ctrl_file_path(self): """ Get the platform PMON control file path. - The method generates a list of potential file paths, prioritizing the container platform path. - It then checks these paths in order to find an existing file. - Returns: - str: The first found platform PMON control file path. - None: If no file path exists. + str: The platform PMON control file path. """ - platform_env_conf_path_candidates = [] - - platform_env_conf_path_candidates.append( - os.path.join(CONTAINER_PLATFORM_PATH, PLATFORM_PMON_CTRL_FILENAME)) - - platform = device_info.get_platform() - if platform: - platform_env_conf_path_candidates.append( - os.path.join(HOST_DEVICE_PATH, platform, PLATFORM_PMON_CTRL_FILENAME)) - - for platform_env_conf_file_path in platform_env_conf_path_candidates: - if os.path.isfile(platform_env_conf_file_path): - return platform_env_conf_file_path - - return None + return device_info.get_path_to_platform_dir() + '/' + PLATFORM_PMON_CTRL_FILENAME # Run daemon From 4a56969c64d20b14fe98877eaa69cf85edf08d6d Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Wed, 20 Dec 2023 13:28:08 -0800 Subject: [PATCH 15/22] Add unit tests for SffManagerTask.get_active_lanes_for_lport() method --- sonic-xcvrd/tests/test_xcvrd.py | 74 +++++++++++++++++++++++++++++++++ sonic-xcvrd/xcvrd/sff_mgr.py | 16 ++++--- 2 files changed, 84 insertions(+), 6 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index cacb7abc3..9fa7384d1 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -916,6 +916,80 @@ def test_SffManagerTask_handle_port_change_event(self): task.on_port_update_event(port_change_event) assert len(task.port_dict) == 0 + def test_SffManagerTask_get_active_lanes_for_lport(self): + sff_manager_task = SffManagerTask(DEFAULT_NAMESPACE, + threading.Event(), + MagicMock(), + helper_logger) + + lport = 'Ethernet0' + + subport_idx = 3 + num_lanes_per_lport = 1 + num_lanes_per_pport = 4 + expected_result = [False, False, True, False] + result = sff_manager_task.get_active_lanes_for_lport(lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport) + assert result == expected_result + + subport_idx = 1 + num_lanes_per_lport = 2 + num_lanes_per_pport = 4 + expected_result = [True, True, False, False] + result = sff_manager_task.get_active_lanes_for_lport(lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport) + assert result == expected_result + + subport_idx = 1 + num_lanes_per_lport = 2 + num_lanes_per_pport = 4 + expected_result = [True, True, False, False] + result = sff_manager_task.get_active_lanes_for_lport(lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport) + assert result == expected_result + + subport_idx = 2 + num_lanes_per_lport = 2 + num_lanes_per_pport = 4 + expected_result = [False, False, True, True] + result = sff_manager_task.get_active_lanes_for_lport(lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport) + assert result == expected_result + + subport_idx = 0 + num_lanes_per_lport = 4 + num_lanes_per_pport = 4 + expected_result = [True, True, True, True] + result = sff_manager_task.get_active_lanes_for_lport(lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport) + assert result == expected_result + + # Test with larger number of lanes per port (not real use case) + subport_idx = 1 + num_lanes_per_lport = 4 + num_lanes_per_pport = 32 + expected_result = [True, True, True, True, False, False, False, False, + False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False, False] + result = sff_manager_task.get_active_lanes_for_lport(lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport) + assert result == expected_result + + def test_SffManagerTask_get_active_lanes_for_lport_with_invalid_input(self): + sff_manager_task = SffManagerTask(DEFAULT_NAMESPACE, + threading.Event(), + MagicMock(), + helper_logger) + + lport = 'Ethernet0' + + subport_idx = -1 + num_lanes_per_lport = 4 + num_lanes_per_pport = 32 + result = sff_manager_task.get_active_lanes_for_lport(lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport) + assert result is None + + subport_idx = 5 + num_lanes_per_lport = 1 + num_lanes_per_pport = 4 + result = sff_manager_task.get_active_lanes_for_lport(lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport) + assert result is None + @patch.object(XcvrTableHelper, 'get_state_port_tbl', return_value=MagicMock()) def test_SffManagerTask_get_host_tx_status(self, mock_get_state_port_tbl): mock_get_state_port_tbl.return_value.hget.return_value = (True, 'true') diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py index 2879857f0..14d9889c0 100644 --- a/sonic-xcvrd/xcvrd/sff_mgr.py +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -95,7 +95,7 @@ def get_active_lanes_for_lport(self, lport, subport_idx, num_lanes_per_lport, nu Args: lport (str): Logical port name. - subport_idx (int): Subport index. + subport_idx (int): Subport index, starting from 1. 0 means all lanes are taken. num_lanes_per_lport (int): Number of lanes per logical port. num_lanes_per_pport (int): Number of lanes per physical port. @@ -103,18 +103,22 @@ def get_active_lanes_for_lport(self, lport, subport_idx, num_lanes_per_lport, nu list: A list of boolean values, where True means the corresponding lane is active. """ - if subport_idx < 0 or subport_idx > self.DEFAULT_NUM_LANES_PER_PPORT: - self.log_error("{}: Invalid subport index {}".format(lport, subport_idx)) + if subport_idx < 0 or subport_idx > num_lanes_per_pport // num_lanes_per_lport: + self.log_error( + f"{lport}: Invalid subport_idx {subport_idx} " + f"for num_lanes_per_lport={num_lanes_per_lport}, " + f"num_lanes_per_pport={num_lanes_per_pport}" + ) return None if subport_idx == 0: lanes = [True] * num_lanes_per_pport else: lanes = [False] * num_lanes_per_pport + start = (subport_idx - 1) * num_lanes_per_lport + end = subport_idx * num_lanes_per_lport + lanes[start:end] = [True] * (end - start) - start = (subport_idx - 1) * num_lanes_per_lport - end = subport_idx * num_lanes_per_lport - lanes[start:end] = [True] * (end - start) return lanes def on_port_update_event(self, port_change_event): From 5515851c2e6447c3aacb35ff88e3bd901306154a Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Fri, 22 Dec 2023 21:03:14 -0800 Subject: [PATCH 16/22] Move get_active_lanes_for_lport after checking xcvr_type --- sonic-xcvrd/tests/test_xcvrd.py | 3 ++- sonic-xcvrd/xcvrd/sff_mgr.py | 33 +++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index 9fa7384d1..35daa757c 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -1044,7 +1044,8 @@ def test_SffManagerTask_task_worker(self, mock_chassis): # TX enable case: port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET, { 'type': 'QSFP28', - 'subport': '0' + 'subport': '0', + 'lanes': '1,2,3,4', }) task.on_port_update_event(port_change_event) assert len(task.port_dict) == 1 diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py index 14d9889c0..ce2a8bf10 100644 --- a/sonic-xcvrd/xcvrd/sff_mgr.py +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -101,7 +101,9 @@ def get_active_lanes_for_lport(self, lport, subport_idx, num_lanes_per_lport, nu Returns: list: A list of boolean values, where True means the corresponding - lane is active. + lane belongs to this logical port. For example, [True, True, + False, False] means the first two lanes on this physical port + belong to this logical port. """ if subport_idx < 0 or subport_idx > num_lanes_per_pport // num_lanes_per_lport: self.log_error( @@ -149,13 +151,9 @@ def on_port_update_event(self, port_change_event): if self.SUBPORT in port_change_event.port_dict and \ self.LANES_LIST in port_change_event.port_dict: - subport_idx = int(port_change_event.port_dict[self.SUBPORT]) - lanes_list = port_change_event.port_dict[self.LANES_LIST].split(',') - active_lanes = self.get_active_lanes_for_lport(lport, subport_idx, - len(lanes_list), - self.DEFAULT_NUM_LANES_PER_PPORT) - if active_lanes is not None: - self.port_dict[lport]['active_lanes'] = active_lanes + self.port_dict[lport][self.SUBPORT] = port_change_event.port_dict[self.SUBPORT] + self.port_dict[lport][self.LANES_LIST] = \ + port_change_event.port_dict[self.LANES_LIST].split(',') if self.HOST_TX_READY in port_change_event.port_dict: self.port_dict[lport][self.HOST_TX_READY] = \ @@ -313,12 +311,16 @@ def task_worker(self): break data = self.port_dict[lport] pport = int(data.get('index', '-1')) - active_lanes = data.get('active_lanes', [True] * self.DEFAULT_NUM_LANES_PER_PPORT) + subport_idx = int(data.get(self.SUBPORT, '0')) + lanes_list = data.get(self.LANES_LIST, None) + # active_lanes is a list of boolean values, where True means the + # corresponding lane belongs to this logical port. + active_lanes = data.get('active_lanes', None) xcvr_type = data.get(self.XCVR_TYPE, None) xcvr_inserted = False host_tx_ready_changed = False admin_status_changed = False - if pport < 0: + if pport < 0 or lanes_list is None: continue if xcvr_type is None: @@ -416,6 +418,17 @@ def task_worker(self): # Skip if these essential routines are not available continue + if active_lanes is None: + active_lanes = self.get_active_lanes_for_lport(lport, subport_idx, + len(lanes_list), + self.DEFAULT_NUM_LANES_PER_PPORT) + if active_lanes is None: + self.log_error("{}: skipping sff_mgr due to " + "failing to get active lanes".format(lport)) + continue + # Save active_lanes in self.port_dict + self.port_dict[lport]['active_lanes'] = active_lanes + # Only turn on TX if both host_tx_ready is true and admin_status is up target_tx_disable_flag = not (data[self.HOST_TX_READY] == 'true' and data[self.ADMIN_STATUS] == 'up') From e83f9bd407946ba1bd4d584e55413d225ef0ed6e Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Mon, 8 Jan 2024 11:46:32 -0800 Subject: [PATCH 17/22] Update on_port_update_event checks and log format --- sonic-xcvrd/xcvrd/sff_mgr.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py index ce2a8bf10..d710c6497 100644 --- a/sonic-xcvrd/xcvrd/sff_mgr.py +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -59,7 +59,7 @@ class SffManagerTask(threading.Thread): }, { 'STATE_DB': 'PORT_TABLE', - 'FILTER': [HOST_TX_READY] + 'FILTER': [HOST_TX_READY] # This also filters out unwanted 'admin_status' from STATE_DB. }, ] @@ -107,9 +107,12 @@ def get_active_lanes_for_lport(self, lport, subport_idx, num_lanes_per_lport, nu """ if subport_idx < 0 or subport_idx > num_lanes_per_pport // num_lanes_per_lport: self.log_error( - f"{lport}: Invalid subport_idx {subport_idx} " - f"for num_lanes_per_lport={num_lanes_per_lport}, " - f"num_lanes_per_pport={num_lanes_per_pport}" + "{}: Invalid subport_idx {} " + "for num_lanes_per_lport={}, " + "num_lanes_per_pport={}".format(lport, + subport_idx, + num_lanes_per_lport, + num_lanes_per_pport) ) return None @@ -159,12 +162,7 @@ def on_port_update_event(self, port_change_event): self.port_dict[lport][self.HOST_TX_READY] = \ port_change_event.port_dict[self.HOST_TX_READY] - if self.ADMIN_STATUS in port_change_event.port_dict and \ - port_change_event.db_name and \ - port_change_event.db_name == 'CONFIG_DB': - # Only consider admin_status from CONFIG_DB. - # Ignore admin_status from STATE_DB, which may have - # different value. + if self.ADMIN_STATUS in port_change_event.port_dict: self.port_dict[lport][self.ADMIN_STATUS] = \ port_change_event.port_dict[self.ADMIN_STATUS] @@ -208,7 +206,7 @@ def get_admin_status(self, lport, asic_index): def run(self): if self.platform_chassis is None: - self.log_notice("Platform chassis is not available, stopping...") + self.log_error("Platform chassis is not available, stopping...") return try: From e03e8dd42e951be5710866b047ac3808ca50752b Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Tue, 6 Feb 2024 18:11:44 -0800 Subject: [PATCH 18/22] Change to get enable_sff_mgr flag from process argument --- sonic-xcvrd/tests/test_xcvrd.py | 18 ------------------ sonic-xcvrd/xcvrd/xcvrd.py | 32 ++++---------------------------- 2 files changed, 4 insertions(+), 46 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index 35daa757c..789277d4a 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -2135,24 +2135,6 @@ def test_DaemonXcvrd_init_deinit_fastboot_enabled(self): xcvrd.init() xcvrd.deinit() - @patch('builtins.open', new_callable=MagicMock) - @patch('xcvrd.xcvrd.DaemonXcvrd.get_platform_pmon_ctrl_file_path', return_value="dummy_path") - @patch('json.load', return_value={ENABLE_SFF_MGR_FLAG_NAME: True}) - def test_DaemonXcvrd_load_feature_flags(self, mock_json_load, mock_get_path, mock_open): - xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER) - xcvrd.load_feature_flags() - mock_get_path.assert_called_once() - mock_open.assert_called_once_with('dummy_path') - mock_json_load.assert_called_once() - assert xcvrd.enable_sff_mgr == True - - @patch('sonic_py_common.device_info.get_path_to_platform_dir', MagicMock(return_value='/usr/share/sonic/platform')) - @patch('os.path.isfile', MagicMock(return_value=True)) - def test_DaemonXcvrd_get_platform_pmon_ctrl_file_path(self): - xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER) - assert xcvrd.get_platform_pmon_ctrl_file_path( - ) == '/usr/share/sonic/platform/pmon_daemon_control.json' - def wait_until(total_wait_time, interval, call_back, *args, **kwargs): wait_time = 0 while wait_time <= total_wait_time: diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index e9e04f2b5..54d987ea9 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -88,9 +88,6 @@ POWER_UNIT = 'dBm' BIAS_UNIT = 'mA' -PLATFORM_PMON_CTRL_FILENAME = "pmon_daemon_control.json" -ENABLE_SFF_MGR_FLAG_NAME = "enable_xcvrd_sff_mgr" - g_dict = {} # Global platform specific sfputil class instance platform_sfputil = None @@ -2182,11 +2179,12 @@ def retry_eeprom_reading(self): class DaemonXcvrd(daemon_base.DaemonBase): - def __init__(self, log_identifier, skip_cmis_mgr=False): + def __init__(self, log_identifier, skip_cmis_mgr=False, enable_sff_mgr=False): super(DaemonXcvrd, self).__init__(log_identifier) self.stop_event = threading.Event() self.sfp_error_event = threading.Event() self.skip_cmis_mgr = skip_cmis_mgr + self.enable_sff_mgr = enable_sff_mgr self.enable_sff_mgr = False self.namespaces = [''] self.threads = [] @@ -2306,28 +2304,6 @@ def deinit(self): del globals()['platform_chassis'] - def load_feature_flags(self): - """ - Load feature enable/skip flags from platform files. - """ - platform_pmon_ctrl_file_path = self.get_platform_pmon_ctrl_file_path() - if platform_pmon_ctrl_file_path is not None: - # Load the JSON file - with open(platform_pmon_ctrl_file_path) as file: - data = json.load(file) - if ENABLE_SFF_MGR_FLAG_NAME in data: - enable_sff_mgr = data[ENABLE_SFF_MGR_FLAG_NAME] - self.enable_sff_mgr = isinstance(enable_sff_mgr, bool) and enable_sff_mgr - - def get_platform_pmon_ctrl_file_path(self): - """ - Get the platform PMON control file path. - - Returns: - str: The platform PMON control file path. - """ - return device_info.get_path_to_platform_dir() + '/' + PLATFORM_PMON_CTRL_FILENAME - # Run daemon def run(self): @@ -2336,7 +2312,6 @@ def run(self): # Start daemon initialization sequence port_mapping_data = self.init() - self.load_feature_flags() # Start the SFF manager sff_manager = None if self.enable_sff_mgr: @@ -2423,9 +2398,10 @@ def run(self): def main(): parser = argparse.ArgumentParser() parser.add_argument('--skip_cmis_mgr', action='store_true') + parser.add_argument('--enable_sff_mgr', action='store_true') args = parser.parse_args() - xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER, args.skip_cmis_mgr) + xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER, args.skip_cmis_mgr, args.enable_sff_mgr) xcvrd.run() From e3dcff25c07a0ba2c96b2f135800e755b26ef406 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Tue, 6 Feb 2024 18:46:03 -0800 Subject: [PATCH 19/22] Fix typo --- sonic-xcvrd/xcvrd/xcvrd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 54d987ea9..9caf23b01 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -2185,7 +2185,6 @@ def __init__(self, log_identifier, skip_cmis_mgr=False, enable_sff_mgr=False): self.sfp_error_event = threading.Event() self.skip_cmis_mgr = skip_cmis_mgr self.enable_sff_mgr = enable_sff_mgr - self.enable_sff_mgr = False self.namespaces = [''] self.threads = [] From 51d16c3e2defda47f225fdc0f3dcdd47524d9587 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Sat, 24 Feb 2024 21:43:58 -0800 Subject: [PATCH 20/22] Resolve conflicts --- sonic-xcvrd/tests/test_xcvrd.py | 9 +++------ sonic-xcvrd/xcvrd/sff_mgr.py | 36 ++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index 71ad057c3..18224318f 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -138,8 +138,7 @@ class TestXcvrdThreadException(object): - @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', - MagicMock(side_effect = NotImplementedError)) + @patch('xcvrd.sff_mgr.PortChangeObserver', MagicMock(side_effect=NotImplementedError)) def test_SffManagerTask_task_run_with_exception(self): stop_event = threading.Event() sff_mgr = SffManagerTask(DEFAULT_NAMESPACE, stop_event, MagicMock(), helper_logger) @@ -156,7 +155,7 @@ def test_SffManagerTask_task_run_with_exception(self): assert(type(exception_received) == NotImplementedError) assert("NotImplementedError" in str(trace) and "effect" in str(trace)) assert("sonic-xcvrd/xcvrd/sff_mgr.py" in str(trace)) - assert("subscribe_port_update_event" in str(trace)) + assert("PortChangeObserver" in str(trace)) @patch('xcvrd.xcvrd.platform_chassis', MagicMock()) def test_CmisManagerTask_task_run_with_exception(self): @@ -1208,9 +1207,7 @@ def test_SffManagerTask_get_admin_status(self, mock_get_cfg_port_tbl): mock_get_cfg_port_tbl.return_value.hget.assert_called_once_with(lport, 'admin_status') @patch('xcvrd.xcvrd.platform_chassis') - @patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', - MagicMock(return_value=(None, None))) - @patch('xcvrd.xcvrd_utilities.port_mapping.handle_port_update_event', MagicMock(return_value=True)) + @patch('xcvrd.sff_mgr.PortChangeObserver', MagicMock(handle_port_update_event=MagicMock())) def test_SffManagerTask_task_worker(self, mock_chassis): mock_xcvr_api = MagicMock() mock_xcvr_api.tx_disable_channel = MagicMock(return_value=True) diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py index d710c6497..e1e307024 100644 --- a/sonic-xcvrd/xcvrd/sff_mgr.py +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -12,15 +12,28 @@ from swsscommon import swsscommon - from .xcvrd_utilities import port_mapping + from .xcvrd_utilities.port_event_helper import PortChangeObserver from .xcvrd_utilities.xcvr_table_helper import XcvrTableHelper except ImportError as e: raise ImportError(str(e) + " - required module not found") -# Thread wrapper class for SFF compliant transceiver management +class SffLoggerForPortUpdateEvent: + SFF_LOGGER_PREFIX = "SFF-PORT-UPDATE: " + + def __init__(self, logger): + self.logger = logger + def log_notice(self, message): + self.logger.log_notice("{}{}".format(self.SFF_LOGGER_PREFIX, message)) + + def log_warning(self, message): + self.logger.log_warning("{}{}".format(self.SFF_LOGGER_PREFIX, message)) + def log_error(self, message): + self.logger.log_error("{}{}".format(self.SFF_LOGGER_PREFIX, message)) + +# Thread wrapper class for SFF compliant transceiver management class SffManagerTask(threading.Thread): # CONFIG_DB port table fields: @@ -63,6 +76,8 @@ class SffManagerTask(threading.Thread): }, ] + SFF_LOGGER_PREFIX = "SFF-MAIN: " + def __init__(self, namespaces, main_thread_stop_event, platform_chassis, helper_logger): threading.Thread.__init__(self) self.name = "SffManagerTask" @@ -70,6 +85,7 @@ def __init__(self, namespaces, main_thread_stop_event, platform_chassis, helper_ self.task_stopping_event = threading.Event() self.main_thread_stop_event = main_thread_stop_event self.helper_logger = helper_logger + self.logger_for_port_update_event = SffLoggerForPortUpdateEvent(helper_logger) self.platform_chassis = platform_chassis # port_dict holds data obtained from on_port_update_event per port entry # with logical_port_name as key. @@ -81,13 +97,13 @@ def __init__(self, namespaces, main_thread_stop_event, platform_chassis, helper_ self.namespaces = namespaces def log_notice(self, message): - self.helper_logger.log_notice("SFF: {}".format(message)) + self.helper_logger.log_notice("{}{}".format(self.SFF_LOGGER_PREFIX, message)) def log_warning(self, message): - self.helper_logger.log_warning("SFF: {}".format(message)) + self.helper_logger.log_warning("{}{}".format(self.SFF_LOGGER_PREFIX, message)) def log_error(self, message): - self.helper_logger.log_error("SFF: {}".format(message)) + self.helper_logger.log_error("{}{}".format(self.SFF_LOGGER_PREFIX, message)) def get_active_lanes_for_lport(self, lport, subport_idx, num_lanes_per_lport, num_lanes_per_pport): """ @@ -286,8 +302,11 @@ def task_worker(self): ''' # CONFIG updates, and STATE_DB for insertion/removal, and host_tx_ready change - sel, asic_context = port_mapping.subscribe_port_update_event(self.namespaces, self, - self.PORT_TBL_MAP) + port_change_observer = PortChangeObserver(self.namespaces, + self.logger_for_port_update_event, + self.task_stopping_event, + self.on_port_update_event, + self.PORT_TBL_MAP) # This thread doesn't need to expilictly wait on PortInitDone and # PortConfigDone events, as xcvrd main thread waits on them before @@ -299,8 +318,7 @@ def task_worker(self): # operation in the DB tables. Upon process restart, messages will be # replayed for all fields, no need to explictly query the DB tables # here. - if not port_mapping.handle_port_update_event( - sel, asic_context, self.task_stopping_event, self, self.on_port_update_event): + if not port_change_observer.handle_port_update_event(): # In the case of no real update, go back to the beginning of the loop continue From 83f7bc3b2dfbbc359380f3b7fbc64127000a1c22 Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Sat, 24 Feb 2024 21:49:38 -0800 Subject: [PATCH 21/22] Remove unused import and class variable in port_event_helper.py --- sonic-xcvrd/xcvrd/xcvrd_utilities/port_event_helper.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_event_helper.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_event_helper.py index 0b969d0f6..3c0dbffa3 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/port_event_helper.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/port_event_helper.py @@ -1,4 +1,3 @@ -import threading from sonic_py_common import daemon_base from sonic_py_common import multi_asic from sonic_py_common.interface import backplane_prefix, inband_prefix, recirc_prefix @@ -13,12 +12,6 @@ class PortChangeEvent: - # thread_local_data will be used to hold class-scope thread-safe variables - # (i.e. global variable to the local thread), such as - # thread_local_data.PORT_EVENT. thread_local_data.PORT_EVENT dict will be - # initialized in subscribe_port_update_event, and used to store the latest - # port change event for each key, to avoid duplicate event processing. - thread_local_data = threading.local() PORT_ADD = 0 PORT_REMOVE = 1 PORT_SET = 2 From 178716c5a90e3cca67c6e126b11d014e95c5e26b Mon Sep 17 00:00:00 2001 From: Longyin Huang Date: Wed, 6 Mar 2024 19:26:12 -0800 Subject: [PATCH 22/22] Change to update 'lanes' and 'subport' fields independently --- sonic-xcvrd/xcvrd/sff_mgr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sonic-xcvrd/xcvrd/sff_mgr.py b/sonic-xcvrd/xcvrd/sff_mgr.py index e1e307024..0940b6b54 100644 --- a/sonic-xcvrd/xcvrd/sff_mgr.py +++ b/sonic-xcvrd/xcvrd/sff_mgr.py @@ -168,9 +168,10 @@ def on_port_update_event(self, port_change_event): if pport >= 0: self.port_dict[lport]['index'] = pport - if self.SUBPORT in port_change_event.port_dict and \ - self.LANES_LIST in port_change_event.port_dict: + if self.SUBPORT in port_change_event.port_dict: self.port_dict[lport][self.SUBPORT] = port_change_event.port_dict[self.SUBPORT] + + if self.LANES_LIST in port_change_event.port_dict: self.port_dict[lport][self.LANES_LIST] = \ port_change_event.port_dict[self.LANES_LIST].split(',')