diff --git a/tests/bgp/test_startup_tsa_tsb_service.py b/tests/bgp/test_startup_tsa_tsb_service.py index d49a1694ae..0423cbdbd9 100644 --- a/tests/bgp/test_startup_tsa_tsb_service.py +++ b/tests/bgp/test_startup_tsa_tsb_service.py @@ -1,6 +1,7 @@ import logging import datetime import pexpect +import sys import pytest @@ -24,7 +25,8 @@ TS_MAINTENANCE = "System Mode: Maintenance" TS_INCONSISTENT = "System Mode: Not consistent" TS_NO_NEIGHBORS = "System Mode: No external neighbors" -COLD_REBOOT_CAUSE = 'cold' +COLD_REBOOT_CAUSE = "cold" +KERNEL_PANIC_REBOOT_CAUSE = "Kernel Panic" UNKNOWN_REBOOT_CAUSE = "Unknown" SUP_REBOOT_CAUSE = 'Reboot from Supervisor' SUP_HEARTBEAT_LOSS_CAUSE = 'Heartbeat with the Supervisor card lost' @@ -105,8 +107,12 @@ def get_tsa_tsb_service_uptime(duthost): service_status = duthost.shell("sudo systemctl status startup_tsa_tsb.service | grep 'Active'") for line in service_status["stdout_lines"]: if 'active' in line: - tmp_time = line.split('since')[1].strip().encode('utf-8') - act_time = tmp_time.split('UTC')[0].strip().encode('utf-8') + if sys.version_info.major < 3: + tmp_time = line.split('since')[1].strip().encode('utf-8') + act_time = tmp_time.split('UTC')[0].strip().encode('utf-8') + else: + tmp_time = line.split('since')[1].strip() + act_time = tmp_time.split('UTC')[0].strip() service_uptime = datetime.datetime.strptime(act_time[4:], '%Y-%m-%d %H:%M:%S') return service_uptime return service_uptime @@ -199,6 +205,12 @@ def exec_tsa_tsb_cmd_on_supervisor(duthosts, enum_supervisor_dut_hostname, creds except Exception as e: pytest.fail("Cannot connect to DUT {} host via SSH: {}".format(suphost.hostname, e)) +def check_traffic_shift_state(duthost, state): + if state != get_traffic_shift_state(duthost): + return False + else: + return True + @pytest.mark.disable_loganalyzer def test_tsa_tsb_service_with_dut_cold_reboot(duthosts, localhost, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, @@ -214,8 +226,8 @@ def test_tsa_tsb_service_with_dut_cold_reboot(duthosts, localhost, enum_rand_one pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) dut_nbrhosts = nbrhosts_to_dut(duthost, nbrhosts) # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), - "DUT is not in normal state") + if TS_NORMAL != get_traffic_shift_state(duthost): + pytest.skip("Test skipped since DUT is not in normal state") if not check_tsa_persistence_support(duthost): pytest.skip("TSA persistence not supported in the image") @@ -224,6 +236,8 @@ def test_tsa_tsb_service_with_dut_cold_reboot(duthosts, localhost, enum_rand_one orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) + up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") + # Reboot dut and wait for startup_tsa_tsb service to start logger.info("Cold reboot on node: %s", duthost.hostname) reboot(duthost, localhost, wait=240) @@ -241,7 +255,7 @@ def test_tsa_tsb_service_with_dut_cold_reboot(duthosts, localhost, enum_rand_one logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 240, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -253,6 +267,11 @@ def test_tsa_tsb_service_with_dut_cold_reboot(duthosts, localhost, enum_rand_one pytest_assert(wait_until(600, 20, 0, check_interface_status_of_up_ports, duthost), "Not all ports that are admin up on are operationally up") + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), + "All BGP sessions are not up. No point in continuing the test") + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, duthost, dut_nbrhosts, traffic_shift_community), "Failed to verify routes on nbr in TSA") @@ -283,8 +302,9 @@ def test_tsa_tsb_service_with_dut_cold_reboot(duthosts, localhost, enum_rand_one finally: # Verify DUT is in normal state. - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), + pytest_assert(wait_until(120, 20, 0, check_traffic_shift_state, duthost, TS_NORMAL), "DUT is not in normal state") + # Make sure the dut's reboot cause is as expected logger.info("Check reboot cause of the dut") reboot_cause = get_reboot_cause(duthost) @@ -307,8 +327,8 @@ def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand dut_nbrhosts = nbrhosts_to_dut(duthost, nbrhosts) dut_ip = duthost.mgmt_ip # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), - "DUT is not in normal state") + if TS_NORMAL != get_traffic_shift_state(duthost): + pytest.skip("Test skipped since DUT is not in normal state") if not check_tsa_persistence_support(duthost): pytest.skip("TSA persistence not supported in the image") @@ -317,6 +337,8 @@ def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) + up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") + # Our shell command is designed as 'nohup bash -c "sleep 5 && tail /dev/zero" &' because of: # * `tail /dev/zero` is used to run out of memory completely. # * Since `tail /dev/zero` will cause the DUT reboot, we need to run it in the background @@ -346,7 +368,7 @@ def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() logger.info("Time difference between dut up-time & tsa_tsb_service up-time is {}".format(int(time_diff))) - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 240, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Make sure BGP containers are running properly before verifying @@ -362,6 +384,11 @@ def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand pytest_assert(wait_until(600, 20, 0, check_interface_status_of_up_ports, duthost), "Not all ports that are admin up on are operationally up") + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), + "All BGP sessions are not up. No point in continuing the test") + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, duthost, dut_nbrhosts, traffic_shift_community), "Failed to verify routes on nbr in TSA") @@ -392,13 +419,18 @@ def test_tsa_tsb_service_with_dut_abnormal_reboot(duthosts, localhost, enum_rand finally: # Verify DUT is in normal state. - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), + pytest_assert(wait_until(120, 20, 0, check_traffic_shift_state, duthost, TS_NORMAL), "DUT is not in normal state") # Make sure the dut's reboot cause is as expected logger.info("Check reboot cause of the dut") + out = duthost.command('show kdump config') reboot_cause = get_reboot_cause(duthost) - pytest_assert(reboot_cause == UNKNOWN_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE)) + if "Enabled" not in out["stdout"]: + pytest_assert(reboot_cause == UNKNOWN_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE)) + else: + pytest_assert(reboot_cause == KERNEL_PANIC_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, KERNEL_PANIC_REBOOT_CAUSE)) @pytest.mark.disable_loganalyzer @@ -412,15 +444,17 @@ def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_s suphost = duthosts[enum_supervisor_dut_hostname] tsa_tsb_timer = dict() dut_nbrhosts = dict() + up_bgp_neighbors = dict() orig_v4_routes, orig_v6_routes = dict(), dict() for linecard in duthosts.frontend_nodes: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) if not tsa_tsb_timer[linecard]: pytest.skip("startup_tsa_tsb.service is not supported on the duts under {}".format(suphost.hostname)) dut_nbrhosts[linecard] = nbrhosts_to_dut(linecard, nbrhosts) # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), - "DUT is not in normal state") + if TS_NORMAL != get_traffic_shift_state(linecard): + pytest.skip("Test skipped since DUT is not in normal state") if not check_tsa_persistence_support(linecard): pytest.skip("TSA persistence not supported in the image") @@ -451,7 +485,7 @@ def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_s logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 240, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -463,6 +497,11 @@ def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_s pytest_assert(wait_until(600, 20, 0, check_interface_status_of_up_ports, linecard), "Not all ports that are admin up on are operationally up") + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + linecard.check_bgp_session_state_all_asics, up_bgp_neighbors[linecard], "established"), + "All BGP sessions are not up. No point in continuing the test") + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), "Failed to verify routes on nbr in TSA") @@ -498,8 +537,8 @@ def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_s finally: for linecard in duthosts.frontend_nodes: # Verify DUT is in normal state. - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), - "DUT {} is not in normal state".format(linecard)) + pytest_assert(wait_until(120, 20, 0, check_traffic_shift_state, linecard, TS_NORMAL), + "DUT {} is not in normal state".format(linecard)) # Make sure the dut's reboot cause is as expected logger.info("Check reboot cause of the dut {}".format(linecard)) reboot_cause = get_reboot_cause(linecard) @@ -512,7 +551,6 @@ def test_tsa_tsb_service_with_supervisor_cold_reboot(duthosts, localhost, enum_s pytest_assert(reboot_cause == COLD_REBOOT_CAUSE, "Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE)) - @pytest.mark.disable_loganalyzer def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, enum_supervisor_dut_hostname, ptfhost, nbrhosts, traffic_shift_community, tbinfo): @@ -525,15 +563,17 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en sup_ip = suphost.mgmt_ip tsa_tsb_timer = dict() dut_nbrhosts = dict() + up_bgp_neighbors = dict() orig_v4_routes, orig_v6_routes = dict(), dict() for linecard in duthosts.frontend_nodes: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) if not tsa_tsb_timer[linecard]: pytest.skip("startup_tsa_tsb.service is not supported on the duts under {}".format(suphost.hostname)) dut_nbrhosts[linecard] = nbrhosts_to_dut(linecard, nbrhosts) # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), - "DUT is not in normal state") + if TS_NORMAL != get_traffic_shift_state(linecard): + pytest.skip("Test skipped since DUT is not in normal state") if not check_tsa_persistence_support(linecard): pytest.skip("TSA persistence not supported in the image") @@ -581,7 +621,7 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 240, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -593,6 +633,11 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en pytest_assert(wait_until(600, 20, 0, check_interface_status_of_up_ports, linecard), "Not all ports that are admin up on are operationally up") + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + linecard.check_bgp_session_state_all_asics, up_bgp_neighbors[linecard], "established"), + "All BGP sessions are not up. No point in continuing the test") + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), "Failed to verify routes on nbr in TSA") @@ -628,8 +673,9 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en finally: for linecard in duthosts.frontend_nodes: # Verify DUT is in normal state. - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), - "DUT {} is not in normal state".format(linecard)) + pytest_assert(wait_until(120, 20, 0, check_traffic_shift_state, linecard, TS_NORMAL), + "DUT {} is not in normal state".format(linecard)) + # Make sure the dut's reboot cause is as expected logger.info("Check reboot cause of the dut {}".format(linecard)) reboot_cause = get_reboot_cause(linecard) @@ -638,9 +684,14 @@ def test_tsa_tsb_service_with_supervisor_abnormal_reboot(duthosts, localhost, en # Make sure the Supervisor's reboot cause is as expected logger.info("Check reboot cause of the supervisor") + out = suphost.command('show kdump config') reboot_cause = get_reboot_cause(suphost) - pytest_assert(reboot_cause == UNKNOWN_REBOOT_CAUSE, - "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE)) + if "Enabled" not in out["stdout"]: + pytest_assert(reboot_cause == UNKNOWN_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, UNKNOWN_REBOOT_CAUSE)) + else: + pytest_assert(reboot_cause == KERNEL_PANIC_REBOOT_CAUSE, + "Reboot cause {} did not match the trigger {}".format(reboot_cause, KERNEL_PANIC_REBOOT_CAUSE)) @pytest.mark.disable_loganalyzer @@ -659,8 +710,8 @@ def test_tsa_tsb_service_with_user_init_tsa(duthosts, localhost, enum_rand_one_p dut_nbrhosts = nbrhosts_to_dut(duthost, nbrhosts) orig_v4_routes, orig_v6_routes = {}, {} # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), - "DUT is not in normal state") + if TS_NORMAL != get_traffic_shift_state(duthost): + pytest.skip("Test skipped since DUT is not in normal state") if not check_tsa_persistence_support(duthost): pytest.skip("TSA persistence not supported in the image") @@ -669,6 +720,7 @@ def test_tsa_tsb_service_with_user_init_tsa(duthosts, localhost, enum_rand_one_p orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) + up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") # Issue TSA on DUT duthost.shell("TSA") duthost.shell('sudo config save -y') @@ -686,7 +738,7 @@ def test_tsa_tsb_service_with_user_init_tsa(duthosts, localhost, enum_rand_one_p logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 240, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Ensure startup_tsa_tsb service is in exited state after dut reboot @@ -702,6 +754,11 @@ def test_tsa_tsb_service_with_user_init_tsa(duthosts, localhost, enum_rand_one_p pytest_assert(wait_until(600, 20, 0, check_interface_status_of_up_ports, duthost), "Not all ports that are admin up on are operationally up") + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), + "All BGP sessions are not up. No point in continuing the test") + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, duthost, dut_nbrhosts, traffic_shift_community), "Failed to verify routes on nbr in TSA") @@ -717,10 +774,17 @@ def test_tsa_tsb_service_with_user_init_tsa(duthosts, localhost, enum_rand_one_p config_reload(duthost, safe_reload=True, check_intf_up_ports=True) # Verify DUT comes back to normal state after TSB. - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), "DUT is not in normal state") + pytest_assert(wait_until(120, 20, 0, check_traffic_shift_state, duthost, TS_NORMAL), + "DUT is not in normal state") # Wait until all routes are announced to neighbors cur_v4_routes = {} cur_v6_routes = {} + + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), + "All BGP sessions are not up. No point in continuing the test") + # Verify that all routes advertised to neighbor at the start of the test if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, duthost, nbrhosts, orig_v4_routes, cur_v4_routes, 4): @@ -756,8 +820,8 @@ def test_user_init_tsa_while_service_run_on_dut(duthosts, localhost, enum_rand_o pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) dut_nbrhosts = nbrhosts_to_dut(duthost, nbrhosts) # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), - "DUT is not in normal state") + if TS_NORMAL != get_traffic_shift_state(duthost): + pytest.skip("Test skipped since DUT is not in normal state") if not check_tsa_persistence_support(duthost): pytest.skip("TSA persistence not supported in the image") @@ -766,6 +830,8 @@ def test_user_init_tsa_while_service_run_on_dut(duthosts, localhost, enum_rand_o orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) + up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") + # Reboot dut and wait for startup_tsa_tsb service to start logger.info("Cold reboot on node: %s", duthost.hostname) reboot(duthost, localhost, wait=240) @@ -783,7 +849,7 @@ def test_user_init_tsa_while_service_run_on_dut(duthosts, localhost, enum_rand_o logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 240, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -807,6 +873,12 @@ def test_user_init_tsa_while_service_run_on_dut(duthosts, localhost, enum_rand_o pytest_assert(wait_until(600, 20, 0, check_interface_status_of_up_ports, duthost), "Not all ports that are admin up on are operationally up") + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), + "All BGP sessions are not up. No point in continuing the test") + + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, duthost, dut_nbrhosts, traffic_shift_community), "Failed to verify routes on nbr in TSA") @@ -821,7 +893,8 @@ def test_user_init_tsa_while_service_run_on_dut(duthosts, localhost, enum_rand_o duthost.shell('sudo config save -y') # Verify DUT comes back to normal state after TSB. - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), "DUT is not in normal state") + pytest_assert(wait_until(120, 20, 0, check_traffic_shift_state, duthost, TS_NORMAL), + "DUT is not in normal state") # Wait until all routes are announced to neighbors cur_v4_routes = {} cur_v6_routes = {} @@ -859,8 +932,8 @@ def test_user_init_tsb_while_service_run_on_dut(duthosts, localhost, enum_rand_o if not tsa_tsb_timer: pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), - "DUT is not in normal state") + if TS_NORMAL != get_traffic_shift_state(duthost): + pytest.skip("Test skipped since DUT is not in normal state") if not check_tsa_persistence_support(duthost): pytest.skip("TSA persistence not supported in the image") @@ -869,6 +942,8 @@ def test_user_init_tsb_while_service_run_on_dut(duthosts, localhost, enum_rand_o orig_v4_routes = parse_routes_on_neighbors(duthost, nbrhosts, 4) orig_v6_routes = parse_routes_on_neighbors(duthost, nbrhosts, 6) + up_bgp_neighbors = duthost.get_bgp_neighbors_per_asic("established") + # Reboot dut and wait for startup_tsa_tsb service to start logger.info("Cold reboot on node: %s", duthost.hostname) reboot(duthost, localhost, wait=240) @@ -886,7 +961,7 @@ def test_user_init_tsb_while_service_run_on_dut(duthosts, localhost, enum_rand_o logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 240, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -908,6 +983,11 @@ def test_user_init_tsb_while_service_run_on_dut(duthosts, localhost, enum_rand_o assert wait_until(300, 20, 2, duthost.critical_services_fully_started), \ "Not all critical services are fully started on {}".format(duthost.hostname) + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), + "All BGP sessions are not up. No point in continuing the test") + # Wait until all routes are announced to neighbors cur_v4_routes = {} cur_v6_routes = {} @@ -925,7 +1005,7 @@ def test_user_init_tsb_while_service_run_on_dut(duthosts, localhost, enum_rand_o finally: # Verify DUT is in normal state. - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), + pytest_assert(wait_until(120, 20, 0, check_traffic_shift_state, duthost, TS_NORMAL), "DUT is not in normal state") # Make sure the dut's reboot cause is as expected @@ -950,15 +1030,17 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, suphost = duthosts[enum_supervisor_dut_hostname] tsa_tsb_timer = dict() dut_nbrhosts = dict() + up_bgp_neighbors = dict() orig_v4_routes, orig_v6_routes = dict(), dict() for linecard in duthosts.frontend_nodes: + up_bgp_neighbors[linecard] = linecard.get_bgp_neighbors_per_asic("established") tsa_tsb_timer[linecard] = get_startup_tsb_timer(linecard) if not tsa_tsb_timer[linecard]: pytest.skip("startup_tsa_tsb.service is not supported on the duts under {}".format(suphost.hostname)) dut_nbrhosts[linecard] = nbrhosts_to_dut(linecard, nbrhosts) # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), - "DUT is not in normal state") + if TS_NORMAL != get_traffic_shift_state(linecard): + pytest.skip("Test skipped since DUT is not in normal state") if not check_tsa_persistence_support(linecard): pytest.skip("TSA persistence not supported in the image") @@ -989,7 +1071,7 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, logging.info('DUT {} up since {}'.format(linecard.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(linecard) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 240, "startup_tsa_tsb service started much later than the expected time after dut reboot") # Verify DUT is in maintenance state. @@ -999,6 +1081,11 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, logging.info("Wait until all critical processes are fully started") wait_critical_processes(linecard) + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + linecard.check_bgp_session_state_all_asics, up_bgp_neighbors[linecard], "established"), + "All BGP sessions are not up. No point in continuing the test") + pytest_assert(verify_only_loopback_routes_are_announced_to_neighs( duthosts, linecard, dut_nbrhosts[linecard], traffic_shift_community), "Failed to verify routes on nbr in TSA") @@ -1021,6 +1108,12 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, # Wait until all routes are announced to neighbors cur_v4_routes = {} cur_v6_routes = {} + + # verify sessions are established + pytest_assert(wait_until(600, 10, 0, + linecard.check_bgp_session_state_all_asics, up_bgp_neighbors[linecard], "established"), + "All BGP sessions are not up. No point in continuing the test") + # Verify that all routes advertised to neighbor at the start of the test if not wait_until(300, 3, 0, verify_current_routes_announced_to_neighs, linecard, dut_nbrhosts[linecard], orig_v4_routes[linecard], cur_v4_routes, 4): @@ -1040,8 +1133,9 @@ def test_user_init_tsb_on_sup_while_service_run_on_dut(duthosts, localhost, linecard.shell("TSB") linecard.shell('sudo config save -y') # Verify DUT is in normal state. - pytest_assert(TS_NORMAL == get_traffic_shift_state(linecard), - "DUT {} is not in normal state".format(linecard)) + pytest_assert(wait_until(120, 20, 0, check_traffic_shift_state, linecard, TS_NORMAL), + "DUT {} is not in normal state".format(linecard)) + # Make sure the dut's reboot cause is as expected logger.info("Check reboot cause of the dut {}".format(linecard)) reboot_cause = get_reboot_cause(linecard) @@ -1068,8 +1162,8 @@ def test_tsa_tsb_timer_efficiency(duthosts, localhost, enum_rand_one_per_hwsku_f if not tsa_tsb_timer: pytest.skip("startup_tsa_tsb.service is not supported on the {}".format(duthost.hostname)) # Ensure that the DUT is not in maintenance already before start of the test - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), - "DUT is not in normal state") + if TS_NORMAL != get_traffic_shift_state(duthost): + pytest.skip("Test skipped since DUT is not in normal state") if not check_tsa_persistence_support(duthost): pytest.skip("TSA persistence not supported in the image") @@ -1097,7 +1191,7 @@ def test_tsa_tsb_timer_efficiency(duthosts, localhost, enum_rand_one_per_hwsku_f logging.info('DUT {} up since {}'.format(duthost.hostname, dut_uptime)) service_uptime = get_tsa_tsb_service_uptime(duthost) time_diff = (service_uptime - dut_uptime).total_seconds() - pytest_assert(int(time_diff) < 120, + pytest_assert(int(time_diff) < 240, "startup_tsa_tsb service started much later than the expected time after dut reboot") logging.info("Wait until all critical services are fully started") @@ -1109,8 +1203,9 @@ def test_tsa_tsb_timer_efficiency(duthosts, localhost, enum_rand_one_per_hwsku_f pytest_assert(wait_until(600, 20, 0, check_interface_status_of_up_ports, duthost), "Not all ports that are admin up on are operationally up") - pytest_assert(wait_until(300, 10, 0, - duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established")) + pytest_assert(wait_until(600, 10, 0, + duthost.check_bgp_session_state_all_asics, up_bgp_neighbors, "established"), + "All BGP sessions are not up. No point in continuing the test") stability_check_time = datetime.datetime.now() time_to_stabilize = (stability_check_time - service_uptime).total_seconds() @@ -1151,8 +1246,9 @@ def test_tsa_tsb_timer_efficiency(duthosts, localhost, enum_rand_one_per_hwsku_f finally: # Verify DUT is in normal state. - pytest_assert(TS_NORMAL == get_traffic_shift_state(duthost), + pytest_assert(wait_until(120, 20, 0, check_traffic_shift_state, duthost, TS_NORMAL), "DUT is not in normal state") + # Make sure the dut's reboot cause is as expected logger.info("Check reboot cause of the dut") reboot_cause = get_reboot_cause(duthost) diff --git a/tests/common/reboot.py b/tests/common/reboot.py index cf28aebc02..d286acc463 100644 --- a/tests/common/reboot.py +++ b/tests/common/reboot.py @@ -55,15 +55,6 @@ "cause": "Power Loss", "test_reboot_cause_only": True }, - REBOOT_TYPE_COLD: { - "command": "reboot", - "timeout": 300, - "wait": 120, - # We are searching two types of reboot cause. - # This change relates to changes of PR #6130 in sonic-buildimage repository - "cause": r"'reboot'|Non-Hardware \(reboot|^reboot", - "test_reboot_cause_only": False - }, REBOOT_TYPE_SOFT: { "command": "soft-reboot", "timeout": 300, @@ -125,7 +116,7 @@ "timeout": 300, "wait": 120, # When linecards are rebooted due to supervisor cold reboot - "cause": "reboot from Supervisor", + "cause": r"^Reboot from Supervisor$|^reboot from Supervisor$", "test_reboot_cause_only": False }, REBOOT_TYPE_SUPERVISOR_HEARTBEAT_LOSS: { @@ -133,7 +124,16 @@ "timeout": 300, "wait": 120, # When linecards are rebooted due to supervisor crash/abnormal reboot - "cause": "Heartbeat", + "cause": r"Heartbeat|headless", + "test_reboot_cause_only": False + }, + REBOOT_TYPE_COLD: { + "command": "reboot", + "timeout": 300, + "wait": 120, + # We are searching two types of reboot cause. + # This change relates to changes of PR #6130 in sonic-buildimage repository + "cause": r"'reboot'|Non-Hardware \(reboot|^reboot", "test_reboot_cause_only": False } }