diff --git a/ddtrace/contrib/internal/pytest/_atr_utils.py b/ddtrace/contrib/internal/pytest/_atr_utils.py
index b61f2925d25..3388fd35c9a 100644
--- a/ddtrace/contrib/internal/pytest/_atr_utils.py
+++ b/ddtrace/contrib/internal/pytest/_atr_utils.py
@@ -4,18 +4,14 @@
 import pytest
 
 from ddtrace.contrib.internal.pytest._retry_utils import RetryOutcomes
-from ddtrace.contrib.internal.pytest._retry_utils import RetryReason
-from ddtrace.contrib.internal.pytest._retry_utils import UserProperty
 from ddtrace.contrib.internal.pytest._retry_utils import _get_outcome_from_retry
 from ddtrace.contrib.internal.pytest._retry_utils import _get_retry_attempt_string
 from ddtrace.contrib.internal.pytest._retry_utils import set_retry_num
 from ddtrace.contrib.internal.pytest._types import _pytest_report_teststatus_return_type
 from ddtrace.contrib.internal.pytest._types import pytest_TestReport
 from ddtrace.contrib.internal.pytest._utils import _PYTEST_STATUS
-from ddtrace.contrib.internal.pytest._utils import TestPhase
 from ddtrace.contrib.internal.pytest._utils import _get_test_id_from_item
 from ddtrace.contrib.internal.pytest._utils import _TestOutcome
-from ddtrace.contrib.internal.pytest._utils import get_user_property
 from ddtrace.ext.test_visibility.api import TestStatus
 from ddtrace.internal.logger import get_logger
 from ddtrace.internal.test_visibility._internal_item_ids import InternalTestId
@@ -54,14 +50,11 @@ class _QUARANTINE_ATR_RETRY_OUTCOMES(_ATR_RETRY_OUTCOMES):
 def atr_handle_retries(
     test_id: InternalTestId,
     item: pytest.Item,
-    test_reports: t.Dict[str, pytest_TestReport],
+    when: str,
+    original_result: pytest_TestReport,
     test_outcome: _TestOutcome,
     is_quarantined: bool = False,
 ):
-    setup_report = test_reports.get(TestPhase.SETUP)
-    call_report = test_reports.get(TestPhase.CALL)
-    teardown_report = test_reports.get(TestPhase.TEARDOWN)
-
     if is_quarantined:
         retry_outcomes = _QUARANTINE_ATR_RETRY_OUTCOMES
         final_outcomes = _QUARANTINE_FINAL_OUTCOMES
@@ -77,14 +70,11 @@ def atr_handle_retries(
         XPASS=retry_outcomes.ATR_ATTEMPT_FAILED,
     )
 
-    item.ihook.pytest_runtest_logreport(report=setup_report)
-
     # Overwrite the original result to avoid double-counting when displaying totals in final summary
-    if call_report:
+    if when == "call":
         if test_outcome.status == TestStatus.FAIL:
-            call_report.outcome = outcomes.FAILED
-
-        item.ihook.pytest_runtest_logreport(report=call_report)
+            original_result.outcome = outcomes.FAILED
+        return
 
     atr_outcome = _atr_do_retries(item, outcomes)
     longrepr = InternalTest.stash_get(test_id, "failure_longrepr")
@@ -93,14 +83,19 @@ def atr_handle_retries(
         nodeid=item.nodeid,
         location=item.location,
         keywords={k: 1 for k in item.keywords},
-        when=TestPhase.CALL,
+        when="call",
         longrepr=longrepr,
         outcome=final_outcomes[atr_outcome],
-        user_properties=item.user_properties + [(UserProperty.RETRY_REASON, RetryReason.AUTO_TEST_RETRY)],
+        user_properties=item.user_properties + [("dd_retry_reason", "auto_test_retry")],
     )
     item.ihook.pytest_runtest_logreport(report=final_report)
-    item.ihook.pytest_runtest_logreport(report=teardown_report)
+
+
+def get_user_property(report, key, default=None):
+    for k, v in report.user_properties:
+        if k == key:
+            return v
+    return default
 
 
 def atr_get_failed_reports(terminalreporter: _pytest.terminal.TerminalReporter) -> t.List[pytest_TestReport]:
@@ -132,12 +127,12 @@ def _atr_write_report_for_status(
     markedup_strings: t.List[str],
     color: str,
     delete_reports: bool = True,
-    retry_reason: str = RetryReason.AUTO_TEST_RETRY,
+    retry_reason: str = "auto_test_retry",
 ):
     reports = [
         report
         for report in terminalreporter.getreports(report_outcome)
-        if get_user_property(report, UserProperty.RETRY_REASON) == retry_reason
+        if get_user_property(report, "dd_retry_reason") == retry_reason
     ]
     markup_kwargs = {color: True}
     if reports:
diff --git a/ddtrace/contrib/internal/pytest/_attempt_to_fix.py b/ddtrace/contrib/internal/pytest/_attempt_to_fix.py
index a1ac54d4ecb..9eae7a1779c 100644
--- a/ddtrace/contrib/internal/pytest/_attempt_to_fix.py
+++ b/ddtrace/contrib/internal/pytest/_attempt_to_fix.py
@@ -10,7 +10,6 @@
 from ddtrace.contrib.internal.pytest._retry_utils import set_retry_num
 from ddtrace.contrib.internal.pytest._types import _pytest_report_teststatus_return_type
 from ddtrace.contrib.internal.pytest._types import pytest_TestReport
-from ddtrace.contrib.internal.pytest._utils import TestPhase
 from ddtrace.contrib.internal.pytest._utils import _get_test_id_from_item
 from ddtrace.contrib.internal.pytest._utils import _TestOutcome
 from ddtrace.contrib.internal.pytest.constants import USER_PROPERTY_QUARANTINED
@@ -42,14 +41,10 @@ class _RETRY_OUTCOMES:
 def attempt_to_fix_handle_retries(
     test_id: InternalTestId,
     item: pytest.Item,
-    test_reports: t.Dict[str, pytest_TestReport],
+    when: str,
+    original_result: pytest_TestReport,
     test_outcome: _TestOutcome,
-    is_quarantined: bool = False,
 ):
-    setup_report = test_reports.get(TestPhase.SETUP)
-    call_report = test_reports.get(TestPhase.CALL)
-    teardown_report = test_reports.get(TestPhase.TEARDOWN)
-
     retry_outcomes = _RETRY_OUTCOMES
     final_outcomes = _FINAL_OUTCOMES
 
@@ -61,16 +56,13 @@ def attempt_to_fix_handle_retries(
         XPASS=retry_outcomes.ATTEMPT_FAILED,
     )
 
-    item.ihook.pytest_runtest_logreport(report=setup_report)
-
     # Overwrite the original result to avoid double-counting when displaying totals in final summary
-    if call_report:
+    if when == "call":
         if test_outcome.status == TestStatus.FAIL:
-            call_report.outcome = outcomes.FAILED
+            original_result.outcome = outcomes.FAILED
         elif test_outcome.status == TestStatus.SKIP:
-            call_report.outcome = outcomes.SKIPPED
-
-        item.ihook.pytest_runtest_logreport(report=call_report)
+            original_result.outcome = outcomes.SKIPPED
+        return
 
     retries_outcome = _do_retries(item, outcomes)
     longrepr = InternalTest.stash_get(test_id, "failure_longrepr")
@@ -78,16 +70,14 @@ def attempt_to_fix_handle_retries(
     final_report = RetryTestReport(
         nodeid=item.nodeid,
         location=item.location,
-        keywords={k: 1 for k in item.keywords},
-        when=TestPhase.CALL,
+        keywords=item.keywords,
+        when="call",
         longrepr=longrepr,
         outcome=final_outcomes[retries_outcome],
         user_properties=item.user_properties,
     )
     item.ihook.pytest_runtest_logreport(report=final_report)
 
-    item.ihook.pytest_runtest_logreport(report=teardown_report)
-
 
 def _do_retries(item: pytest.Item, outcomes: RetryOutcomes) -> TestStatus:
     test_id = _get_test_id_from_item(item)
diff --git a/ddtrace/contrib/internal/pytest/_efd_utils.py b/ddtrace/contrib/internal/pytest/_efd_utils.py
index 130304db484..2e48702efce 100644
--- a/ddtrace/contrib/internal/pytest/_efd_utils.py
+++ b/ddtrace/contrib/internal/pytest/_efd_utils.py
@@ -4,18 +4,15 @@
 import pytest
 
 from ddtrace.contrib.internal.pytest._retry_utils import RetryOutcomes
-from ddtrace.contrib.internal.pytest._retry_utils import RetryReason
-from ddtrace.contrib.internal.pytest._retry_utils import UserProperty
+from ddtrace.contrib.internal.pytest._retry_utils import RetryTestReport
 from ddtrace.contrib.internal.pytest._retry_utils import _get_outcome_from_retry
 from ddtrace.contrib.internal.pytest._retry_utils import _get_retry_attempt_string
 from ddtrace.contrib.internal.pytest._retry_utils import set_retry_num
 from ddtrace.contrib.internal.pytest._types import _pytest_report_teststatus_return_type
 from ddtrace.contrib.internal.pytest._types import pytest_TestReport
 from ddtrace.contrib.internal.pytest._utils import PYTEST_STATUS
-from ddtrace.contrib.internal.pytest._utils import TestPhase
 from ddtrace.contrib.internal.pytest._utils import _get_test_id_from_item
 from ddtrace.contrib.internal.pytest._utils import _TestOutcome
-from ddtrace.contrib.internal.pytest._utils import get_user_property
 from ddtrace.ext.test_visibility.api import TestStatus
 from ddtrace.internal.logger import get_logger
 from ddtrace.internal.test_visibility._efd_mixins import EFDTestStatus
@@ -40,37 +37,32 @@ class _EFD_RETRY_OUTCOMES:
 _EFD_FLAKY_OUTCOME = "flaky"
 
 _FINAL_OUTCOMES: t.Dict[EFDTestStatus, str] = {
-    EFDTestStatus.ALL_PASS: PYTEST_STATUS.PASSED,
-    EFDTestStatus.ALL_FAIL: PYTEST_STATUS.FAILED,
-    EFDTestStatus.ALL_SKIP: PYTEST_STATUS.SKIPPED,
-    EFDTestStatus.FLAKY: PYTEST_STATUS.PASSED,
+    EFDTestStatus.ALL_PASS: _EFD_RETRY_OUTCOMES.EFD_FINAL_PASSED,
+    EFDTestStatus.ALL_FAIL: _EFD_RETRY_OUTCOMES.EFD_FINAL_FAILED,
+    EFDTestStatus.ALL_SKIP: _EFD_RETRY_OUTCOMES.EFD_FINAL_SKIPPED,
+    EFDTestStatus.FLAKY: _EFD_RETRY_OUTCOMES.EFD_FINAL_FLAKY,
 }
 
 
 def efd_handle_retries(
     test_id: InternalTestId,
     item: pytest.Item,
-    test_reports: t.Dict[str, pytest_TestReport],
+    when: str,
+    original_result: pytest_TestReport,
     test_outcome: _TestOutcome,
-    is_quarantined: bool = False,
 ):
-    setup_report = test_reports.get(TestPhase.SETUP)
-    call_report = test_reports.get(TestPhase.CALL)
-    teardown_report = test_reports.get(TestPhase.TEARDOWN)
-
     # Overwrite the original result to avoid double-counting when displaying totals in final summary
-    if call_report:
+    if when == "call":
         if test_outcome.status == TestStatus.FAIL:
-            call_report.outcome = _EFD_RETRY_OUTCOMES.EFD_ATTEMPT_FAILED
+            original_result.outcome = _EFD_RETRY_OUTCOMES.EFD_ATTEMPT_FAILED
         elif test_outcome.status == TestStatus.PASS:
-            call_report.outcome = _EFD_RETRY_OUTCOMES.EFD_ATTEMPT_PASSED
+            original_result.outcome = _EFD_RETRY_OUTCOMES.EFD_ATTEMPT_PASSED
         elif test_outcome.status == TestStatus.SKIP:
-            call_report.outcome = _EFD_RETRY_OUTCOMES.EFD_ATTEMPT_SKIPPED
-
+            original_result.outcome = _EFD_RETRY_OUTCOMES.EFD_ATTEMPT_SKIPPED
+        return
 
     if InternalTest.get_tag(test_id, "_dd.ci.efd_setup_failed"):
         log.debug("Test item %s failed during setup, will not be retried for Early Flake Detection")
         return
-
     if InternalTest.get_tag(test_id, "_dd.ci.efd_teardown_failed"):
         # NOTE: tests that passed their call but failed during teardown are not retried
         log.debug("Test item %s failed during teardown, will not be retried for Early Flake Detection")
@@ -78,38 +70,34 @@ def efd_handle_retries(
     # If the test skipped (can happen either in setup or call depending on mark vs calling .skip()), we set the original
     # status as skipped and then continue handling retries because we may not return
-    if test_outcome.status == TestStatus.SKIP:
-        if call_report:
-            call_report.outcome = _EFD_RETRY_OUTCOMES.EFD_ATTEMPT_SKIPPED
-        else:
-            # When skip happens during setup, we don't have a call report.
-            setup_report.outcome = _EFD_RETRY_OUTCOMES.EFD_ATTEMPT_SKIPPED
-
-    item.ihook.pytest_runtest_logreport(report=setup_report)
-
-    if call_report:
-        item.ihook.pytest_runtest_logreport(report=call_report)
+    if test_outcome.status == TestStatus.SKIP and when in ["setup", "call"]:
+        original_result.outcome = _EFD_RETRY_OUTCOMES.EFD_ATTEMPT_SKIPPED
+        # We don't return for when == call when skip happens during setup, so we need to log it and make sure the status
+        # of the test is set
+        if when == "setup":
+            item.ihook.pytest_runtest_logreport(
+                nodeid=item.nodeid,
+                location=item.location,
+                keywords=item.keywords,
+                when="setup",
+                longrepr=None,
+                outcome=_EFD_RETRY_OUTCOMES.EFD_ATTEMPT_SKIPPED,
+            )
+            InternalTest.mark_skip(test_id)
 
     efd_outcome = _efd_do_retries(item)
     longrepr = InternalTest.stash_get(test_id, "failure_longrepr")
 
-    final_report = pytest_TestReport(
+    final_report = RetryTestReport(
         nodeid=item.nodeid,
         location=item.location,
-        keywords={k: 1 for k in item.keywords},
-        when=TestPhase.CALL,
+        keywords=item.keywords,
+        when="call",
         longrepr=longrepr,
         outcome=_FINAL_OUTCOMES[efd_outcome],
-        user_properties=item.user_properties
-        + [
-            (UserProperty.RETRY_REASON, RetryReason.EARLY_FLAKE_DETECTION),
-            (UserProperty.RETRY_FINAL_OUTCOME, efd_outcome.value),
-        ],
     )
     item.ihook.pytest_runtest_logreport(report=final_report)
 
-    item.ihook.pytest_runtest_logreport(report=teardown_report)
-
 
 def efd_get_failed_reports(terminalreporter: _pytest.terminal.TerminalReporter) -> t.List[pytest_TestReport]:
     return terminalreporter.getreports(_EFD_RETRY_OUTCOMES.EFD_ATTEMPT_FAILED)
@@ -337,17 +325,14 @@ def efd_get_teststatus(report: pytest_TestReport) -> _pytest_report_teststatus_r
             "s",
             (f"EFD RETRY {_get_retry_attempt_string(report.nodeid)}SKIPPED", {"yellow": True}),
         )
-
-    if get_user_property(report, UserProperty.RETRY_REASON) == RetryReason.EARLY_FLAKE_DETECTION:
-        efd_outcome = get_user_property(report, UserProperty.RETRY_FINAL_OUTCOME)
-        if efd_outcome == "passed":
-            return (_EFD_RETRY_OUTCOMES.EFD_FINAL_PASSED, ".", ("EFD FINAL STATUS: PASSED", {"green": True}))
-        if efd_outcome == "failed":
-            return (_EFD_RETRY_OUTCOMES.EFD_FINAL_FAILED, "F", ("EFD FINAL STATUS: FAILED", {"red": True}))
-        if efd_outcome == "skipped":
-            return (_EFD_RETRY_OUTCOMES.EFD_FINAL_SKIPPED, "S", ("EFD FINAL STATUS: SKIPPED", {"yellow": True}))
-        if efd_outcome == "flaky":
-            # Flaky tests are the only one that have a pretty string because they are intended to be displayed in the
-            # final count of terminal summary
-            return (_EFD_FLAKY_OUTCOME, "K", ("EFD FINAL STATUS: FLAKY", {"yellow": True}))
+    if report.outcome == _EFD_RETRY_OUTCOMES.EFD_FINAL_PASSED:
+        return (_EFD_RETRY_OUTCOMES.EFD_FINAL_PASSED, ".", ("EFD FINAL STATUS: PASSED", {"green": True}))
+    if report.outcome == _EFD_RETRY_OUTCOMES.EFD_FINAL_FAILED:
+        return (_EFD_RETRY_OUTCOMES.EFD_FINAL_FAILED, "F", ("EFD FINAL STATUS: FAILED", {"red": True}))
+    if report.outcome == _EFD_RETRY_OUTCOMES.EFD_FINAL_SKIPPED:
+        return (_EFD_RETRY_OUTCOMES.EFD_FINAL_SKIPPED, "S", ("EFD FINAL STATUS: SKIPPED", {"yellow": True}))
+    if report.outcome == _EFD_RETRY_OUTCOMES.EFD_FINAL_FLAKY:
+        # Flaky tests are the only one that have a pretty string because they are intended to be displayed in the final
+        # count of terminal summary
+        return (_EFD_FLAKY_OUTCOME, "K", ("EFD FINAL STATUS: FLAKY", {"yellow": True}))
     return None
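# --- Editor's illustrative sketch (not part of the patch) -------------------
# After the _efd_utils.py hunk above, the EFD final status travels on
# report.outcome itself (one of the _EFD_RETRY_OUTCOMES.EFD_FINAL_* strings)
# instead of on user_properties, and efd_get_teststatus() maps it back to a
# terminal label. The constant values below are placeholders (only the
# attribute names appear in the diff); describe_final_outcome() is a
# hypothetical stand-in for that mapping, not ddtrace code.
class _EFD_RETRY_OUTCOMES:
    EFD_FINAL_PASSED = "dd_efd_final_passed"
    EFD_FINAL_FAILED = "dd_efd_final_failed"
    EFD_FINAL_SKIPPED = "dd_efd_final_skipped"
    EFD_FINAL_FLAKY = "dd_efd_final_flaky"


def describe_final_outcome(outcome):
    mapping = {
        _EFD_RETRY_OUTCOMES.EFD_FINAL_PASSED: "EFD FINAL STATUS: PASSED",
        _EFD_RETRY_OUTCOMES.EFD_FINAL_FAILED: "EFD FINAL STATUS: FAILED",
        _EFD_RETRY_OUTCOMES.EFD_FINAL_SKIPPED: "EFD FINAL STATUS: SKIPPED",
        _EFD_RETRY_OUTCOMES.EFD_FINAL_FLAKY: "EFD FINAL STATUS: FLAKY",
    }
    return mapping.get(outcome, "not an EFD final report")


assert describe_final_outcome(_EFD_RETRY_OUTCOMES.EFD_FINAL_FLAKY) == "EFD FINAL STATUS: FLAKY"
# ----------------------------------------------------------------------------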
diff --git a/ddtrace/contrib/internal/pytest/_plugin_v2.py b/ddtrace/contrib/internal/pytest/_plugin_v2.py
index 2ede6a62803..eb415d08667 100644
--- a/ddtrace/contrib/internal/pytest/_plugin_v2.py
+++ b/ddtrace/contrib/internal/pytest/_plugin_v2.py
@@ -3,7 +3,6 @@
 import re
 import typing as t
 
-from _pytest.runner import runtestprotocol
 import pytest
 
 from ddtrace import DDTraceDeprecationWarning
@@ -16,13 +15,13 @@
 from ddtrace.contrib.internal.coverage.utils import _is_coverage_invoked_by_coverage_run
 from ddtrace.contrib.internal.coverage.utils import _is_coverage_patched
 from ddtrace.contrib.internal.pytest._benchmark_utils import _set_benchmark_data_from_item
+from ddtrace.contrib.internal.pytest._plugin_v1 import _extract_reason
 from ddtrace.contrib.internal.pytest._plugin_v1 import _is_pytest_cov_enabled
 from ddtrace.contrib.internal.pytest._types import _pytest_report_teststatus_return_type
 from ddtrace.contrib.internal.pytest._types import pytest_CallInfo
 from ddtrace.contrib.internal.pytest._types import pytest_Config
 from ddtrace.contrib.internal.pytest._types import pytest_TestReport
 from ddtrace.contrib.internal.pytest._utils import PYTEST_STATUS
-from ddtrace.contrib.internal.pytest._utils import TestPhase
 from ddtrace.contrib.internal.pytest._utils import _get_module_path_from_item
 from ddtrace.contrib.internal.pytest._utils import _get_names_from_item
 from ddtrace.contrib.internal.pytest._utils import _get_session_command
@@ -38,7 +37,6 @@
 from ddtrace.contrib.internal.pytest._utils import _pytest_version_supports_itr
 from ddtrace.contrib.internal.pytest._utils import _pytest_version_supports_retries
 from ddtrace.contrib.internal.pytest._utils import _TestOutcome
-from ddtrace.contrib.internal.pytest._utils import excinfo_by_report
 from ddtrace.contrib.internal.pytest.constants import FRAMEWORK
 from ddtrace.contrib.internal.pytest.constants import USER_PROPERTY_QUARANTINED
 from ddtrace.contrib.internal.pytest.constants import XFAIL_REASON
@@ -418,8 +416,9 @@ def _pytest_runtest_protocol_post_yield(item, nextitem, coverage_collector):
     InternalTestModule.finish(module_id)
 
 
-@pytest.hookimpl(tryfirst=True, hookwrapper=True, specname="pytest_runtest_protocol")
-def pytest_runtest_protocol_wrapper(item, nextitem) -> None:
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_protocol(item, nextitem) -> None:
+    """Discovers tests, and starts tests, suites, and modules, then handles coverage data collection"""
     if not is_test_visibility_enabled():
         yield
         return
@@ -429,94 +428,20 @@ def pytest_runtest_protocol_wrapper(item, nextitem) -> None:
     except Exception:  # noqa: E722
         log.debug("encountered error during pre-test", exc_info=True)
 
+    # Yield control back to pytest to run the test
     yield
 
     try:
-        _pytest_runtest_protocol_post_yield(item, nextitem, coverage_collector)
+        return _pytest_runtest_protocol_post_yield(item, nextitem, coverage_collector)
     except Exception:  # noqa: E722
         log.debug("encountered error during post-test", exc_info=True)
-
-
-@pytest.hookimpl(specname="pytest_runtest_protocol")
-def pytest_runtest_protocol(item, nextitem) -> None:
-    if not is_test_visibility_enabled():
         return
-
-    item.ihook.pytest_runtest_logstart(nodeid=item.nodeid, location=item.location)
-    reports = runtestprotocol(item, nextitem=nextitem, log=False)
-    test_outcome = _process_reports(item, reports)
-
-    reports_dict = {report.when: report for report in reports}
-
-    test_id = _get_test_id_from_item(item)
-    is_quarantined = InternalTest.is_quarantined_test(test_id)
-    is_disabled = InternalTest.is_disabled_test(test_id)
-    is_attempt_to_fix = InternalTest.is_attempt_to_fix(test_id)
-    setup_or_teardown_failed = False
-
-    if not InternalTest.is_finished(test_id):
-        InternalTest.finish(test_id, test_outcome.status, test_outcome.skip_reason, test_outcome.exc_info)
-
-    for report in reports:
-        if report.failed and report.when in (TestPhase.SETUP, TestPhase.TEARDOWN):
-            setup_or_teardown_failed = True
-
-        if report.when == TestPhase.CALL or "failed" in report.outcome:
-            if is_quarantined or is_disabled:
-                # Ensure test doesn't count as failed for pytest's exit status logic
-                # (see ).
-                report.outcome = OUTCOME_QUARANTINED
-
-        if report.failed or report.skipped:
-            InternalTest.stash_set(test_id, "failure_longrepr", report.longrepr)
-
-    retry_handler = None
-
-    if setup_or_teardown_failed:
-        # ATR and EFD retry tests only if their teardown succeeded to ensure the best chance the retry will succeed.
-        log.debug("Test %s failed during setup or teardown, skipping retries", test_id)
-    elif is_attempt_to_fix and _pytest_version_supports_attempt_to_fix():
-        retry_handler = attempt_to_fix_handle_retries
-    elif InternalTestSession.efd_enabled() and InternalTest.efd_should_retry(test_id):
-        retry_handler = efd_handle_retries
-    elif InternalTestSession.atr_is_enabled() and InternalTest.atr_should_retry(test_id):
-        retry_handler = atr_handle_retries
-
-    if retry_handler:
-        # Retry handler is responsible for logging the test reports.
-        retry_handler(
-            test_id=test_id,
-            item=item,
-            test_reports=reports_dict,
-            test_outcome=test_outcome,
-            is_quarantined=is_quarantined,
-        )
-    else:
-        # If no retry handler, we log the reports ourselves.
-        for report in reports:
-            item.ihook.pytest_runtest_logreport(report=report)
-
-    item.ihook.pytest_runtest_logfinish(nodeid=item.nodeid, location=item.location)
-
-    return True  # Do not run pytest's internal `pytest_runtest_protocol`.
-
-
-def _process_reports(item, reports) -> _TestOutcome:
-    final_outcome = None
-    for report in reports:
-        outcome = _process_result(item, report)
-        if final_outcome is None or final_outcome.status is None:
-            final_outcome = outcome
-        if final_outcome.status is not None:
-            return final_outcome
-    return final_outcome
-
 
-def _process_result(item, result) -> _TestOutcome:
+def _process_result(item, call, result) -> _TestOutcome:
     test_id = _get_test_id_from_item(item)
-    report_excinfo = excinfo_by_report.get(result)
-    has_exception = report_excinfo is not None
+    has_exception = call.excinfo is not None
 
     # In cases where a test was marked as XFAIL, the reason is only available during when call.when == "call", so we
     # add it as a tag immediately:
@@ -531,7 +456,7 @@ def _process_result(item, result) -> _TestOutcome:
     # - the test passed with xfail
     # - we are tearing down the test
     # DEV NOTE: some skip scenarios (eg: skipif) have an exception during setup
-    if result.when != TestPhase.TEARDOWN and not (has_exception or result.failed):
+    if call.when != "teardown" and not (has_exception or result.failed):
         return _TestOutcome()
 
     xfail = hasattr(result, "wasxfail") or "xfail" in result.keywords
@@ -553,7 +478,7 @@ def _process_result(item, result) -> _TestOutcome:
             InternalTest.set_tag(test_id, XFAIL_REASON, getattr(result, "wasxfail", "XFail"))
             return _TestOutcome(TestStatus.PASS)
 
-        return _TestOutcome(TestStatus.SKIP, report_excinfo.value if report_excinfo else None)
+        return _TestOutcome(TestStatus.SKIP, _extract_reason(call))
 
     if result.passed:
         if xfail and not has_skip_keyword and not item.config.option.runxfail:
@@ -572,12 +497,12 @@ def _process_result(item, result) -> _TestOutcome:
         return _TestOutcome(TestStatus.FAIL)
 
     # NOTE: for ATR and EFD purposes, we need to know if the test failed during setup or teardown.
-    if result.when == TestPhase.SETUP and result.failed:
+    if call.when == "setup" and result.failed:
         InternalTest.stash_set(test_id, "setup_failed", True)
-    elif result.when == TestPhase.TEARDOWN and result.failed:
+    elif call.when == "teardown" and result.failed:
        InternalTest.stash_set(test_id, "teardown_failed", True)
 
-    exc_info = TestExcInfo(report_excinfo.type, report_excinfo.value, report_excinfo.tb) if report_excinfo else None
+    exc_info = TestExcInfo(call.excinfo.type, call.excinfo.value, call.excinfo.tb) if call.excinfo else None
 
     return _TestOutcome(status=TestStatus.FAIL, exc_info=exc_info)
 
@@ -588,17 +513,48 @@ def _pytest_runtest_makereport(item: pytest.Item, call: pytest_CallInfo, outcome
         return
 
     original_result = outcome.get_result()
-    test_outcome = _process_result(item, original_result)
+
+    test_id = _get_test_id_from_item(item)
+
+    is_quarantined = InternalTest.is_quarantined_test(test_id)
+    is_disabled = InternalTest.is_disabled_test(test_id)
+    is_attempt_to_fix = InternalTest.is_attempt_to_fix(test_id)
+
+    test_outcome = _process_result(item, call, original_result)
 
     # A None value for test_outcome.status implies the test has not finished yet
     # Only continue to finishing the test if the test has finished, or if tearing down the test
-    if test_outcome.status is None and call.when != TestPhase.TEARDOWN:
+    if test_outcome.status is None and call.when != "teardown":
         return
 
     # Support for pytest-benchmark plugin
     if item.config.pluginmanager.hasplugin("benchmark"):
         _set_benchmark_data_from_item(item)
 
+    # Record a result if we haven't already recorded it:
+    if not InternalTest.is_finished(test_id):
+        InternalTest.finish(test_id, test_outcome.status, test_outcome.skip_reason, test_outcome.exc_info)
+
+    if original_result.failed and (is_quarantined or is_disabled):
+        # Ensure test doesn't count as failed for pytest's exit status logic
+        # (see ).
+        original_result.outcome = OUTCOME_QUARANTINED
+
+    if original_result.failed or original_result.skipped:
+        InternalTest.stash_set(test_id, "failure_longrepr", original_result.longrepr)
+
+    # ATR and EFD retry tests only if their teardown succeeded to ensure the best chance the retry will succeed
+    # NOTE: this mutates the original result's outcome
+    if InternalTest.stash_get(test_id, "setup_failed") or InternalTest.stash_get(test_id, "teardown_failed"):
+        log.debug("Test %s failed during setup or teardown, skipping retries", test_id)
+        return
+    if is_attempt_to_fix and _pytest_version_supports_attempt_to_fix():
+        return attempt_to_fix_handle_retries(test_id, item, call.when, original_result, test_outcome)
+    if InternalTestSession.efd_enabled() and InternalTest.efd_should_retry(test_id):
+        return efd_handle_retries(test_id, item, call.when, original_result, test_outcome)
+    if InternalTestSession.atr_is_enabled() and InternalTest.atr_should_retry(test_id):
+        return atr_handle_retries(test_id, item, call.when, original_result, test_outcome, is_quarantined)
+
 
 @pytest.hookimpl(hookwrapper=True)
 def pytest_runtest_makereport(item: pytest.Item, call: pytest_CallInfo) -> None:
@@ -606,10 +562,6 @@ def pytest_runtest_makereport(item: pytest.Item, call: pytest_CallInfo) -> None:
     outcome: pytest_TestReport
     outcome = yield
 
-    # DEV: Make excinfo available for later use, when we don't have the `call` object anymore.
-    # We cannot stash it directly into the report because pytest-xdist fails to serialize the report if we do that.
-    excinfo_by_report[outcome.get_result()] = call.excinfo
-
     if not is_test_visibility_enabled():
         return
 
@@ -761,7 +713,7 @@ def pytest_report_teststatus(
     user_properties = getattr(report, "user_properties", [])
     is_quarantined = USER_PROPERTY_QUARANTINED in user_properties
     if is_quarantined:
-        if report.when == TestPhase.TEARDOWN:
+        if report.when == "teardown":
             return (OUTCOME_QUARANTINED, "q", ("QUARANTINED", {"blue": True}))
         else:
             # Don't show anything for setup and call of quarantined tests, regardless of
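# --- Editor's illustrative sketch (not part of the patch) -------------------
# The _plugin_v2.py changes above move all retry handling into the
# report-building hook instead of a custom runtest protocol. The bare pattern
# being relied on, reduced to a standalone conftest.py-style hookwrapper
# (the user-property name below is a placeholder, not a ddtrace identifier):
import pytest


@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield                # let pytest build the TestReport first
    report = outcome.get_result()  # report for the current phase (call.when)
    if call.when == "call" and report.failed:
        # a retry handler would inspect/overwrite report.outcome here
        item.user_properties.append(("example_saw_failure", True))
# ----------------------------------------------------------------------------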
diff --git a/ddtrace/contrib/internal/pytest/_retry_utils.py b/ddtrace/contrib/internal/pytest/_retry_utils.py
index 5d585392bda..f58cc69b245 100644
--- a/ddtrace/contrib/internal/pytest/_retry_utils.py
+++ b/ddtrace/contrib/internal/pytest/_retry_utils.py
@@ -2,29 +2,20 @@
 from dataclasses import dataclass
 import typing as t
 
-from _pytest.runner import runtestprotocol
+import _pytest
+from _pytest.logging import caplog_handler_key
+from _pytest.logging import caplog_records_key
+from _pytest.runner import CallInfo
 import pytest
 
 from ddtrace.contrib.internal.pytest._types import pytest_TestReport
-from ddtrace.contrib.internal.pytest._utils import TestPhase
+from ddtrace.contrib.internal.pytest._types import tmppath_result_key
 from ddtrace.contrib.internal.pytest._utils import _TestOutcome
-from ddtrace.contrib.internal.pytest._utils import excinfo_by_report
 from ddtrace.ext.test_visibility.api import TestExcInfo
 from ddtrace.ext.test_visibility.api import TestStatus
 from ddtrace.internal import core
 
 
-class UserProperty:
-    RETRY_REASON = "dd_retry_reason"
-    RETRY_FINAL_OUTCOME = "dd_retry_final_outcome"
-
-
-class RetryReason:
-    EARLY_FLAKE_DETECTION = "early_flake_detection"
-    AUTO_TEST_RETRY = "auto_test_retry"
-    ATTEMPT_TO_FIX = "attempt_to_fix"
-
-
 @dataclass(frozen=True)
 class RetryOutcomes:
     PASSED: str
@@ -59,32 +50,85 @@ def _get_outcome_from_retry(
     _outcome_skip_reason: t.Optional[str] = None
     _outcome_exc_info: t.Optional[TestExcInfo] = None
 
-    item.ihook.pytest_runtest_logstart(nodeid=item.nodeid, location=item.location)
-    reports = runtestprotocol(item, nextitem=None, log=False)
+    # _initrequest() needs to be called first because the test has already executed once
+    item._initrequest()
+
+    # Reset output capture across retries.
+    item._report_sections = []
 
-    if any(report.failed for report in reports):
+    # Setup
+    setup_call, setup_report = _retry_run_when(item, "setup", outcomes)
+    if setup_report.outcome == outcomes.FAILED:
         _outcome_status = TestStatus.FAIL
-    elif any(report.skipped for report in reports):
+        if setup_call.excinfo is not None:
+            _outcome_exc_info = TestExcInfo(setup_call.excinfo.type, setup_call.excinfo.value, setup_call.excinfo.tb)
+        item.stash[caplog_records_key] = {}
+        item.stash[caplog_handler_key] = {}
+        if tmppath_result_key is not None:
+            item.stash[tmppath_result_key] = {}
+    if setup_report.outcome == outcomes.SKIPPED:
         _outcome_status = TestStatus.SKIP
-    else:
-        _outcome_status = TestStatus.PASS
-    for report in reports:
-        if report.failed:
-            report.outcome = outcomes.FAILED
-            report_excinfo = excinfo_by_report.get(report)
-            _outcome_exc_info = TestExcInfo(report_excinfo.type, report_excinfo.value, report_excinfo.tb)
-        elif report.skipped:
-            report.outcome = outcomes.SKIPPED
-        else:
-            report.outcome = outcomes.PASSED
+    # Call
+    if setup_report.outcome == outcomes.PASSED:
+        call_call, call_report = _retry_run_when(item, "call", outcomes)
+        if call_report.outcome == outcomes.FAILED:
+            _outcome_status = TestStatus.FAIL
+            if call_call.excinfo is not None:
+                _outcome_exc_info = TestExcInfo(call_call.excinfo.type, call_call.excinfo.value, call_call.excinfo.tb)
+            item.stash[caplog_records_key] = {}
+            item.stash[caplog_handler_key] = {}
+            if tmppath_result_key is not None:
+                item.stash[tmppath_result_key] = {}
+        elif call_report.outcome == outcomes.SKIPPED:
+            _outcome_status = TestStatus.SKIP
+        elif call_report.outcome == outcomes.PASSED:
+            _outcome_status = TestStatus.PASS
+    # Teardown does not happen if setup skipped
+    if not setup_report.skipped:
+        teardown_call, teardown_report = _retry_run_when(item, "teardown", outcomes)
+        # Only override the outcome if the teardown failed, otherwise defer to either setup or call outcome
+        if teardown_report.outcome == outcomes.FAILED:
+            _outcome_status = TestStatus.FAIL
+            if teardown_call.excinfo is not None:
+                _outcome_exc_info = TestExcInfo(
+                    teardown_call.excinfo.type, teardown_call.excinfo.value, teardown_call.excinfo.tb
+                )
+            item.stash[caplog_records_key] = {}
+            item.stash[caplog_handler_key] = {}
+            if tmppath_result_key is not None:
+                item.stash[tmppath_result_key] = {}
+
+    item._initrequest()
 
-        if report.when == TestPhase.CALL or "passed" not in report.outcome:
-            item.ihook.pytest_runtest_logreport(report=report)
+    return _TestOutcome(status=_outcome_status, skip_reason=_outcome_skip_reason, exc_info=_outcome_exc_info)
 
-    item.ihook.pytest_runtest_logfinish(nodeid=item.nodeid, location=item.location)
-    return _TestOutcome(status=_outcome_status, skip_reason=_outcome_skip_reason, exc_info=_outcome_exc_info)
 
+def _retry_run_when(item, when, outcomes: RetryOutcomes) -> t.Tuple[CallInfo, _pytest.reports.TestReport]:
+    hooks = {
+        "setup": item.ihook.pytest_runtest_setup,
+        "call": item.ihook.pytest_runtest_call,
+        "teardown": item.ihook.pytest_runtest_teardown,
+    }
+    hook = hooks[when]
+    # NOTE: we use nextitem=item here to make sure that logs don't generate a new line
+    if when == "teardown":
+        call = CallInfo.from_call(
+            lambda: hook(item=item, nextitem=pytest.Class.from_parent(item.session, name="forced_teardown")), when=when
+        )
+    else:
+        call = CallInfo.from_call(lambda: hook(item=item), when=when)
+    report = item.ihook.pytest_runtest_makereport(item=item, call=call)
+    if report.outcome == "passed":
+        report.outcome = outcomes.PASSED
+    elif report.outcome == "failed" or report.outcome == "error":
+        report.outcome = outcomes.FAILED
+    elif report.outcome == "skipped":
+        report.outcome = outcomes.SKIPPED
+    # Only log for actual test calls, or failures
+    if when == "call" or "passed" not in report.outcome:
+        item.ihook.pytest_runtest_logreport(report=report)
+    return call, report
 
 
 class RetryTestReport(pytest_TestReport):
diff --git a/ddtrace/contrib/internal/pytest/_utils.py b/ddtrace/contrib/internal/pytest/_utils.py
index 420b30eee78..718561752e5 100644
--- a/ddtrace/contrib/internal/pytest/_utils.py
+++ b/ddtrace/contrib/internal/pytest/_utils.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 import re
 import typing as t
-import weakref
 
 import pytest
 
@@ -45,12 +44,6 @@ class _PYTEST_STATUS:
 PYTEST_STATUS = _PYTEST_STATUS()
 
 
-class TestPhase:
-    SETUP = "setup"
-    CALL = "call"
-    TEARDOWN = "teardown"
-
-
 @dataclass
 class TestNames:
     module: str
@@ -237,13 +230,3 @@ class _TestOutcome(t.NamedTuple):
     status: t.Optional[TestStatus] = None
     skip_reason: t.Optional[str] = None
     exc_info: t.Optional[TestExcInfo] = None
-
-
-def get_user_property(report, key, default=None):
-    for k, v in report.user_properties:
-        if k == key:
-            return v
-    return default
-
-
-excinfo_by_report = weakref.WeakKeyDictionary()
diff --git a/ddtrace/contrib/internal/pytest/plugin.py b/ddtrace/contrib/internal/pytest/plugin.py
index b629afdec83..0908878e0f0 100644
--- a/ddtrace/contrib/internal/pytest/plugin.py
+++ b/ddtrace/contrib/internal/pytest/plugin.py
@@ -113,7 +113,6 @@ def pytest_addoption(parser):
     from ddtrace.contrib.internal.pytest._plugin_v2 import pytest_report_teststatus  # noqa: F401
     from ddtrace.contrib.internal.pytest._plugin_v2 import pytest_runtest_makereport  # noqa: F401
     from ddtrace.contrib.internal.pytest._plugin_v2 import pytest_runtest_protocol  # noqa: F401
-    from ddtrace.contrib.internal.pytest._plugin_v2 import pytest_runtest_protocol_wrapper  # noqa: F401
     from ddtrace.contrib.internal.pytest._plugin_v2 import pytest_sessionfinish  # noqa: F401
     from ddtrace.contrib.internal.pytest._plugin_v2 import pytest_sessionstart  # noqa: F401
     from ddtrace.contrib.internal.pytest._plugin_v2 import pytest_terminal_summary  # noqa: F401