From 8a7648be0804f8fdf66d0b8c45fc482c7c03e275 Mon Sep 17 00:00:00 2001
From: Vasileios Karakasis <vkarak@gmail.com>
Date: Wed, 18 Sep 2024 23:09:04 +0200
Subject: [PATCH] Add new `--retries-threshold` option

---
 docs/manpage.rst                       | 11 +++++++++++
 reframe/frontend/cli.py                | 15 ++++++++++++++-
 reframe/frontend/executors/__init__.py |  7 +++++--
 unittests/test_policies.py             |  7 +++++++
 4 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/docs/manpage.rst b/docs/manpage.rst
index b33242b8a..3f1f13cdf 100644
--- a/docs/manpage.rst
+++ b/docs/manpage.rst
@@ -694,6 +694,17 @@ Options controlling ReFrame execution
    .. versionchanged:: 3.6.1
       Multiple report files are now accepted.
 
+
+.. option:: --retries-threshold=VALUE[%]
+
+   Skip retries (see :option:`--max-retries`) if the number of failed test cases exceeds the given threshold.
+
+   The threshold can be specified either as an absolute number of failures or, using the ``%`` character, as a percentage of the total number of test cases, e.g., ``--retries-threshold=30%``.
+   Note that in certain shells the ``%`` character may need to be escaped.
+
+   .. versionadded:: 4.7
+
+
 .. option:: -S, --setvar=[TEST.]VAR=VAL
 
    Set variable ``VAR`` in all tests or optionally only in test ``TEST`` to ``VAL``.
diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py
index 19d4ffec6..7aafa78dd 100644
--- a/reframe/frontend/cli.py
+++ b/reframe/frontend/cli.py
@@ -521,6 +521,11 @@ def main():
         metavar='REPORT',
         help='Restore a testing session from REPORT file'
     )
+    run_options.add_argument(
+        '--retries-threshold', action='store', default='1000%',
+        metavar='VALUE[%]',
+        help='Retry tests only if failures do not exceed threshold'
+    )
     run_options.add_argument(
         '-S', '--setvar', action='append', metavar='[TEST.]VAR=VAL',
         dest='vars', default=[],
@@ -1563,8 +1568,16 @@ def module_unuse(*paths):
                 f"{options.reruns}"
             )
 
+        # Parse retries threshold
+        if options.retries_threshold[-1] == '%':
+            ratio = int(options.retries_threshold[:-1]) / 100.
+            retries_threshold = int(len(testcases)*ratio)
+        else:
+            retries_threshold = int(options.retries_threshold)
+
         runner = Runner(exec_policy, printer, options.max_retries,
-                        options.maxfail, options.reruns, options.duration)
+                        options.maxfail, options.reruns, options.duration,
+                        retries_threshold)
         try:
             time_start = time.time()
             runner.runall(testcases, restored_cases)
diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py
index d96cbaeb7..0714b63b9 100644
--- a/reframe/frontend/executors/__init__.py
+++ b/reframe/frontend/executors/__init__.py
@@ -573,10 +573,12 @@ class Runner:
     _timeout = fields.TypedField(typ.Duration, type(None), allow_implicit=True)
 
     def __init__(self, policy, printer=None, max_retries=0,
-                 max_failures=sys.maxsize, reruns=0, timeout=None):
+                 max_failures=sys.maxsize, reruns=0, timeout=None,
+                 retries_threshold=sys.maxsize):
         self._policy = policy
         self._printer = printer or PrettyPrinter()
         self._max_retries = max_retries
+        self._retries_threshold = retries_threshold
         self._num_reruns = reruns
         self._timeout = timeout
         self._t_init = timeout
@@ -620,7 +622,8 @@ def runall(self, testcases, restored_cases=None):
                 self._policy.set_expiry(self._t_init + self._timeout)
 
             self._runall(testcases)
-            if self._max_retries:
+            if (self._max_retries and
+                len(self._stats.failed()) <= self._retries_threshold):
                 restored_cases = restored_cases or []
                 self._retry_failed(testcases + restored_cases)
 
diff --git a/unittests/test_policies.py b/unittests/test_policies.py
index 37e40df5e..19ac55960 100644
--- a/unittests/test_policies.py
+++ b/unittests/test_policies.py
@@ -274,6 +274,13 @@ def test_retries_bad_check(make_runner, make_cases, common_exec_ctx):
     assert_runall(runner)
     assert runner.max_retries == rt.runtime().current_run
     assert 2 == len(runner.stats.failed())
+    assert 3 == runner.stats.num_runs
+
+
+def test_retries_threshold(make_runner, make_cases, common_exec_ctx):
+    runner = make_runner(max_retries=2, retries_threshold=1)
+    runner.runall(make_cases([BadSetupCheck(), BadSetupCheckEarly()]))
+    assert 1 == runner.stats.num_runs  # no retries: failures > threshold
 
 
 def test_retries_good_check(make_runner, make_cases, common_exec_ctx):