Ensure regression tasks don't start new tasks when not making progress (#4705)

mi-ac · web-flow · commit 656f49d2d2a1 · 2025-02-28T11:58:10.000-03:00
This returns a bad-build error type after a regression task when a known min/max input pair didn't shrink. Previously, a timeout error type was returned, after which post-process respawned the task with the same min/max pair, leading to a task loop. This doesn't change behavior when either min or max is not known (e.g. when tasks are initially started), when some progress is made (e.g. at least one improvement to min or max during the several hours this task can run), or when other error conditions happen. Chrome bug: https://crbug.com/396344382
diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/regression_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/regression_task.py
@@ -414,8 +414,14 @@ def find_regression_range(
         error_type=uworker_msg_pb2.ErrorType.REGRESSION_REVISION_LIST_ERROR)  # pylint: disable=no-member
 
   # Pick up where left off in a previous run if necessary.
-  min_revision = testcase.get_metadata('last_regression_min')
-  max_revision = testcase.get_metadata('last_regression_max')
+  # Cache these values so that we can judge at the end whether we actually
+  # made progress. A lot of time may pass between here and the end of the
+  # loop, during which another concurrently running regression task might
+  # modify the metadata.
+  last_min_revision = testcase.get_metadata('last_regression_min')
+  last_max_revision = testcase.get_metadata('last_regression_max')
+  min_revision = last_min_revision
+  max_revision = last_max_revision
 
   logs.info('Build set up, starting search for regression range. State: ' +
             f'crash_revision = {testcase.crash_revision}, ' +
@@ -548,13 +554,27 @@ def find_regression_range(
     regression_task_output.last_regression_min = revision_list[min_index]
     regression_task_output.last_regression_max = revision_list[max_index]
 
-  # If we've broken out of the above loop, we timed out. We'll finish by
-  # running another regression task and picking up from this point.
+  # If we've broken out of the above loop, we timed out. Remember where
+  # we left off.
+  regression_task_output.last_regression_min = revision_list[min_index]
+  regression_task_output.last_regression_max = revision_list[max_index]
+
+  # Check if we made progress at all. If this task resumed after a previous
+  # timeout, it started with known min/max revisions. Without any progress,
+  # most builds likely failed the bad build check, in which case we don't
+  # start another task, to avoid a task loop.
+  if (last_min_revision == revision_list[min_index] and
+      last_max_revision == revision_list[max_index]):
+    return uworker_msg_pb2.Output(  # pylint: disable=no-member
+        regression_task_output=regression_task_output,
+        error_type=uworker_msg_pb2.REGRESSION_BAD_BUILD_ERROR,  # pylint: disable=no-member
+        error_message='No progress during bisect.')
+
+  # Because we made progress, the timeout error handler will trigger another
+  # regression task and pick up from this point.
   # TODO: Error handling should be moved to postprocess.
   error_message = 'Timed out, current range r%d:r%d' % (
       revision_list[min_index], revision_list[max_index])
-  regression_task_output.last_regression_min = revision_list[min_index]
-  regression_task_output.last_regression_max = revision_list[max_index]
   return uworker_msg_pb2.Output(  # pylint: disable=no-member
       regression_task_output=regression_task_output,
       error_type=uworker_msg_pb2.REGRESSION_TIMEOUT_ERROR,  # pylint: disable=no-member
diff --git a/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/regression_task_test.py b/src/clusterfuzz/_internal/tests/core/bot/tasks/utasks/regression_task_test.py
@@ -1014,6 +1014,46 @@ def reproduces(revision):
     self.assertEqual(output.regression_task_output.regression_range_start, 50)
     self.assertEqual(output.regression_task_output.regression_range_end, 100)
 
+  def test_timeout_bisect_no_progress(self):
+    """Verifies that bisection without progress will terminate."""
+    self.mock.get_revisions_list.return_value = list(range(0, 102, 2))
+    self.deadline = 5.
+
+    def repros(revision):
+      self.mock_time += 1.
+
+      if revision < 68:
+        # Let every revision below the known max (68) have bad build errors.
+        return False, uworker_msg_pb2.Output(
+            error_type=uworker_msg_pb2.REGRESSION_BAD_BUILD_ERROR)
+      return True, None
+
+    self.reproduces_in_revision = repros
+
+    testcase = test_utils.create_generic_testcase()
+    testcase.crash_revision = 100
+
+    # Pick up an unfinished task.
+    testcase.set_metadata('last_regression_max', 68)
+    testcase.set_metadata('last_regression_min', 36)
+
+    uworker_input = uworker_msg_pb2.Input(
+        testcase_id=str(testcase.key.id()),
+        testcase=uworker_io.entity_to_protobuf(testcase),
+        job_type='foo-job',
+        setup_input=uworker_msg_pb2.SetupInput(),
+        regression_task_input=uworker_msg_pb2.RegressionTaskInput(),
+    )
+
+    output = regression_task.utask_main(uworker_input)
+
+    self.assertEqual(output.error_type,
+                     uworker_msg_pb2.REGRESSION_BAD_BUILD_ERROR)
+
+    # The task made no progress.
+    self.assertEqual(output.regression_task_output.last_regression_max, 68)
+    self.assertEqual(output.regression_task_output.last_regression_min, 36)
+
   def test_inconsistent_state_max_none_min_not_none(self):
     """Verifies that when last_regression_max is None and last_regression_min
     is not None, we ignore the latter and restart from scratch."""