feat(eval): reliability improvement for SWE-Bench eval_infer (#6347)

All-Hands-AI · Jan 18, 2025
commit 2b04ee2 · 1 parent 4383be1

Showing 2 changed files with 15 additions and 10 deletions.
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
@@ -355,7 +355,9 @@ def _process_instance_wrapper(
             )
             # e is likely an EvalException, so we can't directly infer it from type
             # but rather check if it's a fatal error
-            if is_fatal_runtime_error(str(e)):
+            # But it can also be AgentRuntime**Error (e.g., swe_bench/eval_infer.py)
+            _error_str = type(e).__name__ + ': ' + str(e)
+            if is_fatal_runtime_error(_error_str):
                 runtime_failure_count += 1
                 msg += f'Runtime disconnected error detected for instance {instance.instance_id}, runtime failure count: {runtime_failure_count}'
                 msg += '\n' + '-' * 10 + '\n'
@@ -531,6 +533,7 @@ def is_fatal_runtime_error(error: str | None) -> bool:
         return False
 
     FATAL_RUNTIME_ERRORS = [
+        AgentRuntimeTimeoutError,
         AgentRuntimeUnavailableError,
         AgentRuntimeDisconnectedError,
         AgentRuntimeNotFoundError,

diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py
@@ -60,6 +60,12 @@ def __init__(
             )
         self.session.headers.update({'X-API-Key': self.config.sandbox.api_key})
 
+        if self.config.workspace_base is not None:
+            self.log(
+                'debug',
+                'Setting workspace_base is not supported in the remote runtime.',
+            )
+
         self.runtime_builder = RemoteRuntimeBuilder(
             self.config.sandbox.remote_runtime_api_url,
             self.config.sandbox.api_key,
@@ -70,12 +76,6 @@ def __init__(
         self.available_hosts: dict[str, int] = {}
         self._runtime_initialized: bool = False
 
-        if self.config.workspace_base is not None:
-            self.log(
-                'debug',
-                'Setting workspace_base is not supported in the remote runtime.',
-            )
-
     def log(self, level: str, message: str) -> None:
         message = f'[runtime session_id={self.sid} runtime_id={self.runtime_id or "unknown"}] {message}'
         getattr(logger, level)(message, stacklevel=2)
@@ -230,7 +230,7 @@ def _start_runtime(self):
                 f'Runtime started. URL: {self.runtime_url}',
             )
         except requests.HTTPError as e:
-            self.log('error', f'Unable to start runtime: {e}')
+            self.log('error', f'Unable to start runtime: {str(e)}')
             raise AgentRuntimeUnavailableError() from e
 
     def _resume_runtime(self):
@@ -315,10 +315,11 @@ def _wait_until_alive_impl(self):
                 self.check_if_alive()
             except requests.HTTPError as e:
                 self.log(
-                    'warning', f"Runtime /alive failed, but pod says it's ready: {e}"
+                    'warning',
+                    f"Runtime /alive failed, but pod says it's ready: {str(e)}",
                 )
                 raise AgentRuntimeNotReadyError(
-                    f'Runtime /alive failed to respond with 200: {e}'
+                    f'Runtime /alive failed to respond with 200: {str(e)}'
                 )
             return
         elif (
@@ -363,6 +364,7 @@ def close(self):
                 ):
                     self.log('debug', 'Runtime stopped.')
         except Exception as e:
+            self.log('error', f'Unable to stop runtime: {str(e)}')
             raise e
         finally:
             super().close()