Skip to content

Commit

Permalink
fix: improve remote runtime reliability on large-scale evaluation (#4869
Browse files Browse the repository at this point in the history
)
  • Loading branch information
xingyaoww authored Nov 9, 2024
1 parent be82832 commit a07e827
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 8 deletions.
1 change: 1 addition & 0 deletions evaluation/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def get_config(instance: pd.Series) -> AppConfig:
timeout=1800,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,
Expand Down
1 change: 1 addition & 0 deletions evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def get_config(
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_remote_runtime_alive=False,
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,
Expand Down
4 changes: 3 additions & 1 deletion openhands/core/config/sandbox_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class SandboxConfig:
base_container_image: The base container image from which to build the runtime image.
runtime_container_image: The runtime container image to use.
user_id: The user ID for the sandbox.
timeout: The timeout for the sandbox.
timeout: The timeout for the default sandbox action execution.
remote_runtime_init_timeout: The timeout for the remote runtime to start.
enable_auto_lint: Whether to enable auto-lint.
use_host_network: Whether to use the host network.
initialize_plugins: Whether to initialize plugins.
Expand All @@ -41,6 +42,7 @@ class SandboxConfig:
runtime_container_image: str | None = None
user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
timeout: int = 120
remote_runtime_init_timeout: int = 180
enable_auto_lint: bool = (
False # once enabled, OpenHands would lint files after editing
)
Expand Down
20 changes: 13 additions & 7 deletions openhands/runtime/impl/remote/remote_runtime.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from pathlib import Path
import tempfile
import threading
from pathlib import Path
from typing import Callable, Optional
from zipfile import ZipFile

Expand Down Expand Up @@ -260,13 +260,19 @@ def _parse_runtime_response(self, response: requests.Response):
{'X-Session-API-Key': start_response['session_api_key']}
)

@tenacity.retry(
stop=tenacity.stop_after_delay(180) | stop_if_should_exit(),
reraise=True,
retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
wait=tenacity.wait_fixed(2),
)
def _wait_until_alive(self):
retry_decorator = tenacity.retry(
stop=tenacity.stop_after_delay(
self.config.sandbox.remote_runtime_init_timeout
)
| stop_if_should_exit(),
reraise=True,
retry=tenacity.retry_if_exception_type(RuntimeNotReadyError),
wait=tenacity.wait_fixed(2),
)
return retry_decorator(self._wait_until_alive_impl)()

def _wait_until_alive_impl(self):
self.log('debug', f'Waiting for runtime to be alive at url: {self.runtime_url}')
runtime_info_response = self._send_request(
'GET',
Expand Down

0 comments on commit a07e827

Please sign in to comment.