From 76cdcd19895d8823648894692301eb12b96a2094 Mon Sep 17 00:00:00 2001
From: AlexCuadron <alex.cl.2000@gmail.com>
Date: Tue, 15 Oct 2024 17:31:15 +0200
Subject: [PATCH 01/16] Updated tests

---
 tests/unit/test_runtime_build.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/unit/test_runtime_build.py b/tests/unit/test_runtime_build.py
index a8d215544303..e1e1c18eec94 100644
--- a/tests/unit/test_runtime_build.py
+++ b/tests/unit/test_runtime_build.py
@@ -259,6 +259,7 @@ def test_build_runtime_image_from_scratch(temp_dir):
                 f'{get_runtime_image_repo()}:{from_scratch_hash}',
                 f'{get_runtime_image_repo()}:{OH_VERSION}_image_debian_tag_11',
             ],
+            platform='linux/amd64',  # Added platform tag
         )
         assert image_name == f'{get_runtime_image_repo()}:{from_scratch_hash}'
 
@@ -340,6 +341,7 @@ def test_build_runtime_image_exact_hash_not_exist(mock_build_sandbox_image, temp
                 target_image_repo=repo,
                 target_image_hash_tag=from_scratch_hash,
                 target_image_tag=latest_image_tag,
+                platform='linux/amd64',  # Added platform argument
             )
             assert image_name == f'{repo}:{from_scratch_hash}'
 

From 3beaf5c02d97e5d01694899b3f9a1a1c2bbf0571 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 15 Oct 2024 18:59:26 +0200
Subject: [PATCH 02/16] chore(deps): bump litellm from 1.49.3 to 1.49.4 (#4406)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 poetry.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 97457465f380..4fe8048b7fd1 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3879,13 +3879,13 @@ types-tqdm = "*"
 
 [[package]]
 name = "litellm"
-version = "1.49.3"
+version = "1.49.4"
 description = "Library to easily interface with LLM API providers"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
 files = [
-    {file = "litellm-1.49.3-py3-none-any.whl", hash = "sha256:300c3c9e1600441f8b6d3afe0fd79c6193f901b2091f3730883ffe3709eebfa2"},
-    {file = "litellm-1.49.3.tar.gz", hash = "sha256:e51ce30286894803dcf2949ddb4aab5c2e00809694a48ce6e997953566113c0b"},
+    {file = "litellm-1.49.4-py3-none-any.whl", hash = "sha256:3094a9f74979da993f4b3298372ec4416f7a3f82d11a0831c9c616098b3fb50a"},
+    {file = "litellm-1.49.4.tar.gz", hash = "sha256:5f16d40bfa7747fcc21f45f340454c57cbc705178244fe7326abac7c0759e05e"},
 ]
 
 [package.dependencies]

From c8db8aaf92d817c4ac7f867c5ee0edbbfe280276 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 15 Oct 2024 19:05:33 +0200
Subject: [PATCH 03/16] chore(deps-dev): bump llama-index from 0.11.17 to
 0.11.18 (#4408)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 poetry.lock | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 4fe8048b7fd1..73b6b1c394b1 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3922,19 +3922,19 @@ pydantic = ">=1.10"
 
 [[package]]
 name = "llama-index"
-version = "0.11.17"
+version = "0.11.18"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index-0.11.17-py3-none-any.whl", hash = "sha256:85a3d2cd1908181555ae926f880dfae5284b24fda6b866e60969a44302d87350"},
-    {file = "llama_index-0.11.17.tar.gz", hash = "sha256:b2176f400b33cd765e86775724d8ad5c6e4812ecdc36f2ca4500edcadc015af4"},
+    {file = "llama_index-0.11.18-py3-none-any.whl", hash = "sha256:dc54c7fdd4c8ee32aa0c5565038894295fc76bd95e21e70fa67ca6fb2413a1b3"},
+    {file = "llama_index-0.11.18.tar.gz", hash = "sha256:5c43b46ea9957d539ad823e008c9b6957fbaf4ec5c8bc6903accfb19863edfd9"},
 ]
 
 [package.dependencies]
 llama-index-agent-openai = ">=0.3.4,<0.4.0"
 llama-index-cli = ">=0.3.1,<0.4.0"
-llama-index-core = ">=0.11.17,<0.12.0"
+llama-index-core = ">=0.11.18,<0.12.0"
 llama-index-embeddings-openai = ">=0.2.4,<0.3.0"
 llama-index-indices-managed-llama-cloud = ">=0.3.0"
 llama-index-legacy = ">=0.9.48,<0.10.0"
@@ -3980,13 +3980,13 @@ llama-index-llms-openai = ">=0.2.0,<0.3.0"
 
 [[package]]
 name = "llama-index-core"
-version = "0.11.17"
+version = "0.11.18"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_core-0.11.17-py3-none-any.whl", hash = "sha256:d65565b54ea55b2db12f9a1cd5c250b770d7e43d3363137cff431a6116ef069c"},
-    {file = "llama_index_core-0.11.17.tar.gz", hash = "sha256:1143baf8d819e27555bdb142abdf2833d3d37731f270f46fa1e07fc4b97116ae"},
+    {file = "llama_index_core-0.11.18-py3-none-any.whl", hash = "sha256:8e57522e69d3c8a219b29b5f1624c20269c9c3f87729eff9ecfb796eab51dd55"},
+    {file = "llama_index_core-0.11.18.tar.gz", hash = "sha256:f94ae8d740b65c3bf0bc0422b0210613664c1a9f8e98b7328e037a68255bed83"},
 ]
 
 [package.dependencies]

From 308dc62546ddad4a70bcd1c86841801b02071ed0 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 15 Oct 2024 19:06:40 +0200
Subject: [PATCH 04/16] chore(deps): bump modal from 0.64.181 to 0.64.182
 (#4407)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 poetry.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 73b6b1c394b1..f396e504b4f4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4799,12 +4799,12 @@ type = ["mypy (==1.11.2)"]
 
 [[package]]
 name = "modal"
-version = "0.64.181"
+version = "0.64.182"
 description = "Python client library for Modal"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "modal-0.64.181-py3-none-any.whl", hash = "sha256:1d8dab39029abd2b11ca44b8401c71407b20e0e9a2e5c673ec6b1476d3b17fa2"},
+    {file = "modal-0.64.182-py3-none-any.whl", hash = "sha256:d3213550a0724b13b1dacf8b468d26c78f51d850fd2a76529f180921905bcad3"},
 ]
 
 [package.dependencies]

From 158a9230b0d89eaab448c5d371ab9f3516887550 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 15 Oct 2024 14:31:49 -0500
Subject: [PATCH 05/16] refactor: move get_pairs from memory to shared utils
 (#4411)

---
 openhands/events/utils.py   | 56 +++++++++++++++++++++++++++++++++++++
 openhands/memory/history.py | 54 +++--------------------------------
 tests/unit/test_is_stuck.py | 23 +++++++++++++--
 3 files changed, 81 insertions(+), 52 deletions(-)
 create mode 100644 openhands/events/utils.py

diff --git a/openhands/events/utils.py b/openhands/events/utils.py
new file mode 100644
index 000000000000..6c8cc415f675
--- /dev/null
+++ b/openhands/events/utils.py
@@ -0,0 +1,56 @@
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.action import Action
+from openhands.events.action.empty import NullAction
+from openhands.events.event import Event
+from openhands.events.observation.commands import CmdOutputObservation
+from openhands.events.observation.empty import NullObservation
+from openhands.events.observation.observation import Observation
+
+
+def get_pairs_from_events(events: list[Event]) -> list[tuple[Action, Observation]]:
+    """Return the history as a list of tuples (action, observation)."""
+    tuples: list[tuple[Action, Observation]] = []
+    action_map: dict[int, Action] = {}
+    observation_map: dict[int, Observation] = {}
+
+    # runnable actions are set as cause of observations
+    # (MessageAction, NullObservation) for source=USER
+    # (MessageAction, NullObservation) for source=AGENT
+    # (other_action?, NullObservation)
+    # (NullAction, CmdOutputObservation) background CmdOutputObservations
+
+    for event in events:
+        if event.id is None or event.id == -1:
+            logger.debug(f'Event {event} has no ID')
+
+        if isinstance(event, Action):
+            action_map[event.id] = event
+
+        if isinstance(event, Observation):
+            if event.cause is None or event.cause == -1:
+                logger.debug(f'Observation {event} has no cause')
+
+            if event.cause is None:
+                # runnable actions are set as cause of observations
+                # NullObservations have no cause
+                continue
+
+            observation_map[event.cause] = event
+
+    for action_id, action in action_map.items():
+        observation = observation_map.get(action_id)
+        if observation:
+            # observation with a cause
+            tuples.append((action, observation))
+        else:
+            tuples.append((action, NullObservation('')))
+
+    for cause_id, observation in observation_map.items():
+        if cause_id not in action_map:
+            if isinstance(observation, NullObservation):
+                continue
+            if not isinstance(observation, CmdOutputObservation):
+                logger.debug(f'Observation {observation} has no cause')
+            tuples.append((NullAction(), observation))
+
+    return tuples.copy()
diff --git a/openhands/memory/history.py b/openhands/memory/history.py
index 89e50d67e455..1e4cfb8b5f05 100644
--- a/openhands/memory/history.py
+++ b/openhands/memory/history.py
@@ -10,12 +10,12 @@
 from openhands.events.action.message import MessageAction
 from openhands.events.event import Event, EventSource
 from openhands.events.observation.agent import AgentStateChangedObservation
-from openhands.events.observation.commands import CmdOutputObservation
 from openhands.events.observation.delegate import AgentDelegateObservation
 from openhands.events.observation.empty import NullObservation
 from openhands.events.observation.observation import Observation
 from openhands.events.serialization.event import event_to_dict
 from openhands.events.stream import EventStream
+from openhands.events.utils import get_pairs_from_events
 
 
 class ShortTermHistory(list[Event]):
@@ -216,55 +216,9 @@ def on_event(self, event: Event):
     def compatibility_for_eval_history_pairs(self) -> list[tuple[dict, dict]]:
         history_pairs = []
 
-        for action, observation in self.get_pairs():
+        for action, observation in get_pairs_from_events(
+            self.get_events_as_list(include_delegates=True)
+        ):
             history_pairs.append((event_to_dict(action), event_to_dict(observation)))
 
         return history_pairs
-
-    def get_pairs(self) -> list[tuple[Action, Observation]]:
-        """Return the history as a list of tuples (action, observation)."""
-        tuples: list[tuple[Action, Observation]] = []
-        action_map: dict[int, Action] = {}
-        observation_map: dict[int, Observation] = {}
-
-        # runnable actions are set as cause of observations
-        # (MessageAction, NullObservation) for source=USER
-        # (MessageAction, NullObservation) for source=AGENT
-        # (other_action?, NullObservation)
-        # (NullAction, CmdOutputObservation) background CmdOutputObservations
-
-        for event in self.get_events_as_list(include_delegates=True):
-            if event.id is None or event.id == -1:
-                logger.debug(f'Event {event} has no ID')
-
-            if isinstance(event, Action):
-                action_map[event.id] = event
-
-            if isinstance(event, Observation):
-                if event.cause is None or event.cause == -1:
-                    logger.debug(f'Observation {event} has no cause')
-
-                if event.cause is None:
-                    # runnable actions are set as cause of observations
-                    # NullObservations have no cause
-                    continue
-
-                observation_map[event.cause] = event
-
-        for action_id, action in action_map.items():
-            observation = observation_map.get(action_id)
-            if observation:
-                # observation with a cause
-                tuples.append((action, observation))
-            else:
-                tuples.append((action, NullObservation('')))
-
-        for cause_id, observation in observation_map.items():
-            if cause_id not in action_map:
-                if isinstance(observation, NullObservation):
-                    continue
-                if not isinstance(observation, CmdOutputObservation):
-                    logger.debug(f'Observation {observation} has no cause')
-                tuples.append((NullAction(), observation))
-
-        return tuples.copy()
diff --git a/tests/unit/test_is_stuck.py b/tests/unit/test_is_stuck.py
index 5e23a849286b..4a1330752161 100644
--- a/tests/unit/test_is_stuck.py
+++ b/tests/unit/test_is_stuck.py
@@ -17,6 +17,7 @@
 from openhands.events.observation.empty import NullObservation
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.stream import EventSource, EventStream
+from openhands.events.utils import get_pairs_from_events
 from openhands.memory.history import ShortTermHistory
 from openhands.storage import get_file_store
 
@@ -170,7 +171,16 @@ def test_is_stuck_repeating_action_observation(
 
         assert len(collect_events(event_stream)) == 10
         assert len(list(stuck_detector.state.history.get_events())) == 8
-        assert len(stuck_detector.state.history.get_pairs()) == 5
+        assert (
+            len(
+                get_pairs_from_events(
+                    stuck_detector.state.history.get_events_as_list(
+                        include_delegates=True
+                    )
+                )
+            )
+            == 5
+        )
 
         assert stuck_detector.is_stuck() is False
         assert stuck_detector.state.almost_stuck == 1
@@ -186,7 +196,16 @@ def test_is_stuck_repeating_action_observation(
 
         assert len(collect_events(event_stream)) == 12
         assert len(list(stuck_detector.state.history.get_events())) == 10
-        assert len(stuck_detector.state.history.get_pairs()) == 6
+        assert (
+            len(
+                get_pairs_from_events(
+                    stuck_detector.state.history.get_events_as_list(
+                        include_delegates=True
+                    )
+                )
+            )
+            == 6
+        )
 
         with patch('logging.Logger.warning') as mock_warning:
             assert stuck_detector.is_stuck() is True

From b6a916342736dae8907179290f04d5116f072814 Mon Sep 17 00:00:00 2001
From: mamoodi <mamoodiha@gmail.com>
Date: Tue, 15 Oct 2024 18:45:08 -0400
Subject: [PATCH 06/16] Fix eval output path in case of @ char (#4416)

---
 evaluation/utils/shared.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index bed679f342a2..d184b5b98037 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -152,7 +152,7 @@ def make_metadata(
     details: dict[str, Any] | None = None,
 ) -> EvalMetadata:
     model_name = llm_config.model.split('/')[-1]
-    model_path = model_name.replace(':', '_')
+    model_path = model_name.replace(':', '_').replace('@', '-')
     eval_note = f'_N_{eval_note}' if eval_note else ''
 
     eval_output_path = os.path.join(

From 8ba531a012eefe824af8ece6a505c4913dfe48cc Mon Sep 17 00:00:00 2001
From: tofarr <tofarr@gmail.com>
Date: Tue, 15 Oct 2024 17:52:21 -0600
Subject: [PATCH 07/16] Fix for lockup - create the runtime in a background
 thread (#4412)

Co-authored-by: Robert Brennan <contact@rbren.io>
---
 openhands/runtime/runtime.py              |  4 +-
 openhands/security/invariant/analyzer.py  |  4 +-
 openhands/server/listen.py                | 14 +++---
 openhands/server/session/agent_session.py |  9 +++-
 openhands/utils/async_utils.py            | 16 ++++++-
 tests/unit/test_async_utils.py            | 54 +++++++++++++++++------
 6 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/openhands/runtime/runtime.py b/openhands/runtime/runtime.py
index 7e420643c347..44614ee0a3dc 100644
--- a/openhands/runtime/runtime.py
+++ b/openhands/runtime/runtime.py
@@ -28,7 +28,7 @@
 )
 from openhands.events.serialization.action import ACTION_TYPE_TO_CLASS
 from openhands.runtime.plugins import JupyterRequirement, PluginRequirement
-from openhands.utils.async_utils import sync_from_async
+from openhands.utils.async_utils import call_sync_from_async
 
 
 def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]:
@@ -123,7 +123,7 @@ async def on_event(self, event: Event) -> None:
             if event.timeout is None:
                 event.timeout = self.config.sandbox.timeout
             assert event.timeout is not None
-            observation = await sync_from_async(self.run_action, event)
+            observation = await call_sync_from_async(self.run_action, event)
             observation._cause = event.id  # type: ignore[attr-defined]
             source = event.source if event.source else EventSource.AGENT
             await self.event_stream.async_add_event(observation, source)  # type: ignore[arg-type]
diff --git a/openhands/security/invariant/analyzer.py b/openhands/security/invariant/analyzer.py
index 275888bb4197..9d8b280716a7 100644
--- a/openhands/security/invariant/analyzer.py
+++ b/openhands/security/invariant/analyzer.py
@@ -19,7 +19,7 @@
 from openhands.security.analyzer import SecurityAnalyzer
 from openhands.security.invariant.client import InvariantClient
 from openhands.security.invariant.parser import TraceElement, parse_element
-from openhands.utils.async_utils import sync_from_async
+from openhands.utils.async_utils import call_sync_from_async
 
 
 class InvariantAnalyzer(SecurityAnalyzer):
@@ -146,7 +146,7 @@ async def confirm(self, event: Event) -> None:
             {'action': 'change_agent_state', 'args': {'agent_state': 'user_confirmed'}}
         )
         event_source = event.source if event.source else EventSource.AGENT
-        await sync_from_async(self.event_stream.add_event, new_event, event_source)
+        await call_sync_from_async(self.event_stream.add_event, new_event, event_source)
 
     async def security_risk(self, event: Action) -> ActionSecurityRisk:
         logger.info('Calling security_risk on InvariantAnalyzer')
diff --git a/openhands/server/listen.py b/openhands/server/listen.py
index d7f177734971..32c93a117e23 100644
--- a/openhands/server/listen.py
+++ b/openhands/server/listen.py
@@ -14,7 +14,7 @@
 from openhands.security.options import SecurityAnalyzers
 from openhands.server.data_models.feedback import FeedbackDataModel, store_feedback
 from openhands.storage import get_file_store
-from openhands.utils.async_utils import sync_from_async
+from openhands.utils.async_utils import call_sync_from_async
 
 with warnings.catch_warnings():
     warnings.simplefilter('ignore')
@@ -211,8 +211,8 @@ async def attach_session(request: Request, call_next):
             content={'error': 'Invalid token'},
         )
 
-    request.state.conversation = session_manager.attach_to_conversation(
-        request.state.sid
+    request.state.conversation = await call_sync_from_async(
+        session_manager.attach_to_conversation, request.state.sid
     )
     if request.state.conversation is None:
         return JSONResponse(
@@ -441,7 +441,9 @@ async def list_files(request: Request, path: str | None = None):
         )
 
     runtime: Runtime = request.state.conversation.runtime
-    file_list = await sync_from_async(runtime.list_files, path)
+    file_list = await asyncio.create_task(
+        call_sync_from_async(runtime.list_files, path)
+    )
     if path:
         file_list = [os.path.join(path, f) for f in file_list]
 
@@ -490,7 +492,7 @@ async def select_file(file: str, request: Request):
 
     file = os.path.join(runtime.config.workspace_mount_path_in_sandbox, file)
     read_action = FileReadAction(file)
-    observation = await sync_from_async(runtime.run_action, read_action)
+    observation = await call_sync_from_async(runtime.run_action, read_action)
 
     if isinstance(observation, FileReadObservation):
         content = observation.content
@@ -687,7 +689,7 @@ async def save_file(request: Request):
             runtime.config.workspace_mount_path_in_sandbox, file_path
         )
         write_action = FileWriteAction(file_path, content)
-        observation = await sync_from_async(runtime.run_action, write_action)
+        observation = await call_sync_from_async(runtime.run_action, write_action)
 
         if isinstance(observation, FileWriteObservation):
             return JSONResponse(
diff --git a/openhands/server/session/agent_session.py b/openhands/server/session/agent_session.py
index f172021a37d2..6bc442ac731a 100644
--- a/openhands/server/session/agent_session.py
+++ b/openhands/server/session/agent_session.py
@@ -14,6 +14,7 @@
 from openhands.runtime.runtime import Runtime
 from openhands.security import SecurityAnalyzer, options
 from openhands.storage.files import FileStore
+from openhands.utils.async_utils import call_sync_from_async
 
 
 class AgentSession:
@@ -102,7 +103,13 @@ async def _start(
     ):
         self.loop = asyncio.get_running_loop()
         self._create_security_analyzer(config.security.security_analyzer)
-        self._create_runtime(runtime_name, config, agent, status_message_callback)
+        await call_sync_from_async(
+            self._create_runtime,
+            runtime_name=runtime_name,
+            config=config,
+            agent=agent,
+            status_message_callback=status_message_callback,
+        )
         self._create_controller(
             agent,
             config.security.confirmation_mode,
diff --git a/openhands/utils/async_utils.py b/openhands/utils/async_utils.py
index 7da8d05ff5c6..2a3b73f5da7d 100644
--- a/openhands/utils/async_utils.py
+++ b/openhands/utils/async_utils.py
@@ -7,7 +7,7 @@
 EXECUTOR = ThreadPoolExecutor()
 
 
-async def sync_from_async(fn: Callable, *args, **kwargs):
+async def call_sync_from_async(fn: Callable, *args, **kwargs):
     """
     Shorthand for running a function in the default background thread pool executor
     and awaiting the result. The nature of synchronous code is that the future
@@ -19,7 +19,7 @@ async def sync_from_async(fn: Callable, *args, **kwargs):
     return result
 
 
-def async_from_sync(
+def call_async_from_sync(
     corofn: Callable, timeout: float = GENERAL_TIMEOUT, *args, **kwargs
 ):
     """
@@ -27,6 +27,11 @@ def async_from_sync(
     and awaiting the result
     """
 
+    if corofn is None:
+        raise ValueError('corofn is None')
+    if not asyncio.iscoroutinefunction(corofn):
+        raise ValueError('corofn is not a coroutine function')
+
     async def arun():
         coro = corofn(*args, **kwargs)
         result = await coro
@@ -46,6 +51,13 @@ def run():
     return result
 
 
+async def call_coro_in_bg_thread(
+    corofn: Callable, timeout: float = GENERAL_TIMEOUT, *args, **kwargs
+):
+    """Function for running a coroutine in a background thread."""
+    await call_sync_from_async(call_async_from_sync, corofn, timeout, *args, **kwargs)
+
+
 async def wait_all(
     iterable: Iterable[Coroutine], timeout: int = GENERAL_TIMEOUT
 ) -> List:
diff --git a/tests/unit/test_async_utils.py b/tests/unit/test_async_utils.py
index 89dd1e0f6915..3dc99438968e 100644
--- a/tests/unit/test_async_utils.py
+++ b/tests/unit/test_async_utils.py
@@ -1,11 +1,13 @@
 import asyncio
+import time
 
 import pytest
 
 from openhands.utils.async_utils import (
     AsyncException,
-    async_from_sync,
-    sync_from_async,
+    call_async_from_sync,
+    call_coro_in_bg_thread,
+    call_sync_from_async,
     wait_all,
 )
 
@@ -80,44 +82,44 @@ async def dummy(value: int):
 
 
 @pytest.mark.asyncio
-async def test_sync_from_async():
+async def test_call_sync_from_async():
     def dummy(value: int = 2):
         return value * 2
 
-    result = await sync_from_async(dummy)
+    result = await call_sync_from_async(dummy)
     assert result == 4
-    result = await sync_from_async(dummy, 3)
+    result = await call_sync_from_async(dummy, 3)
     assert result == 6
-    result = await sync_from_async(dummy, value=5)
+    result = await call_sync_from_async(dummy, value=5)
     assert result == 10
 
 
 @pytest.mark.asyncio
-async def test_sync_from_async_error():
+async def test_call_sync_from_async_error():
     def dummy():
         raise ValueError()
 
     with pytest.raises(ValueError):
-        await sync_from_async(dummy)
+        await call_sync_from_async(dummy)
 
 
-def test_async_from_sync():
+def test_call_async_from_sync():
     async def dummy(value: int):
         return value * 2
 
-    result = async_from_sync(dummy, 0, 3)
+    result = call_async_from_sync(dummy, 0, 3)
     assert result == 6
 
 
-def test_async_from_sync_error():
+def test_call_async_from_sync_error():
     async def dummy(value: int):
         raise ValueError()
 
     with pytest.raises(ValueError):
-        async_from_sync(dummy, 0, 3)
+        call_async_from_sync(dummy, 0, 3)
 
 
-def test_async_from_sync_background_tasks():
+def test_call_async_from_sync_background_tasks():
     events = []
 
     async def bg_task():
@@ -132,9 +134,33 @@ async def dummy(value: int):
         asyncio.create_task(bg_task())
         events.append('dummy_started')
 
-    async_from_sync(dummy, 0, 3)
+    call_async_from_sync(dummy, 0, 3)
 
     # We check that the function did not return until all coroutines completed
     # (Even though some of these were started as background tasks)
     expected = ['dummy_started', 'dummy_started', 'bg_started', 'bg_finished']
     assert expected == events
+
+
+@pytest.mark.asyncio
+async def test_call_coro_in_bg_thread():
+    times = {}
+
+    async def bad_async(id_):
+        # Dummy demonstrating some bad async function that does not cede control
+        time.sleep(0.1)
+        times[id_] = time.time()
+
+    async def curve_ball():
+        # A curve ball - an async function that wants to run while the bad async functions are in progress
+        await asyncio.sleep(0.05)
+        times['curve_ball'] = time.time()
+
+    start = time.time()
+    asyncio.create_task(curve_ball())
+    await wait_all(
+        call_coro_in_bg_thread(bad_async, id_=f'bad_async_{id_}') for id_ in range(5)
+    )
+    assert (times['curve_ball'] - start) == pytest.approx(0.05, abs=0.1)
+    for id_ in range(5):
+        assert (times[f'bad_async_{id_}'] - start) == pytest.approx(0.1, abs=0.1)

From 064e4ad5506bc08b5c219141b5a507520fb2ed36 Mon Sep 17 00:00:00 2001
From: AlexCuadron <alex.cl.2000@gmail.com>
Date: Wed, 6 Nov 2024 15:35:08 -0800
Subject: [PATCH 08/16] prompt engineering to remind o1 to generate a patch

---
 openhands/agenthub/codeact_agent/codeact_agent.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index d1f67eae9c2c..41c520c560ce 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -495,7 +495,16 @@ def _get_messages(self, state: State) -> list[Message]:
             )
             # do not add this for function calling
             if latest_user_message:
-                reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
+                reminder_text = (
+                    '\n\nENVIRONMENT REMINDER:\n'
+                    f'- You have {state.max_iterations - state.iteration} turns left to complete the task\n'
+                    '- When finished reply with <finish></finish>\n'
+                    '\n<IMPORTANT>\n'
+                    '- You MUST generate only one action per turn!\n'
+                    '- A patch is a set of changes to the source code of the codebase that you are given\n'
+                    '- You MUST generate a patch that attempts to fix the issue described in the <pr_description>\n'
+                    '</IMPORTANT>\n'
+                )
                 latest_user_message.content.append(TextContent(text=reminder_text))
 
         return messages

From aaba2d42fcabc4613c11d2df9c7cee1faefae21d Mon Sep 17 00:00:00 2001
From: AlexCuadron <alex.cl.2000@gmail.com>
Date: Thu, 7 Nov 2024 05:08:15 +0000
Subject: [PATCH 09/16] Make SWE Bench specific instructions conditional based
 on environment variable

1. Added SWE_BENCH_RUN environment variable in run_infer.py
2. Modified codeact_agent.py to only show SWE Bench specific instructions when SWE_BENCH_RUN is true
---
 evaluation/swe_bench/run_infer.py                |  3 +++
 .../agenthub/codeact_agent/codeact_agent.py      | 16 +++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index 8bd23939dd58..a28670cc9449 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -4,6 +4,9 @@
 import tempfile
 from typing import Any
 
+# Set environment variable to indicate SWE Bench context
+os.environ['SWE_BENCH_RUN'] = 'true'
+
 import pandas as pd
 import toml
 from datasets import load_dataset
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 484cedd53e2f..cc644e7e6571 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -513,12 +513,18 @@ def _get_messages(self, state: State) -> list[Message]:
                     '\n\nENVIRONMENT REMINDER:\n'
                     f'- You have {state.max_iterations - state.iteration} turns left to complete the task\n'
                     '- When finished reply with <finish></finish>\n'
-                    '\n<IMPORTANT>\n'
-                    '- You MUST generate only one action per turn!\n'
-                    '- A patch is a set of changes to the source code of the codebase that you are given\n'
-                    '- You MUST generate a patch that attempts to fix the issue described in the <pr_description>\n'
-                    '</IMPORTANT>\n'
                 )
+                
+                # Add SWE Bench specific instructions only when running in that context
+                if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true':
+                    reminder_text += (
+                        '\n<IMPORTANT>\n'
+                        '- You MUST generate only one action per turn!\n'
+                        '- A patch is a set of changes to the source code of the codebase that you are given\n'
+                        '- You MUST generate a patch that attempts to fix the issue described in the <pr_description>\n'
+                        '</IMPORTANT>\n'
+                    )
+                
                 latest_user_message.content.append(TextContent(text=reminder_text))
 
         return messages

From b19fdf1a3730c49a5f8b869fa3b3111a260ffe51 Mon Sep 17 00:00:00 2001
From: AlexCuadron <alex.cl.2000@gmail.com>
Date: Wed, 6 Nov 2024 21:13:04 -0800
Subject: [PATCH 10/16] fix

---
 evaluation/swe_bench/run_infer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index a28670cc9449..8bab6e7f78c4 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -4,9 +4,6 @@
 import tempfile
 from typing import Any
 
-# Set environment variable to indicate SWE Bench context
-os.environ['SWE_BENCH_RUN'] = 'true'
-
 import pandas as pd
 import toml
 from datasets import load_dataset
@@ -45,6 +42,9 @@
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
 RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
+os.environ['SWE_BENCH_RUN'] = (
+    'true'  # Set environment variable to indicate SWE Bench context
+)
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,

From f4b70661804f625023ab64039dcadf2e7b3220c4 Mon Sep 17 00:00:00 2001
From: AlexCuadron <alex.cl.2000@gmail.com>
Date: Wed, 6 Nov 2024 21:15:58 -0800
Subject: [PATCH 11/16] fix

---
 .../agenthub/codeact_agent/codeact_agent.py   | 41 +++++++++++--------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index cc644e7e6571..8a6c9769d428 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -508,23 +508,28 @@ def _get_messages(self, state: State) -> list[Message]:
                 None,
             )
             # do not add this for function calling
-            if latest_user_message:
-                reminder_text = (
-                    '\n\nENVIRONMENT REMINDER:\n'
-                    f'- You have {state.max_iterations - state.iteration} turns left to complete the task\n'
-                    '- When finished reply with <finish></finish>\n'
-                )
-                
-                # Add SWE Bench specific instructions only when running in that context
-                if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true':
-                    reminder_text += (
-                        '\n<IMPORTANT>\n'
-                        '- You MUST generate only one action per turn!\n'
-                        '- A patch is a set of changes to the source code of the codebase that you are given\n'
-                        '- You MUST generate a patch that attempts to fix the issue described in the <pr_description>\n'
-                        '</IMPORTANT>\n'
-                    )
-                
-                latest_user_message.content.append(TextContent(text=reminder_text))
+            if not latest_user_message:
+                return messages
+
+            # Build environment reminder text
+            reminder_parts = [
+                '\n\nENVIRONMENT REMINDER:',
+                f'- You have {state.max_iterations - state.iteration} turns left to complete the task',
+                '- When finished reply with <finish></finish>\n',
+            ]
+            reminder_text = '\n'.join(reminder_parts)
+
+            # Add SWE Bench specific instructions if needed
+            if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true':
+                swe_bench_parts = [
+                    '\n<IMPORTANT>',
+                    '- You MUST generate only one action per turn!',
+                    '- A patch is a set of changes to the source code of the codebase that you are given',
+                    '- You MUST generate a patch that attempts to fix the issue described in the <pr_description>',
+                    '</IMPORTANT>\n',
+                ]
+                reminder_text += '\n'.join(swe_bench_parts)
+
+            latest_user_message.content.append(TextContent(text=reminder_text))
 
         return messages

From af947b0fa8b05c12642248ac33e753a0bac43849 Mon Sep 17 00:00:00 2001
From: AlexCuadron <alex.cl.2000@gmail.com>
Date: Wed, 6 Nov 2024 21:36:40 -0800
Subject: [PATCH 12/16] fix

---
 openhands/agenthub/codeact_agent/codeact_agent.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 8a6c9769d428..40235d277247 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -512,12 +512,7 @@ def _get_messages(self, state: State) -> list[Message]:
                 return messages
 
             # Build environment reminder text
-            reminder_parts = [
-                '\n\nENVIRONMENT REMINDER:',
-                f'- You have {state.max_iterations - state.iteration} turns left to complete the task',
-                '- When finished reply with <finish></finish>\n',
-            ]
-            reminder_text = '\n'.join(reminder_parts)
+            reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
 
             # Add SWE Bench specific instructions if needed
             if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true':

From 9cf49272e95815b735cb77c136a7a5e3b788b26f Mon Sep 17 00:00:00 2001
From: AlexCuadron <alex.cl.2000@gmail.com>
Date: Thu, 7 Nov 2024 17:36:07 -0800
Subject: [PATCH 13/16] fixed according to comments

---
 evaluation/swe_bench/run_infer.py                 |  9 ++++++++-
 openhands/agenthub/codeact_agent/codeact_agent.py | 11 -----------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index 8bab6e7f78c4..c6db373b5e09 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -73,7 +73,6 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
         instruction += CODEACT_SWE_PROMPT.format(workspace_dir_name=workspace_dir_name)
     else:
         # Instruction based on Anthropic's official trajectory
-        # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
         instruction = (
             '<uploaded_files>\n'
             f'/workspace/{workspace_dir_name}\n'
@@ -94,6 +93,14 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
             "Your thinking should be thorough and so it's fine if it's very long.\n"
         )
 
+    instruction += """
+        <IMPORTANT>
+        - You MUST generate only one action per turn!
+        - A patch is a set of changes to the source code of the codebase that you are given
+        - You MUST generate a patch that attempts to fix the issue described in the <pr_description>
+        </IMPORTANT>
+        """
+
     if RUN_WITH_BROWSING:
         instruction += (
             '<IMPORTANT!>\n'
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 40235d277247..536ae5615613 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -514,17 +514,6 @@ def _get_messages(self, state: State) -> list[Message]:
             # Build environment reminder text
             reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
 
-            # Add SWE Bench specific instructions if needed
-            if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true':
-                swe_bench_parts = [
-                    '\n<IMPORTANT>',
-                    '- You MUST generate only one action per turn!',
-                    '- A patch is a set of changes to the source code of the codebase that you are given',
-                    '- You MUST generate a patch that attempts to fix the issue described in the <pr_description>',
-                    '</IMPORTANT>\n',
-                ]
-                reminder_text += '\n'.join(swe_bench_parts)
-
             latest_user_message.content.append(TextContent(text=reminder_text))
 
         return messages

From bc9f635afc961282b3a95469e0cea9fff8e1fe81 Mon Sep 17 00:00:00 2001
From: Alejandro Cuadron Lafuente <alex.cl.2000@gmail.com>
Date: Thu, 7 Nov 2024 17:39:23 -0800
Subject: [PATCH 14/16] Update run_infer.py

---
 evaluation/swe_bench/run_infer.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index c6db373b5e09..869e5e74bd1f 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -42,9 +42,6 @@
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
 RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
-os.environ['SWE_BENCH_RUN'] = (
-    'true'  # Set environment variable to indicate SWE Bench context
-)
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,

From deb54f72843bcef0a5e75453ec72b1180ac23fd2 Mon Sep 17 00:00:00 2001
From: Alejandro Cuadron Lafuente <alex.cl.2000@gmail.com>
Date: Thu, 7 Nov 2024 17:39:52 -0800
Subject: [PATCH 15/16] Update run_infer.py

---
 evaluation/swe_bench/run_infer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index 869e5e74bd1f..d7218bc16fc7 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -70,6 +70,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
         instruction += CODEACT_SWE_PROMPT.format(workspace_dir_name=workspace_dir_name)
     else:
         # Instruction based on Anthropic's official trajectory
+        # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
         instruction = (
             '<uploaded_files>\n'
             f'/workspace/{workspace_dir_name}\n'

From 1033d610b5bce3e18117c3c659d3e54972eb0f2c Mon Sep 17 00:00:00 2001
From: AlexCuadron <alex.cl.2000@gmail.com>
Date: Thu, 7 Nov 2024 18:47:03 -0800
Subject: [PATCH 16/16] fixed based on comments

---
 evaluation/swe_bench/run_infer.py                 | 14 +++++++-------
 openhands/agenthub/codeact_agent/codeact_agent.py | 10 +++-------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index d7218bc16fc7..a171d5f20a56 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -91,13 +91,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
             "Your thinking should be thorough and so it's fine if it's very long.\n"
         )
 
-    instruction += """
-        <IMPORTANT>
-        - You MUST generate only one action per turn!
-        - A patch is a set of changes to the source code of the codebase that you are given
-        - You MUST generate a patch that attempts to fix the issue described in the <pr_description>
-        </IMPORTANT>
-        """
+    instruction += (
+        '<IMPORTANT>\n'
+        '- You MUST generate only one action per turn!\n'
+        '- A patch is a set of changes to the source code of the codebase that you are given\n'
+        '- You MUST generate a patch that attempts to fix the issue described in the <pr_description>\n'
+        '</IMPORTANT>\n'
+    )
 
     if RUN_WITH_BROWSING:
         instruction += (
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 536ae5615613..314a2e0a089b 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -508,12 +508,8 @@ def _get_messages(self, state: State) -> list[Message]:
                 None,
             )
             # do not add this for function calling
-            if not latest_user_message:
-                return messages
-
-            # Build environment reminder text
-            reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
-
-            latest_user_message.content.append(TextContent(text=reminder_text))
+            if latest_user_message:
+                reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
+                latest_user_message.content.append(TextContent(text=reminder_text))
 
         return messages