From 76cdcd19895d8823648894692301eb12b96a2094 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Tue, 15 Oct 2024 17:31:15 +0200 Subject: [PATCH 01/16] Updated tests --- tests/unit/test_runtime_build.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_runtime_build.py b/tests/unit/test_runtime_build.py index a8d215544303..e1e1c18eec94 100644 --- a/tests/unit/test_runtime_build.py +++ b/tests/unit/test_runtime_build.py @@ -259,6 +259,7 @@ def test_build_runtime_image_from_scratch(temp_dir): f'{get_runtime_image_repo()}:{from_scratch_hash}', f'{get_runtime_image_repo()}:{OH_VERSION}_image_debian_tag_11', ], + platform='linux/amd64', # Added platform tag ) assert image_name == f'{get_runtime_image_repo()}:{from_scratch_hash}' @@ -340,6 +341,7 @@ def test_build_runtime_image_exact_hash_not_exist(mock_build_sandbox_image, temp target_image_repo=repo, target_image_hash_tag=from_scratch_hash, target_image_tag=latest_image_tag, + platform='linux/amd64', # Added platform argument ) assert image_name == f'{repo}:{from_scratch_hash}' From 3beaf5c02d97e5d01694899b3f9a1a1c2bbf0571 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 18:59:26 +0200 Subject: [PATCH 02/16] chore(deps): bump litellm from 1.49.3 to 1.49.4 (#4406) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 97457465f380..4fe8048b7fd1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3879,13 +3879,13 @@ types-tqdm = "*" [[package]] name = "litellm" -version = "1.49.3" +version = "1.49.4" description = "Library to easily interface with LLM API providers" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" files = [ - {file = "litellm-1.49.3-py3-none-any.whl", hash = "sha256:300c3c9e1600441f8b6d3afe0fd79c6193f901b2091f3730883ffe3709eebfa2"}, - {file = "litellm-1.49.3.tar.gz", hash = "sha256:e51ce30286894803dcf2949ddb4aab5c2e00809694a48ce6e997953566113c0b"}, + {file = "litellm-1.49.4-py3-none-any.whl", hash = "sha256:3094a9f74979da993f4b3298372ec4416f7a3f82d11a0831c9c616098b3fb50a"}, + {file = "litellm-1.49.4.tar.gz", hash = "sha256:5f16d40bfa7747fcc21f45f340454c57cbc705178244fe7326abac7c0759e05e"}, ] [package.dependencies] From c8db8aaf92d817c4ac7f867c5ee0edbbfe280276 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 19:05:33 +0200 Subject: [PATCH 03/16] chore(deps-dev): bump llama-index from 0.11.17 to 0.11.18 (#4408) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4fe8048b7fd1..73b6b1c394b1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3922,19 +3922,19 @@ pydantic = ">=1.10" [[package]] name = "llama-index" -version = "0.11.17" +version = "0.11.18" description = "Interface between LLMs and your data" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index-0.11.17-py3-none-any.whl", hash = "sha256:85a3d2cd1908181555ae926f880dfae5284b24fda6b866e60969a44302d87350"}, - {file = "llama_index-0.11.17.tar.gz", hash = "sha256:b2176f400b33cd765e86775724d8ad5c6e4812ecdc36f2ca4500edcadc015af4"}, + {file = "llama_index-0.11.18-py3-none-any.whl", hash = "sha256:dc54c7fdd4c8ee32aa0c5565038894295fc76bd95e21e70fa67ca6fb2413a1b3"}, + {file = "llama_index-0.11.18.tar.gz", hash = "sha256:5c43b46ea9957d539ad823e008c9b6957fbaf4ec5c8bc6903accfb19863edfd9"}, ] [package.dependencies] llama-index-agent-openai = ">=0.3.4,<0.4.0" llama-index-cli = ">=0.3.1,<0.4.0" -llama-index-core = ">=0.11.17,<0.12.0" +llama-index-core = ">=0.11.18,<0.12.0" llama-index-embeddings-openai = ">=0.2.4,<0.3.0" llama-index-indices-managed-llama-cloud = ">=0.3.0" llama-index-legacy = ">=0.9.48,<0.10.0" @@ -3980,13 +3980,13 @@ llama-index-llms-openai = ">=0.2.0,<0.3.0" [[package]] name = "llama-index-core" -version = "0.11.17" +version = "0.11.18" description = "Interface between LLMs and your data" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index_core-0.11.17-py3-none-any.whl", hash = "sha256:d65565b54ea55b2db12f9a1cd5c250b770d7e43d3363137cff431a6116ef069c"}, - {file = "llama_index_core-0.11.17.tar.gz", hash = "sha256:1143baf8d819e27555bdb142abdf2833d3d37731f270f46fa1e07fc4b97116ae"}, + {file = "llama_index_core-0.11.18-py3-none-any.whl", hash = "sha256:8e57522e69d3c8a219b29b5f1624c20269c9c3f87729eff9ecfb796eab51dd55"}, + {file = "llama_index_core-0.11.18.tar.gz", hash = "sha256:f94ae8d740b65c3bf0bc0422b0210613664c1a9f8e98b7328e037a68255bed83"}, ] [package.dependencies] From 308dc62546ddad4a70bcd1c86841801b02071ed0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 19:06:40 +0200 Subject: [PATCH 04/16] chore(deps): bump modal from 0.64.181 to 0.64.182 (#4407) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index 73b6b1c394b1..f396e504b4f4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4799,12 +4799,12 @@ type = ["mypy (==1.11.2)"] [[package]] name = "modal" -version = "0.64.181" +version = "0.64.182" description = "Python client library for Modal" optional = false python-versions = ">=3.8" files = [ - {file = "modal-0.64.181-py3-none-any.whl", hash = "sha256:1d8dab39029abd2b11ca44b8401c71407b20e0e9a2e5c673ec6b1476d3b17fa2"}, + {file = "modal-0.64.182-py3-none-any.whl", hash = "sha256:d3213550a0724b13b1dacf8b468d26c78f51d850fd2a76529f180921905bcad3"}, ] [package.dependencies] From 158a9230b0d89eaab448c5d371ab9f3516887550 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 15 Oct 2024 14:31:49 -0500 Subject: [PATCH 05/16] refactor: move get_pairs from memory to shared utils (#4411) --- openhands/events/utils.py | 56 +++++++++++++++++++++++++++++++++++++ openhands/memory/history.py | 54 +++-------------------------------- tests/unit/test_is_stuck.py | 23 +++++++++++++-- 3 files changed, 81 insertions(+), 52 deletions(-) create mode 100644 openhands/events/utils.py diff --git a/openhands/events/utils.py b/openhands/events/utils.py new file mode 100644 index 000000000000..6c8cc415f675 --- /dev/null +++ b/openhands/events/utils.py @@ -0,0 +1,56 @@ +from openhands.core.logger import openhands_logger as logger +from openhands.events.action.action import Action +from openhands.events.action.empty import NullAction +from openhands.events.event import Event +from openhands.events.observation.commands import CmdOutputObservation +from openhands.events.observation.empty import NullObservation +from openhands.events.observation.observation import Observation + + +def get_pairs_from_events(events: list[Event]) -> list[tuple[Action, Observation]]: + """Return the history as a list of tuples (action, observation).""" + tuples: list[tuple[Action, Observation]] = [] + action_map: dict[int, Action] = {} + observation_map: dict[int, Observation] = {} + + # runnable actions are set as cause of observations + # (MessageAction, NullObservation) for source=USER + # (MessageAction, NullObservation) for source=AGENT + # (other_action?, NullObservation) + # (NullAction, CmdOutputObservation) background CmdOutputObservations + + for event in events: + if event.id is None or event.id == -1: + logger.debug(f'Event {event} has no ID') + + if isinstance(event, Action): + action_map[event.id] = event + + if isinstance(event, Observation): + if event.cause is None or event.cause == -1: + logger.debug(f'Observation {event} has no cause') + + if event.cause is None: + # runnable actions are set as cause of observations + # NullObservations have no cause + continue + + observation_map[event.cause] = event + + for action_id, action in action_map.items(): + observation = observation_map.get(action_id) + if observation: + # observation with a cause + tuples.append((action, observation)) + else: + tuples.append((action, NullObservation(''))) + + for cause_id, observation in observation_map.items(): + if cause_id not in action_map: + if isinstance(observation, NullObservation): + continue + if not isinstance(observation, CmdOutputObservation): + logger.debug(f'Observation {observation} has no cause') + tuples.append((NullAction(), observation)) + + return tuples.copy() diff --git a/openhands/memory/history.py b/openhands/memory/history.py index 89e50d67e455..1e4cfb8b5f05 100644 --- a/openhands/memory/history.py +++ b/openhands/memory/history.py @@ -10,12 +10,12 @@ from openhands.events.action.message import MessageAction from openhands.events.event import Event, EventSource from openhands.events.observation.agent import AgentStateChangedObservation -from openhands.events.observation.commands import CmdOutputObservation from openhands.events.observation.delegate import AgentDelegateObservation from openhands.events.observation.empty import NullObservation from openhands.events.observation.observation import Observation from openhands.events.serialization.event import event_to_dict from openhands.events.stream import EventStream +from openhands.events.utils import get_pairs_from_events class ShortTermHistory(list[Event]): @@ -216,55 +216,9 @@ def on_event(self, event: Event): def compatibility_for_eval_history_pairs(self) -> list[tuple[dict, dict]]: history_pairs = [] - for action, observation in self.get_pairs(): + for action, observation in get_pairs_from_events( + self.get_events_as_list(include_delegates=True) + ): history_pairs.append((event_to_dict(action), event_to_dict(observation))) return history_pairs - - def get_pairs(self) -> list[tuple[Action, Observation]]: - """Return the history as a list of tuples (action, observation).""" - tuples: list[tuple[Action, Observation]] = [] - action_map: dict[int, Action] = {} - observation_map: dict[int, Observation] = {} - - # runnable actions are set as cause of observations - # (MessageAction, NullObservation) for source=USER - # (MessageAction, NullObservation) for source=AGENT - # (other_action?, NullObservation) - # (NullAction, CmdOutputObservation) background CmdOutputObservations - - for event in self.get_events_as_list(include_delegates=True): - if event.id is None or event.id == -1: - logger.debug(f'Event {event} has no ID') - - if isinstance(event, Action): - action_map[event.id] = event - - if isinstance(event, Observation): - if event.cause is None or event.cause == -1: - logger.debug(f'Observation {event} has no cause') - - if event.cause is None: - # runnable actions are set as cause of observations - # NullObservations have no cause - continue - - observation_map[event.cause] = event - - for action_id, action in action_map.items(): - observation = observation_map.get(action_id) - if observation: - # observation with a cause - tuples.append((action, observation)) - else: - tuples.append((action, NullObservation(''))) - - for cause_id, observation in observation_map.items(): - if cause_id not in action_map: - if isinstance(observation, NullObservation): - continue - if not isinstance(observation, CmdOutputObservation): - logger.debug(f'Observation {observation} has no cause') - tuples.append((NullAction(), observation)) - - return tuples.copy() diff --git a/tests/unit/test_is_stuck.py b/tests/unit/test_is_stuck.py index 5e23a849286b..4a1330752161 100644 --- a/tests/unit/test_is_stuck.py +++ b/tests/unit/test_is_stuck.py @@ -17,6 +17,7 @@ from openhands.events.observation.empty import NullObservation from openhands.events.observation.error import ErrorObservation from openhands.events.stream import EventSource, EventStream +from openhands.events.utils import get_pairs_from_events from openhands.memory.history import ShortTermHistory from openhands.storage import get_file_store @@ -170,7 +171,16 @@ def test_is_stuck_repeating_action_observation( assert len(collect_events(event_stream)) == 10 assert len(list(stuck_detector.state.history.get_events())) == 8 - assert len(stuck_detector.state.history.get_pairs()) == 5 + assert ( + len( + get_pairs_from_events( + stuck_detector.state.history.get_events_as_list( + include_delegates=True + ) + ) + ) + == 5 + ) assert stuck_detector.is_stuck() is False assert stuck_detector.state.almost_stuck == 1 @@ -186,7 +196,16 @@ def test_is_stuck_repeating_action_observation( assert len(collect_events(event_stream)) == 12 assert len(list(stuck_detector.state.history.get_events())) == 10 - assert len(stuck_detector.state.history.get_pairs()) == 6 + assert ( + len( + get_pairs_from_events( + stuck_detector.state.history.get_events_as_list( + include_delegates=True + ) + ) + ) + == 6 + ) with patch('logging.Logger.warning') as mock_warning: assert stuck_detector.is_stuck() is True From b6a916342736dae8907179290f04d5116f072814 Mon Sep 17 00:00:00 2001 From: mamoodi Date: Tue, 15 Oct 2024 18:45:08 -0400 Subject: [PATCH 06/16] Fix eval output path in case of @ char (#4416) --- evaluation/utils/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index bed679f342a2..d184b5b98037 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -152,7 +152,7 @@ def make_metadata( details: dict[str, Any] | None = None, ) -> EvalMetadata: model_name = llm_config.model.split('/')[-1] - model_path = model_name.replace(':', '_') + model_path = model_name.replace(':', '_').replace('@', '-') eval_note = f'_N_{eval_note}' if eval_note else '' eval_output_path = os.path.join( From 8ba531a012eefe824af8ece6a505c4913dfe48cc Mon Sep 17 00:00:00 2001 From: tofarr Date: Tue, 15 Oct 2024 17:52:21 -0600 Subject: [PATCH 07/16] Fix for lockup - create the runtime in a background thread (#4412) Co-authored-by: Robert Brennan --- openhands/runtime/runtime.py | 4 +- openhands/security/invariant/analyzer.py | 4 +- openhands/server/listen.py | 14 +++--- openhands/server/session/agent_session.py | 9 +++- openhands/utils/async_utils.py | 16 ++++++- tests/unit/test_async_utils.py | 54 +++++++++++++++++------ 6 files changed, 74 insertions(+), 27 deletions(-) diff --git a/openhands/runtime/runtime.py b/openhands/runtime/runtime.py index 7e420643c347..44614ee0a3dc 100644 --- a/openhands/runtime/runtime.py +++ b/openhands/runtime/runtime.py @@ -28,7 +28,7 @@ ) from openhands.events.serialization.action import ACTION_TYPE_TO_CLASS from openhands.runtime.plugins import JupyterRequirement, PluginRequirement -from openhands.utils.async_utils import sync_from_async +from openhands.utils.async_utils import call_sync_from_async def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]: @@ -123,7 +123,7 @@ async def on_event(self, event: Event) -> None: if event.timeout is None: event.timeout = self.config.sandbox.timeout assert event.timeout is not None - observation = await sync_from_async(self.run_action, event) + observation = await call_sync_from_async(self.run_action, event) observation._cause = event.id # type: ignore[attr-defined] source = event.source if event.source else EventSource.AGENT await self.event_stream.async_add_event(observation, source) # type: ignore[arg-type] diff --git a/openhands/security/invariant/analyzer.py b/openhands/security/invariant/analyzer.py index 275888bb4197..9d8b280716a7 100644 --- a/openhands/security/invariant/analyzer.py +++ b/openhands/security/invariant/analyzer.py @@ -19,7 +19,7 @@ from openhands.security.analyzer import SecurityAnalyzer from openhands.security.invariant.client import InvariantClient from openhands.security.invariant.parser import TraceElement, parse_element -from openhands.utils.async_utils import sync_from_async +from openhands.utils.async_utils import call_sync_from_async class InvariantAnalyzer(SecurityAnalyzer): @@ -146,7 +146,7 @@ async def confirm(self, event: Event) -> None: {'action': 'change_agent_state', 'args': {'agent_state': 'user_confirmed'}} ) event_source = event.source if event.source else EventSource.AGENT - await sync_from_async(self.event_stream.add_event, new_event, event_source) + await call_sync_from_async(self.event_stream.add_event, new_event, event_source) async def security_risk(self, event: Action) -> ActionSecurityRisk: logger.info('Calling security_risk on InvariantAnalyzer') diff --git a/openhands/server/listen.py b/openhands/server/listen.py index d7f177734971..32c93a117e23 100644 --- a/openhands/server/listen.py +++ b/openhands/server/listen.py @@ -14,7 +14,7 @@ from openhands.security.options import SecurityAnalyzers from openhands.server.data_models.feedback import FeedbackDataModel, store_feedback from openhands.storage import get_file_store -from openhands.utils.async_utils import sync_from_async +from openhands.utils.async_utils import call_sync_from_async with warnings.catch_warnings(): warnings.simplefilter('ignore') @@ -211,8 +211,8 @@ async def attach_session(request: Request, call_next): content={'error': 'Invalid token'}, ) - request.state.conversation = session_manager.attach_to_conversation( - request.state.sid + request.state.conversation = await call_sync_from_async( + session_manager.attach_to_conversation, request.state.sid ) if request.state.conversation is None: return JSONResponse( @@ -441,7 +441,9 @@ async def list_files(request: Request, path: str | None = None): ) runtime: Runtime = request.state.conversation.runtime - file_list = await sync_from_async(runtime.list_files, path) + file_list = await asyncio.create_task( + call_sync_from_async(runtime.list_files, path) + ) if path: file_list = [os.path.join(path, f) for f in file_list] @@ -490,7 +492,7 @@ async def select_file(file: str, request: Request): file = os.path.join(runtime.config.workspace_mount_path_in_sandbox, file) read_action = FileReadAction(file) - observation = await sync_from_async(runtime.run_action, read_action) + observation = await call_sync_from_async(runtime.run_action, read_action) if isinstance(observation, FileReadObservation): content = observation.content @@ -687,7 +689,7 @@ async def save_file(request: Request): runtime.config.workspace_mount_path_in_sandbox, file_path ) write_action = FileWriteAction(file_path, content) - observation = await sync_from_async(runtime.run_action, write_action) + observation = await call_sync_from_async(runtime.run_action, write_action) if isinstance(observation, FileWriteObservation): return JSONResponse( diff --git a/openhands/server/session/agent_session.py b/openhands/server/session/agent_session.py index f172021a37d2..6bc442ac731a 100644 --- a/openhands/server/session/agent_session.py +++ b/openhands/server/session/agent_session.py @@ -14,6 +14,7 @@ from openhands.runtime.runtime import Runtime from openhands.security import SecurityAnalyzer, options from openhands.storage.files import FileStore +from openhands.utils.async_utils import call_sync_from_async class AgentSession: @@ -102,7 +103,13 @@ async def _start( ): self.loop = asyncio.get_running_loop() self._create_security_analyzer(config.security.security_analyzer) - self._create_runtime(runtime_name, config, agent, status_message_callback) + await call_sync_from_async( + self._create_runtime, + runtime_name=runtime_name, + config=config, + agent=agent, + status_message_callback=status_message_callback, + ) self._create_controller( agent, config.security.confirmation_mode, diff --git a/openhands/utils/async_utils.py b/openhands/utils/async_utils.py index 7da8d05ff5c6..2a3b73f5da7d 100644 --- a/openhands/utils/async_utils.py +++ b/openhands/utils/async_utils.py @@ -7,7 +7,7 @@ EXECUTOR = ThreadPoolExecutor() -async def sync_from_async(fn: Callable, *args, **kwargs): +async def call_sync_from_async(fn: Callable, *args, **kwargs): """ Shorthand for running a function in the default background thread pool executor and awaiting the result. The nature of synchronous code is that the future @@ -19,7 +19,7 @@ async def sync_from_async(fn: Callable, *args, **kwargs): return result -def async_from_sync( +def call_async_from_sync( corofn: Callable, timeout: float = GENERAL_TIMEOUT, *args, **kwargs ): """ @@ -27,6 +27,11 @@ def async_from_sync( and awaiting the result """ + if corofn is None: + raise ValueError('corofn is None') + if not asyncio.iscoroutinefunction(corofn): + raise ValueError('corofn is not a coroutine function') + async def arun(): coro = corofn(*args, **kwargs) result = await coro @@ -46,6 +51,13 @@ def run(): return result +async def call_coro_in_bg_thread( + corofn: Callable, timeout: float = GENERAL_TIMEOUT, *args, **kwargs +): + """Function for running a coroutine in a background thread.""" + await call_sync_from_async(call_async_from_sync, corofn, timeout, *args, **kwargs) + + async def wait_all( iterable: Iterable[Coroutine], timeout: int = GENERAL_TIMEOUT ) -> List: diff --git a/tests/unit/test_async_utils.py b/tests/unit/test_async_utils.py index 89dd1e0f6915..3dc99438968e 100644 --- a/tests/unit/test_async_utils.py +++ b/tests/unit/test_async_utils.py @@ -1,11 +1,13 @@ import asyncio +import time import pytest from openhands.utils.async_utils import ( AsyncException, - async_from_sync, - sync_from_async, + call_async_from_sync, + call_coro_in_bg_thread, + call_sync_from_async, wait_all, ) @@ -80,44 +82,44 @@ async def dummy(value: int): @pytest.mark.asyncio -async def test_sync_from_async(): +async def test_call_sync_from_async(): def dummy(value: int = 2): return value * 2 - result = await sync_from_async(dummy) + result = await call_sync_from_async(dummy) assert result == 4 - result = await sync_from_async(dummy, 3) + result = await call_sync_from_async(dummy, 3) assert result == 6 - result = await sync_from_async(dummy, value=5) + result = await call_sync_from_async(dummy, value=5) assert result == 10 @pytest.mark.asyncio -async def test_sync_from_async_error(): +async def test_call_sync_from_async_error(): def dummy(): raise ValueError() with pytest.raises(ValueError): - await sync_from_async(dummy) + await call_sync_from_async(dummy) -def test_async_from_sync(): +def test_call_async_from_sync(): async def dummy(value: int): return value * 2 - result = async_from_sync(dummy, 0, 3) + result = call_async_from_sync(dummy, 0, 3) assert result == 6 -def test_async_from_sync_error(): +def test_call_async_from_sync_error(): async def dummy(value: int): raise ValueError() with pytest.raises(ValueError): - async_from_sync(dummy, 0, 3) + call_async_from_sync(dummy, 0, 3) -def test_async_from_sync_background_tasks(): +def test_call_async_from_sync_background_tasks(): events = [] async def bg_task(): @@ -132,9 +134,33 @@ async def dummy(value: int): asyncio.create_task(bg_task()) events.append('dummy_started') - async_from_sync(dummy, 0, 3) + call_async_from_sync(dummy, 0, 3) # We check that the function did not return until all coroutines completed # (Even though some of these were started as background tasks) expected = ['dummy_started', 'dummy_started', 'bg_started', 'bg_finished'] assert expected == events + + +@pytest.mark.asyncio +async def test_call_coro_in_bg_thread(): + times = {} + + async def bad_async(id_): + # Dummy demonstrating some bad async function that does not cede control + time.sleep(0.1) + times[id_] = time.time() + + async def curve_ball(): + # A curve ball - an async function that wants to run while the bad async functions are in progress + await asyncio.sleep(0.05) + times['curve_ball'] = time.time() + + start = time.time() + asyncio.create_task(curve_ball()) + await wait_all( + call_coro_in_bg_thread(bad_async, id_=f'bad_async_{id_}') for id_ in range(5) + ) + assert (times['curve_ball'] - start) == pytest.approx(0.05, abs=0.1) + for id_ in range(5): + assert (times[f'bad_async_{id_}'] - start) == pytest.approx(0.1, abs=0.1) From 064e4ad5506bc08b5c219141b5a507520fb2ed36 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 6 Nov 2024 15:35:08 -0800 Subject: [PATCH 08/16] prompt engineering to remind o1 to generate a patch --- openhands/agenthub/codeact_agent/codeact_agent.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index d1f67eae9c2c..41c520c560ce 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -495,7 +495,16 @@ def _get_messages(self, state: State) -> list[Message]: ) # do not add this for function calling if latest_user_message: - reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with .' + reminder_text = ( + '\n\nENVIRONMENT REMINDER:\n' + f'- You have {state.max_iterations - state.iteration} turns left to complete the task\n' + '- When finished reply with \n' + '\n\n' + '- You MUST generate only one action per turn!\n' + '- A patch is a set of changes to the source code of the codebase that you are given\n' + '- You MUST generate a patch that attempts to fix the issue described in the \n' + '\n' + ) latest_user_message.content.append(TextContent(text=reminder_text)) return messages From aaba2d42fcabc4613c11d2df9c7cee1faefae21d Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Thu, 7 Nov 2024 05:08:15 +0000 Subject: [PATCH 09/16] Make SWE Bench specific instructions conditional based on environment variable 1. Added SWE_BENCH_RUN environment variable in run_infer.py 2. Modified codeact_agent.py to only show SWE Bench specific instructions when SWE_BENCH_RUN is true --- evaluation/swe_bench/run_infer.py | 3 +++ .../agenthub/codeact_agent/codeact_agent.py | 16 +++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 8bd23939dd58..a28670cc9449 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -4,6 +4,9 @@ import tempfile from typing import Any +# Set environment variable to indicate SWE Bench context +os.environ['SWE_BENCH_RUN'] = 'true' + import pandas as pd import toml from datasets import load_dataset diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 484cedd53e2f..cc644e7e6571 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -513,12 +513,18 @@ def _get_messages(self, state: State) -> list[Message]: '\n\nENVIRONMENT REMINDER:\n' f'- You have {state.max_iterations - state.iteration} turns left to complete the task\n' '- When finished reply with \n' - '\n\n' - '- You MUST generate only one action per turn!\n' - '- A patch is a set of changes to the source code of the codebase that you are given\n' - '- You MUST generate a patch that attempts to fix the issue described in the \n' - '\n' ) + + # Add SWE Bench specific instructions only when running in that context + if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true': + reminder_text += ( + '\n\n' + '- You MUST generate only one action per turn!\n' + '- A patch is a set of changes to the source code of the codebase that you are given\n' + '- You MUST generate a patch that attempts to fix the issue described in the \n' + '\n' + ) + latest_user_message.content.append(TextContent(text=reminder_text)) return messages From b19fdf1a3730c49a5f8b869fa3b3111a260ffe51 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 6 Nov 2024 21:13:04 -0800 Subject: [PATCH 10/16] fix --- evaluation/swe_bench/run_infer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index a28670cc9449..8bab6e7f78c4 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -4,9 +4,6 @@ import tempfile from typing import Any -# Set environment variable to indicate SWE Bench context -os.environ['SWE_BENCH_RUN'] = 'true' - import pandas as pd import toml from datasets import load_dataset @@ -45,6 +42,9 @@ USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true' USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true' RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true' +os.environ['SWE_BENCH_RUN'] = ( + 'true' # Set environment variable to indicate SWE Bench context +) AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, From f4b70661804f625023ab64039dcadf2e7b3220c4 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 6 Nov 2024 21:15:58 -0800 Subject: [PATCH 11/16] fix --- .../agenthub/codeact_agent/codeact_agent.py | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index cc644e7e6571..8a6c9769d428 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -508,23 +508,28 @@ def _get_messages(self, state: State) -> list[Message]: None, ) # do not add this for function calling - if latest_user_message: - reminder_text = ( - '\n\nENVIRONMENT REMINDER:\n' - f'- You have {state.max_iterations - state.iteration} turns left to complete the task\n' - '- When finished reply with \n' - ) - - # Add SWE Bench specific instructions only when running in that context - if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true': - reminder_text += ( - '\n\n' - '- You MUST generate only one action per turn!\n' - '- A patch is a set of changes to the source code of the codebase that you are given\n' - '- You MUST generate a patch that attempts to fix the issue described in the \n' - '\n' - ) - - latest_user_message.content.append(TextContent(text=reminder_text)) + if not latest_user_message: + return messages + + # Build environment reminder text + reminder_parts = [ + '\n\nENVIRONMENT REMINDER:', + f'- You have {state.max_iterations - state.iteration} turns left to complete the task', + '- When finished reply with \n', + ] + reminder_text = '\n'.join(reminder_parts) + + # Add SWE Bench specific instructions if needed + if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true': + swe_bench_parts = [ + '\n', + '- You MUST generate only one action per turn!', + '- A patch is a set of changes to the source code of the codebase that you are given', + '- You MUST generate a patch that attempts to fix the issue described in the ', + '\n', + ] + reminder_text += '\n'.join(swe_bench_parts) + + latest_user_message.content.append(TextContent(text=reminder_text)) return messages From af947b0fa8b05c12642248ac33e753a0bac43849 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Wed, 6 Nov 2024 21:36:40 -0800 Subject: [PATCH 12/16] fix --- openhands/agenthub/codeact_agent/codeact_agent.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 8a6c9769d428..40235d277247 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -512,12 +512,7 @@ def _get_messages(self, state: State) -> list[Message]: return messages # Build environment reminder text - reminder_parts = [ - '\n\nENVIRONMENT REMINDER:', - f'- You have {state.max_iterations - state.iteration} turns left to complete the task', - '- When finished reply with \n', - ] - reminder_text = '\n'.join(reminder_parts) + reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with .' # Add SWE Bench specific instructions if needed if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true': From 9cf49272e95815b735cb77c136a7a5e3b788b26f Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Thu, 7 Nov 2024 17:36:07 -0800 Subject: [PATCH 13/16] fixed according to comments --- evaluation/swe_bench/run_infer.py | 9 ++++++++- openhands/agenthub/codeact_agent/codeact_agent.py | 11 ----------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 8bab6e7f78c4..c6db373b5e09 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -73,7 +73,6 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata): instruction += CODEACT_SWE_PROMPT.format(workspace_dir_name=workspace_dir_name) else: # Instruction based on Anthropic's official trajectory - # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs instruction = ( '\n' f'/workspace/{workspace_dir_name}\n' @@ -94,6 +93,14 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata): "Your thinking should be thorough and so it's fine if it's very long.\n" ) + instruction += """ + + - You MUST generate only one action per turn! + - A patch is a set of changes to the source code of the codebase that you are given + - You MUST generate a patch that attempts to fix the issue described in the + + """ + if RUN_WITH_BROWSING: instruction += ( '\n' diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 40235d277247..536ae5615613 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -514,17 +514,6 @@ def _get_messages(self, state: State) -> list[Message]: # Build environment reminder text reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with .' - # Add SWE Bench specific instructions if needed - if os.environ.get('SWE_BENCH_RUN', 'false').lower() == 'true': - swe_bench_parts = [ - '\n', - '- You MUST generate only one action per turn!', - '- A patch is a set of changes to the source code of the codebase that you are given', - '- You MUST generate a patch that attempts to fix the issue described in the ', - '\n', - ] - reminder_text += '\n'.join(swe_bench_parts) - latest_user_message.content.append(TextContent(text=reminder_text)) return messages From bc9f635afc961282b3a95469e0cea9fff8e1fe81 Mon Sep 17 00:00:00 2001 From: Alejandro Cuadron Lafuente Date: Thu, 7 Nov 2024 17:39:23 -0800 Subject: [PATCH 14/16] Update run_infer.py --- evaluation/swe_bench/run_infer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index c6db373b5e09..869e5e74bd1f 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -42,9 +42,6 @@ USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true' USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true' RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true' -os.environ['SWE_BENCH_RUN'] = ( - 'true' # Set environment variable to indicate SWE Bench context -) AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, From deb54f72843bcef0a5e75453ec72b1180ac23fd2 Mon Sep 17 00:00:00 2001 From: Alejandro Cuadron Lafuente Date: Thu, 7 Nov 2024 17:39:52 -0800 Subject: [PATCH 15/16] Update run_infer.py --- evaluation/swe_bench/run_infer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 869e5e74bd1f..d7218bc16fc7 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -70,6 +70,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata): instruction += CODEACT_SWE_PROMPT.format(workspace_dir_name=workspace_dir_name) else: # Instruction based on Anthropic's official trajectory + # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs instruction = ( '\n' f'/workspace/{workspace_dir_name}\n' From 1033d610b5bce3e18117c3c659d3e54972eb0f2c Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Thu, 7 Nov 2024 18:47:03 -0800 Subject: [PATCH 16/16] fixed based on comments --- evaluation/swe_bench/run_infer.py | 14 +++++++------- openhands/agenthub/codeact_agent/codeact_agent.py | 10 +++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index d7218bc16fc7..a171d5f20a56 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -91,13 +91,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata): "Your thinking should be thorough and so it's fine if it's very long.\n" ) - instruction += """ - - - You MUST generate only one action per turn! - - A patch is a set of changes to the source code of the codebase that you are given - - You MUST generate a patch that attempts to fix the issue described in the - - """ + instruction += ( + '\n' + '- You MUST generate only one action per turn!\n' + '- A patch is a set of changes to the source code of the codebase that you are given\n' + '- You MUST generate a patch that attempts to fix the issue described in the \n' + '\n' + ) if RUN_WITH_BROWSING: instruction += ( diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 536ae5615613..314a2e0a089b 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -508,12 +508,8 @@ def _get_messages(self, state: State) -> list[Message]: None, ) # do not add this for function calling - if not latest_user_message: - return messages - - # Build environment reminder text - reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with .' - - latest_user_message.content.append(TextContent(text=reminder_text)) + if latest_user_message: + reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with .' + latest_user_message.content.append(TextContent(text=reminder_text)) return messages