From 76cdcd19895d8823648894692301eb12b96a2094 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Tue, 15 Oct 2024 17:31:15 +0200 Subject: [PATCH 01/18] Updated tests --- tests/unit/test_runtime_build.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_runtime_build.py b/tests/unit/test_runtime_build.py index a8d215544303..e1e1c18eec94 100644 --- a/tests/unit/test_runtime_build.py +++ b/tests/unit/test_runtime_build.py @@ -259,6 +259,7 @@ def test_build_runtime_image_from_scratch(temp_dir): f'{get_runtime_image_repo()}:{from_scratch_hash}', f'{get_runtime_image_repo()}:{OH_VERSION}_image_debian_tag_11', ], + platform='linux/amd64', # Added platform tag ) assert image_name == f'{get_runtime_image_repo()}:{from_scratch_hash}' @@ -340,6 +341,7 @@ def test_build_runtime_image_exact_hash_not_exist(mock_build_sandbox_image, temp target_image_repo=repo, target_image_hash_tag=from_scratch_hash, target_image_tag=latest_image_tag, + platform='linux/amd64', # Added platform argument ) assert image_name == f'{repo}:{from_scratch_hash}' From 3beaf5c02d97e5d01694899b3f9a1a1c2bbf0571 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 18:59:26 +0200 Subject: [PATCH 02/18] chore(deps): bump litellm from 1.49.3 to 1.49.4 (#4406) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 97457465f380..4fe8048b7fd1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3879,13 +3879,13 @@ types-tqdm = "*" [[package]] name = "litellm" -version = "1.49.3" +version = "1.49.4" description = "Library to easily interface with LLM API providers" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" files = [ - {file = "litellm-1.49.3-py3-none-any.whl", hash = "sha256:300c3c9e1600441f8b6d3afe0fd79c6193f901b2091f3730883ffe3709eebfa2"}, - {file = "litellm-1.49.3.tar.gz", hash = "sha256:e51ce30286894803dcf2949ddb4aab5c2e00809694a48ce6e997953566113c0b"}, + {file = "litellm-1.49.4-py3-none-any.whl", hash = "sha256:3094a9f74979da993f4b3298372ec4416f7a3f82d11a0831c9c616098b3fb50a"}, + {file = "litellm-1.49.4.tar.gz", hash = "sha256:5f16d40bfa7747fcc21f45f340454c57cbc705178244fe7326abac7c0759e05e"}, ] [package.dependencies] From c8db8aaf92d817c4ac7f867c5ee0edbbfe280276 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 19:05:33 +0200 Subject: [PATCH 03/18] chore(deps-dev): bump llama-index from 0.11.17 to 0.11.18 (#4408) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4fe8048b7fd1..73b6b1c394b1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3922,19 +3922,19 @@ pydantic = ">=1.10" [[package]] name = "llama-index" -version = "0.11.17" +version = "0.11.18" description = "Interface between LLMs and your data" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index-0.11.17-py3-none-any.whl", hash = "sha256:85a3d2cd1908181555ae926f880dfae5284b24fda6b866e60969a44302d87350"}, - {file = "llama_index-0.11.17.tar.gz", hash = "sha256:b2176f400b33cd765e86775724d8ad5c6e4812ecdc36f2ca4500edcadc015af4"}, + {file = 
"llama_index-0.11.18-py3-none-any.whl", hash = "sha256:dc54c7fdd4c8ee32aa0c5565038894295fc76bd95e21e70fa67ca6fb2413a1b3"}, + {file = "llama_index-0.11.18.tar.gz", hash = "sha256:5c43b46ea9957d539ad823e008c9b6957fbaf4ec5c8bc6903accfb19863edfd9"}, ] [package.dependencies] llama-index-agent-openai = ">=0.3.4,<0.4.0" llama-index-cli = ">=0.3.1,<0.4.0" -llama-index-core = ">=0.11.17,<0.12.0" +llama-index-core = ">=0.11.18,<0.12.0" llama-index-embeddings-openai = ">=0.2.4,<0.3.0" llama-index-indices-managed-llama-cloud = ">=0.3.0" llama-index-legacy = ">=0.9.48,<0.10.0" @@ -3980,13 +3980,13 @@ llama-index-llms-openai = ">=0.2.0,<0.3.0" [[package]] name = "llama-index-core" -version = "0.11.17" +version = "0.11.18" description = "Interface between LLMs and your data" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "llama_index_core-0.11.17-py3-none-any.whl", hash = "sha256:d65565b54ea55b2db12f9a1cd5c250b770d7e43d3363137cff431a6116ef069c"}, - {file = "llama_index_core-0.11.17.tar.gz", hash = "sha256:1143baf8d819e27555bdb142abdf2833d3d37731f270f46fa1e07fc4b97116ae"}, + {file = "llama_index_core-0.11.18-py3-none-any.whl", hash = "sha256:8e57522e69d3c8a219b29b5f1624c20269c9c3f87729eff9ecfb796eab51dd55"}, + {file = "llama_index_core-0.11.18.tar.gz", hash = "sha256:f94ae8d740b65c3bf0bc0422b0210613664c1a9f8e98b7328e037a68255bed83"}, ] [package.dependencies] From 308dc62546ddad4a70bcd1c86841801b02071ed0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 19:06:40 +0200 Subject: [PATCH 04/18] chore(deps): bump modal from 0.64.181 to 0.64.182 (#4407) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- poetry.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index 73b6b1c394b1..f396e504b4f4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4799,12 +4799,12 @@ type = ["mypy (==1.11.2)"] [[package]] name = "modal" -version = "0.64.181" +version = "0.64.182" description = "Python client library for Modal" optional = false python-versions = ">=3.8" files = [ - {file = "modal-0.64.181-py3-none-any.whl", hash = "sha256:1d8dab39029abd2b11ca44b8401c71407b20e0e9a2e5c673ec6b1476d3b17fa2"}, + {file = "modal-0.64.182-py3-none-any.whl", hash = "sha256:d3213550a0724b13b1dacf8b468d26c78f51d850fd2a76529f180921905bcad3"}, ] [package.dependencies] From 158a9230b0d89eaab448c5d371ab9f3516887550 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 15 Oct 2024 14:31:49 -0500 Subject: [PATCH 05/18] refactor: move get_pairs from memory to shared utils (#4411) --- openhands/events/utils.py | 56 +++++++++++++++++++++++++++++++++++++ openhands/memory/history.py | 54 +++-------------------------------- tests/unit/test_is_stuck.py | 23 +++++++++++++-- 3 files changed, 81 insertions(+), 52 deletions(-) create mode 100644 openhands/events/utils.py diff --git a/openhands/events/utils.py b/openhands/events/utils.py new file mode 100644 index 000000000000..6c8cc415f675 --- /dev/null +++ b/openhands/events/utils.py @@ -0,0 +1,56 @@ +from openhands.core.logger import openhands_logger as logger +from openhands.events.action.action import Action +from openhands.events.action.empty import NullAction +from openhands.events.event import Event +from openhands.events.observation.commands import CmdOutputObservation +from openhands.events.observation.empty import NullObservation +from openhands.events.observation.observation import Observation 
+ + +def get_pairs_from_events(events: list[Event]) -> list[tuple[Action, Observation]]: + """Return the history as a list of tuples (action, observation).""" + tuples: list[tuple[Action, Observation]] = [] + action_map: dict[int, Action] = {} + observation_map: dict[int, Observation] = {} + + # runnable actions are set as cause of observations + # (MessageAction, NullObservation) for source=USER + # (MessageAction, NullObservation) for source=AGENT + # (other_action?, NullObservation) + # (NullAction, CmdOutputObservation) background CmdOutputObservations + + for event in events: + if event.id is None or event.id == -1: + logger.debug(f'Event {event} has no ID') + + if isinstance(event, Action): + action_map[event.id] = event + + if isinstance(event, Observation): + if event.cause is None or event.cause == -1: + logger.debug(f'Observation {event} has no cause') + + if event.cause is None: + # runnable actions are set as cause of observations + # NullObservations have no cause + continue + + observation_map[event.cause] = event + + for action_id, action in action_map.items(): + observation = observation_map.get(action_id) + if observation: + # observation with a cause + tuples.append((action, observation)) + else: + tuples.append((action, NullObservation(''))) + + for cause_id, observation in observation_map.items(): + if cause_id not in action_map: + if isinstance(observation, NullObservation): + continue + if not isinstance(observation, CmdOutputObservation): + logger.debug(f'Observation {observation} has no cause') + tuples.append((NullAction(), observation)) + + return tuples.copy() diff --git a/openhands/memory/history.py b/openhands/memory/history.py index 89e50d67e455..1e4cfb8b5f05 100644 --- a/openhands/memory/history.py +++ b/openhands/memory/history.py @@ -10,12 +10,12 @@ from openhands.events.action.message import MessageAction from openhands.events.event import Event, EventSource from openhands.events.observation.agent import AgentStateChangedObservation -from openhands.events.observation.commands import CmdOutputObservation from openhands.events.observation.delegate import AgentDelegateObservation from openhands.events.observation.empty import NullObservation from openhands.events.observation.observation import Observation from openhands.events.serialization.event import event_to_dict from openhands.events.stream import EventStream +from openhands.events.utils import get_pairs_from_events class ShortTermHistory(list[Event]): @@ -216,55 +216,9 @@ def on_event(self, event: Event): def compatibility_for_eval_history_pairs(self) -> list[tuple[dict, dict]]: history_pairs = [] - for action, observation in self.get_pairs(): + for action, observation in get_pairs_from_events( + self.get_events_as_list(include_delegates=True) + ): history_pairs.append((event_to_dict(action), event_to_dict(observation))) return history_pairs - - def get_pairs(self) -> list[tuple[Action, Observation]]: - """Return the history as a list of tuples (action, observation).""" - tuples: list[tuple[Action, Observation]] = [] - action_map: dict[int, Action] = {} - observation_map: dict[int, Observation] = {} - - # runnable actions are set as cause of observations - # (MessageAction, NullObservation) for source=USER - # (MessageAction, NullObservation) for source=AGENT - # (other_action?, NullObservation) - # (NullAction, CmdOutputObservation) background CmdOutputObservations - - for event in self.get_events_as_list(include_delegates=True): - if event.id is None or event.id == -1: - logger.debug(f'Event {event} 
has no ID') - - if isinstance(event, Action): - action_map[event.id] = event - - if isinstance(event, Observation): - if event.cause is None or event.cause == -1: - logger.debug(f'Observation {event} has no cause') - - if event.cause is None: - # runnable actions are set as cause of observations - # NullObservations have no cause - continue - - observation_map[event.cause] = event - - for action_id, action in action_map.items(): - observation = observation_map.get(action_id) - if observation: - # observation with a cause - tuples.append((action, observation)) - else: - tuples.append((action, NullObservation(''))) - - for cause_id, observation in observation_map.items(): - if cause_id not in action_map: - if isinstance(observation, NullObservation): - continue - if not isinstance(observation, CmdOutputObservation): - logger.debug(f'Observation {observation} has no cause') - tuples.append((NullAction(), observation)) - - return tuples.copy() diff --git a/tests/unit/test_is_stuck.py b/tests/unit/test_is_stuck.py index 5e23a849286b..4a1330752161 100644 --- a/tests/unit/test_is_stuck.py +++ b/tests/unit/test_is_stuck.py @@ -17,6 +17,7 @@ from openhands.events.observation.empty import NullObservation from openhands.events.observation.error import ErrorObservation from openhands.events.stream import EventSource, EventStream +from openhands.events.utils import get_pairs_from_events from openhands.memory.history import ShortTermHistory from openhands.storage import get_file_store @@ -170,7 +171,16 @@ def test_is_stuck_repeating_action_observation( assert len(collect_events(event_stream)) == 10 assert len(list(stuck_detector.state.history.get_events())) == 8 - assert len(stuck_detector.state.history.get_pairs()) == 5 + assert ( + len( + get_pairs_from_events( + stuck_detector.state.history.get_events_as_list( + include_delegates=True + ) + ) + ) + == 5 + ) assert stuck_detector.is_stuck() is False assert stuck_detector.state.almost_stuck == 1 @@ -186,7 +196,16 @@ def test_is_stuck_repeating_action_observation( assert len(collect_events(event_stream)) == 12 assert len(list(stuck_detector.state.history.get_events())) == 10 - assert len(stuck_detector.state.history.get_pairs()) == 6 + assert ( + len( + get_pairs_from_events( + stuck_detector.state.history.get_events_as_list( + include_delegates=True + ) + ) + ) + == 6 + ) with patch('logging.Logger.warning') as mock_warning: assert stuck_detector.is_stuck() is True From b6a916342736dae8907179290f04d5116f072814 Mon Sep 17 00:00:00 2001 From: mamoodi Date: Tue, 15 Oct 2024 18:45:08 -0400 Subject: [PATCH 06/18] Fix eval output path in case of @ char (#4416) --- evaluation/utils/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index bed679f342a2..d184b5b98037 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -152,7 +152,7 @@ def make_metadata( details: dict[str, Any] | None = None, ) -> EvalMetadata: model_name = llm_config.model.split('/')[-1] - model_path = model_name.replace(':', '_') + model_path = model_name.replace(':', '_').replace('@', '-') eval_note = f'_N_{eval_note}' if eval_note else '' eval_output_path = os.path.join( From 8ba531a012eefe824af8ece6a505c4913dfe48cc Mon Sep 17 00:00:00 2001 From: tofarr Date: Tue, 15 Oct 2024 17:52:21 -0600 Subject: [PATCH 07/18] Fix for lockup - create the runtime in a background thread (#4412) Co-authored-by: Robert Brennan --- openhands/runtime/runtime.py | 4 +- openhands/security/invariant/analyzer.py 
| 4 +- openhands/server/listen.py | 14 +++--- openhands/server/session/agent_session.py | 9 +++- openhands/utils/async_utils.py | 16 ++++++- tests/unit/test_async_utils.py | 54 +++++++++++++++++------ 6 files changed, 74 insertions(+), 27 deletions(-) diff --git a/openhands/runtime/runtime.py b/openhands/runtime/runtime.py index 7e420643c347..44614ee0a3dc 100644 --- a/openhands/runtime/runtime.py +++ b/openhands/runtime/runtime.py @@ -28,7 +28,7 @@ ) from openhands.events.serialization.action import ACTION_TYPE_TO_CLASS from openhands.runtime.plugins import JupyterRequirement, PluginRequirement -from openhands.utils.async_utils import sync_from_async +from openhands.utils.async_utils import call_sync_from_async def _default_env_vars(sandbox_config: SandboxConfig) -> dict[str, str]: @@ -123,7 +123,7 @@ async def on_event(self, event: Event) -> None: if event.timeout is None: event.timeout = self.config.sandbox.timeout assert event.timeout is not None - observation = await sync_from_async(self.run_action, event) + observation = await call_sync_from_async(self.run_action, event) observation._cause = event.id # type: ignore[attr-defined] source = event.source if event.source else EventSource.AGENT await self.event_stream.async_add_event(observation, source) # type: ignore[arg-type] diff --git a/openhands/security/invariant/analyzer.py b/openhands/security/invariant/analyzer.py index 275888bb4197..9d8b280716a7 100644 --- a/openhands/security/invariant/analyzer.py +++ b/openhands/security/invariant/analyzer.py @@ -19,7 +19,7 @@ from openhands.security.analyzer import SecurityAnalyzer from openhands.security.invariant.client import InvariantClient from openhands.security.invariant.parser import TraceElement, parse_element -from openhands.utils.async_utils import sync_from_async +from openhands.utils.async_utils import call_sync_from_async class InvariantAnalyzer(SecurityAnalyzer): @@ -146,7 +146,7 @@ async def confirm(self, event: Event) -> None: {'action': 'change_agent_state', 'args': {'agent_state': 'user_confirmed'}} ) event_source = event.source if event.source else EventSource.AGENT - await sync_from_async(self.event_stream.add_event, new_event, event_source) + await call_sync_from_async(self.event_stream.add_event, new_event, event_source) async def security_risk(self, event: Action) -> ActionSecurityRisk: logger.info('Calling security_risk on InvariantAnalyzer') diff --git a/openhands/server/listen.py b/openhands/server/listen.py index d7f177734971..32c93a117e23 100644 --- a/openhands/server/listen.py +++ b/openhands/server/listen.py @@ -14,7 +14,7 @@ from openhands.security.options import SecurityAnalyzers from openhands.server.data_models.feedback import FeedbackDataModel, store_feedback from openhands.storage import get_file_store -from openhands.utils.async_utils import sync_from_async +from openhands.utils.async_utils import call_sync_from_async with warnings.catch_warnings(): warnings.simplefilter('ignore') @@ -211,8 +211,8 @@ async def attach_session(request: Request, call_next): content={'error': 'Invalid token'}, ) - request.state.conversation = session_manager.attach_to_conversation( - request.state.sid + request.state.conversation = await call_sync_from_async( + session_manager.attach_to_conversation, request.state.sid ) if request.state.conversation is None: return JSONResponse( @@ -441,7 +441,9 @@ async def list_files(request: Request, path: str | None = None): ) runtime: Runtime = request.state.conversation.runtime - file_list = await sync_from_async(runtime.list_files, 
path) + file_list = await asyncio.create_task( + call_sync_from_async(runtime.list_files, path) + ) if path: file_list = [os.path.join(path, f) for f in file_list] @@ -490,7 +492,7 @@ async def select_file(file: str, request: Request): file = os.path.join(runtime.config.workspace_mount_path_in_sandbox, file) read_action = FileReadAction(file) - observation = await sync_from_async(runtime.run_action, read_action) + observation = await call_sync_from_async(runtime.run_action, read_action) if isinstance(observation, FileReadObservation): content = observation.content @@ -687,7 +689,7 @@ async def save_file(request: Request): runtime.config.workspace_mount_path_in_sandbox, file_path ) write_action = FileWriteAction(file_path, content) - observation = await sync_from_async(runtime.run_action, write_action) + observation = await call_sync_from_async(runtime.run_action, write_action) if isinstance(observation, FileWriteObservation): return JSONResponse( diff --git a/openhands/server/session/agent_session.py b/openhands/server/session/agent_session.py index f172021a37d2..6bc442ac731a 100644 --- a/openhands/server/session/agent_session.py +++ b/openhands/server/session/agent_session.py @@ -14,6 +14,7 @@ from openhands.runtime.runtime import Runtime from openhands.security import SecurityAnalyzer, options from openhands.storage.files import FileStore +from openhands.utils.async_utils import call_sync_from_async class AgentSession: @@ -102,7 +103,13 @@ async def _start( ): self.loop = asyncio.get_running_loop() self._create_security_analyzer(config.security.security_analyzer) - self._create_runtime(runtime_name, config, agent, status_message_callback) + await call_sync_from_async( + self._create_runtime, + runtime_name=runtime_name, + config=config, + agent=agent, + status_message_callback=status_message_callback, + ) self._create_controller( agent, config.security.confirmation_mode, diff --git a/openhands/utils/async_utils.py b/openhands/utils/async_utils.py index 7da8d05ff5c6..2a3b73f5da7d 100644 --- a/openhands/utils/async_utils.py +++ b/openhands/utils/async_utils.py @@ -7,7 +7,7 @@ EXECUTOR = ThreadPoolExecutor() -async def sync_from_async(fn: Callable, *args, **kwargs): +async def call_sync_from_async(fn: Callable, *args, **kwargs): """ Shorthand for running a function in the default background thread pool executor and awaiting the result. 
The nature of synchronous code is that the future @@ -19,7 +19,7 @@ async def sync_from_async(fn: Callable, *args, **kwargs): return result -def async_from_sync( +def call_async_from_sync( corofn: Callable, timeout: float = GENERAL_TIMEOUT, *args, **kwargs ): """ @@ -27,6 +27,11 @@ def async_from_sync( and awaiting the result """ + if corofn is None: + raise ValueError('corofn is None') + if not asyncio.iscoroutinefunction(corofn): + raise ValueError('corofn is not a coroutine function') + async def arun(): coro = corofn(*args, **kwargs) result = await coro @@ -46,6 +51,13 @@ def run(): return result +async def call_coro_in_bg_thread( + corofn: Callable, timeout: float = GENERAL_TIMEOUT, *args, **kwargs +): + """Function for running a coroutine in a background thread.""" + await call_sync_from_async(call_async_from_sync, corofn, timeout, *args, **kwargs) + + async def wait_all( iterable: Iterable[Coroutine], timeout: int = GENERAL_TIMEOUT ) -> List: diff --git a/tests/unit/test_async_utils.py b/tests/unit/test_async_utils.py index 89dd1e0f6915..3dc99438968e 100644 --- a/tests/unit/test_async_utils.py +++ b/tests/unit/test_async_utils.py @@ -1,11 +1,13 @@ import asyncio +import time import pytest from openhands.utils.async_utils import ( AsyncException, - async_from_sync, - sync_from_async, + call_async_from_sync, + call_coro_in_bg_thread, + call_sync_from_async, wait_all, ) @@ -80,44 +82,44 @@ async def dummy(value: int): @pytest.mark.asyncio -async def test_sync_from_async(): +async def test_call_sync_from_async(): def dummy(value: int = 2): return value * 2 - result = await sync_from_async(dummy) + result = await call_sync_from_async(dummy) assert result == 4 - result = await sync_from_async(dummy, 3) + result = await call_sync_from_async(dummy, 3) assert result == 6 - result = await sync_from_async(dummy, value=5) + result = await call_sync_from_async(dummy, value=5) assert result == 10 @pytest.mark.asyncio -async def test_sync_from_async_error(): +async def test_call_sync_from_async_error(): def dummy(): raise ValueError() with pytest.raises(ValueError): - await sync_from_async(dummy) + await call_sync_from_async(dummy) -def test_async_from_sync(): +def test_call_async_from_sync(): async def dummy(value: int): return value * 2 - result = async_from_sync(dummy, 0, 3) + result = call_async_from_sync(dummy, 0, 3) assert result == 6 -def test_async_from_sync_error(): +def test_call_async_from_sync_error(): async def dummy(value: int): raise ValueError() with pytest.raises(ValueError): - async_from_sync(dummy, 0, 3) + call_async_from_sync(dummy, 0, 3) -def test_async_from_sync_background_tasks(): +def test_call_async_from_sync_background_tasks(): events = [] async def bg_task(): @@ -132,9 +134,33 @@ async def dummy(value: int): asyncio.create_task(bg_task()) events.append('dummy_started') - async_from_sync(dummy, 0, 3) + call_async_from_sync(dummy, 0, 3) # We check that the function did not return until all coroutines completed # (Even though some of these were started as background tasks) expected = ['dummy_started', 'dummy_started', 'bg_started', 'bg_finished'] assert expected == events + + +@pytest.mark.asyncio +async def test_call_coro_in_bg_thread(): + times = {} + + async def bad_async(id_): + # Dummy demonstrating some bad async function that does not cede control + time.sleep(0.1) + times[id_] = time.time() + + async def curve_ball(): + # A curve ball - an async function that wants to run while the bad async functions are in progress + await asyncio.sleep(0.05) + times['curve_ball'] 
= time.time() + + start = time.time() + asyncio.create_task(curve_ball()) + await wait_all( + call_coro_in_bg_thread(bad_async, id_=f'bad_async_{id_}') for id_ in range(5) + ) + assert (times['curve_ball'] - start) == pytest.approx(0.05, abs=0.1) + for id_ in range(5): + assert (times[f'bad_async_{id_}'] - start) == pytest.approx(0.1, abs=0.1) From 79cb41a94cae1cb2313845450c941cafde4ee032 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Thu, 17 Oct 2024 14:33:30 +0200 Subject: [PATCH 08/18] Initial Commit for the Supervisor Agent --- evaluation/swe_bench/run_infer.py | 2 + openhands/agenthub/__init__.py | 2 + .../agenthub/supervisor_agent/__init__.py | 4 + openhands/agenthub/supervisor_agent/agent.py | 156 +++++++++++++++ openhands/agenthub/supervisor_agent/prompt.py | 179 ++++++++++++++++++ 5 files changed, 343 insertions(+) create mode 100644 openhands/agenthub/supervisor_agent/__init__.py create mode 100644 openhands/agenthub/supervisor_agent/agent.py create mode 100644 openhands/agenthub/supervisor_agent/prompt.py diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 91984c1a7b1a..80e8ccbee20a 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -41,11 +41,13 @@ AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, 'CodeActSWEAgent': codeact_user_response, + 'SupervisorAgent': codeact_user_response, } AGENT_CLS_TO_INST_SUFFIX = { 'CodeActAgent': 'When you think you have fixed the issue through code changes, please run the following command: exit .\n', 'CodeActSWEAgent': 'When you think you have fixed the issue through code changes, please run the following command: exit .\n', + 'SupervisorAgent': 'When you think you have fixed the issue, please run the following command: exit .\n', } diff --git a/openhands/agenthub/__init__.py b/openhands/agenthub/__init__.py index 0076976c27ed..489ecc7aaead 100644 --- a/openhands/agenthub/__init__.py +++ b/openhands/agenthub/__init__.py @@ -14,6 +14,7 @@ delegator_agent, dummy_agent, planner_agent, + supervisor_agent, ) __all__ = [ @@ -23,6 +24,7 @@ 'delegator_agent', 'dummy_agent', 'browsing_agent', + 'supervisor_agent', ] for agent in all_microagents.values(): diff --git a/openhands/agenthub/supervisor_agent/__init__.py b/openhands/agenthub/supervisor_agent/__init__.py new file mode 100644 index 000000000000..6b07ea69fc67 --- /dev/null +++ b/openhands/agenthub/supervisor_agent/__init__.py @@ -0,0 +1,4 @@ +from openhands.agenthub.supervisor_agent.agent import SupervisorAgent +from openhands.controller.agent import Agent + +Agent.register('SupervisorAgent', SupervisorAgent) diff --git a/openhands/agenthub/supervisor_agent/agent.py b/openhands/agenthub/supervisor_agent/agent.py new file mode 100644 index 000000000000..1c79e61e0c87 --- /dev/null +++ b/openhands/agenthub/supervisor_agent/agent.py @@ -0,0 +1,156 @@ +import copy +import logging +from typing import Dict, List + +from openhands.agenthub.supervisor_agent.prompt import ( + adjust_milestones, + get_initial_prompt, +) +from openhands.controller.agent import Agent +from openhands.controller.state.state import State +from openhands.core.config import AgentConfig +from openhands.core.message import Message, TextContent +from openhands.core.utils import json +from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction +from openhands.events.action.agent import AgentRejectAction +from openhands.events.observation.delegate import AgentDelegateObservation +from openhands.llm.llm import 
LLM + + +class SupervisorAgent(Agent): + VERSION = '1.0' + """ + The Supervisor Agent is an agent that collects information from other agents + and makes decisions based on the information. + """ + + current_delegate: str = '' + sub_goals: List[Dict[str, str]] = [] + current_goal_index: int = 0 + summary: str = '' + task: str = '' + + def __init__(self, llm: LLM, config: AgentConfig): + """Initialize the Supervisor Agent with an LLM + + Parameters: + - llm (LLM): The llm to be used by this agent + """ + super().__init__(llm, config) + # Set up logger + self.logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.DEBUG) # Set the logging level + + def step(self, state: State) -> Action: + """Checks to see if current step is completed, returns AgentFinishAction if True. + Otherwise, delegates the task to the next agent in the pipeline. + + Parameters: + - state (State): The current state given the previous actions and observations + + Returns: + - AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned' + - AgentDelegateAction: The next agent to delegate the task to + """ + self.logger.debug('Starting step with state: %s', state) + # Example logic for breaking down tasks and delegating + if not self.sub_goals: + self.logger.debug('No sub-goals found, breaking down task.') + task, _ = state.get_current_user_intent() + self.sub_goals = self.break_down_task(task) + self.logger.debug('Sub-goals: %s', self.sub_goals) + # If the LLM returns an empty list, reject the action + if self.sub_goals is None or self.sub_goals == []: + return AgentRejectAction() + + if self.current_delegate == '': + self.logger.debug("Current delegate is empty, assigning 'manager'.") + # First subgoal as the current delegate is empty + self.current_delegate = 'manager' + return AgentDelegateAction( + agent='ManagerAgent', + inputs={'task': json.dumps(self.sub_goals[self.current_goal_index])}, + ) + elif self.current_delegate == 'manager': + self.logger.debug("Current delegate is 'manager'.") + last_observation = state.history.get_last_observation() + + if not isinstance(last_observation, AgentDelegateObservation): + raise Exception('Last observation is not an AgentDelegateObservation') + + if last_observation.outputs.get('action', '') == 'reject': + self.logger.debug('No summary found, creating adjustment prompt.') + reason = getattr(last_observation, 'reason', '') + # Ensure reason is a string + prompt = self.create_adjustment_prompt(reason) + # Get the sub-goals from the language model using the generated prompt + self.sub_goals = self.get_sub_goals_from_llm(prompt) + # Add the summary to the current sub-goal + current_task = copy.deepcopy(self.sub_goals[self.current_goal_index]) + current_task['summary'] = ( + f'Summary from previous milestones: {self.summary}' + ) + return AgentDelegateAction( + agent='ManagerAgent', inputs={'task': json.dumps(current_task)} + ) + else: + # Append the current milestone and summary to the agent's summary + summary = last_observation.outputs.get('summary', '') + self.append_to_summary( + self.sub_goals[self.current_goal_index]['task'], summary + ) + self.current_goal_index += 1 + + if self.current_goal_index < len(self.sub_goals): + # Add the summary to the current sub-goal + current_task = copy.deepcopy( + self.sub_goals[self.current_goal_index] + ) + current_task['summary'] = ( + f'Summary from previous milestones: {self.summary}' + ) + + return AgentDelegateAction( + agent='ManagerAgent', inputs={'task': json.dumps(current_task)} + ) + + return 
AgentFinishAction() + + def break_down_task(self, task: str) -> List[Dict[str, str]]: + # Generate the initial prompt for breaking down the task + prompt = get_initial_prompt(task) + # Get the sub-goals from the language model using the generated prompt + return self.get_sub_goals_from_llm(prompt) + + def should_interrupt(self, observation) -> bool: + # Logic to determine if the task should be interrupted + return False # Placeholder + + def summarize_history(self, history) -> str: + # Logic to summarize the history + return 'summary' # Placeholder + + def provide_guidance(self, state: State) -> Action: + # Logic to provide high-level guidance + return AgentFinishAction() # Placeholder + + def create_adjustment_prompt(self, reason: str) -> str: + return adjust_milestones( + self.sub_goals, + self.sub_goals[self.current_goal_index], + reason, + self.summary, + self.task, + ) + + def get_sub_goals_from_llm(self, prompt: str) -> List[Dict[str, str]]: + content = [TextContent(text=prompt)] + message = Message(role='user', content=content) + response = self.llm.completion( + messages=self.llm.format_messages_for_llm(message) + ) + return json.loads(response['choices'][0]['message']['content']) + + def append_to_summary(self, milestone_name: str, summary: str): + """Appends the milestone name and summary to the agent's summary state.""" + self.summary += f'Milestone: {milestone_name}\nSummary: {summary}\n\n' diff --git a/openhands/agenthub/supervisor_agent/prompt.py b/openhands/agenthub/supervisor_agent/prompt.py new file mode 100644 index 000000000000..5cdf5b76d6e2 --- /dev/null +++ b/openhands/agenthub/supervisor_agent/prompt.py @@ -0,0 +1,179 @@ +from typing import Dict, List + +from openhands.core.utils import json + +HISTORY_SIZE = 20 + +# General Description +general_description = """ +You are a strategic manager AI in a software development team. You MUST think CAREFULLY how to complete the task assigned to you. +You MUST think on a HIGHER LEVEL view always. + +You've been given the following task: +%(task)s + +As a strategic manager, you create a plan with different sub-tasks and delegate the tasks to your team. +At your disposal, you have a team of agents who will complete tasks for you. However, those agents only focus on the details. +They CANNOT see the big picture. +They need you to define self-contained tasks, that are easy for them to understand and complete. + +""" + +# Initial Prompt +initial_prompt = """ +## Plan +Your goal is to create a high-level plan, a list of subtasks that will bring you closer to the completion of the task. Remember to think +CAREFULLY about how to complete the task. With each subtask, you MUST provide a "suggested approach". +Think, step by step, how you would complete the subtask. Then provide that as the suggested approach. +Try to be as detailed as possible, your goal is to HELP the agent finish the subtask as soon as possible. + +You MAY provide a list of "important details" for each subtask. These are details that the agent MUST consider when completing the subtask. + +ONLY generate tasks that are necessary to complete the task. + +You MUST ONLY generate a list of JSONs: + +[ + { + "task": "", + "suggested_approach": "", + "important_details": "" + }, + { + "task": "", + "suggested_approach": "", + "important_details": "" + }, + { + "task": "", + "suggested_approach": "", + "important_details": "" + }, +] + +The tasks MUST be generated in order, they MUST NOT depend on future tasks or previous tasks. They MUST be independent. 
+You MUST generate at least 1 task. + +For example: +User prompt: + +" +Enable quiet mode/no-verbose in CLI for use in pre-commit hook There seems to be only an option to increase the level of verbosity when using +SQLFluff [CLI](https://docs.sqlfluff.com/en/stable/cli.html), not to limit it further. It would be great to have an option to further limit the amount of prints when running +`sqlfluff fix`, especially in combination with deployment using a pre-commit hook. For example, only print the return status and the number of fixes applied, similar to how it +is when using `black` in a pre-commit hook: ![image](https://user-images.githubusercontent.com/10177212/140480676-dc98d00b-4383-44f2-bb90-3301a6eedec2.png) This hides the potentially +long list of fixes that are being applied to the SQL files, which can get quite verbose. +" + +Your response: + +[ + { + "task": "Research SQLFluff CLI verbosity options", + "suggested_approach": "Investigate the current SQLFluff CLI documentation and source code to understand how verbosity levels are currently implemented. Identify if there are any existing flags or settings that can be adjusted to reduce verbosity.", + "important_details": "Focus on the 'fix' command and any related verbosity settings. Document any findings that could be useful for implementing a quiet mode." + }, + { + "task": "Design a quiet mode feature for SQLFluff CLI", + "suggested_approach": "Based on the research findings, design a new feature that allows users to enable a quiet mode. This mode should minimize output to only essential information such as return status and number of fixes applied.", + "important_details": "Ensure the design is compatible with existing CLI options and does not interfere with other functionalities." + }, + { + "task": "Implement the quiet mode feature", + "suggested_approach": "Modify the SQLFluff CLI codebase to add the new quiet mode feature. Implement the necessary changes in the code to support this feature and ensure it can be activated via a command-line flag.", + "important_details": "Write unit tests to verify that the quiet mode works as expected and does not affect other CLI functionalities." + }, + { + "task": "Test the quiet mode feature", + "suggested_approach": "Conduct thorough testing of the new quiet mode feature in various scenarios, including its use in a pre-commit hook. Ensure that it behaves as expected and provides the desired level of output reduction.", + "important_details": "Test with different verbosity levels to ensure compatibility and check for any edge cases that might cause unexpected behavior." + }, + { + "task": "Document the new feature", + "suggested_approach": "Update the SQLFluff CLI documentation to include information about the new quiet mode feature. Provide examples of how to use it and explain its benefits.", + "important_details": "Ensure the documentation is clear and easy to understand for users who may not be familiar with the technical details." + } +] +""" + +adjustment_prompt = """ + + This is the current active plan that your subordinates are working on: + %(milestones)s + + And this is the current subtask that your subordinates are working on: + ## Current subtask + subtask: %(milestone_task)s + Suggested Approach: %(milestone_suggested_approach)s + Important Details: %(milestone_important_details)s + + However, it seems that the current subtask is not being completed successfully. 
+ Because of the following reason: %(reason)s + + You have the following contextual information that has been gathered up to this point. + This information MIGHT help you adjust the plan: + %(summary)s + + ## Task + As a strategic manager, you must reflect on the failed subtask and decide on the necessary adjustments. Consider the following: + + 1. Analyze the reason for failure and determine if the suggested approach or important details need modification. + 2. Decide if the failed subtask should be split into smaller, more manageable tasks. + 3. Consider if new plan need to be added to address any gaps in the plan. + 4. Update the remaining plan to ensure the overall plan remains feasible and effective. + + You MUST NOT change the task you were given. + + You MUST make changes to the current subtask or to the ones AFTER. In NO case you can change the ones BEFORE. + Generate ONLY a list of JSONs. Do NOT generate any markdown or comments. + """ + + +def get_initial_prompt(task: str) -> str: + """Gets the prompt for the planner agent. + + Formatted with the most recent action-observation pairs, current task, and hint based on last action + + Parameters: + - state (State): The state of the current agent + + Returns: with historical values + """ + return (general_description + initial_prompt) % { + 'task': task, + } + + +def adjust_milestones( + milestones: List[Dict], + subtask: Dict[str, str], + reason: str, + summary: str, + task: str, +) -> str: + """Adjusts the milestones based on a failed subtask and its reason. + + Parameters: + - milestones (List[Dict]): The current list of milestones. + - subtask (Dict): The subtask that was not completed successfully. + - reason (str): The reason provided for the failure. + - summary (str): A summary of everything up to this point. + - task (str): The user's task. + + Returns: A prompt for the strategic manager agent to self-reflect and adjust the milestones. + """ + # Extract values from the subtask dictionary + milestone_task = subtask['task'] + milestone_suggested_approach = subtask['suggested_approach'] + milestone_important_details = subtask['important_details'] + + # Use the extracted values in the string formatting + return (general_description + adjustment_prompt) % { + 'milestones': json.dumps(milestones), + 'reason': reason, + 'summary': summary, + 'task': task, + 'milestone_task': milestone_task, + 'milestone_suggested_approach': milestone_suggested_approach, + 'milestone_important_details': milestone_important_details, + } From 640f769e4dfc6349dd534705aa2cd652a3745a06 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Thu, 24 Oct 2024 12:21:21 +0200 Subject: [PATCH 09/18] enables codeactagent delegation --- openhands/agenthub/codeact_agent/codeact_agent.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index cacd68353732..634bbefbbadd 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -282,7 +282,12 @@ def _get_messages(self, state: State) -> list[Message]: ), None, ) + if latest_user_message: + # Enables AgentDelegation + task: str = state.inputs.get('task', '') + if task: + latest_user_message.content.append(TextContent(text=task)) reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with .' 
latest_user_message.content.append(TextContent(text=reminder_text)) From d5d44e279eb6254f18449672b4a03cd4b025f29c Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Thu, 24 Oct 2024 12:43:16 +0200 Subject: [PATCH 10/18] hacky way to enable different LLMs --- openhands/agenthub/supervisor_agent/agent.py | 145 ++++++++++-------- openhands/agenthub/supervisor_agent/prompt.py | 56 +++---- openhands/controller/agent_controller.py | 3 + openhands/events/action/agent.py | 3 +- openhands/runtime/builder/docker.py | 1 + openhands/runtime/utils/edit.py | 2 +- 6 files changed, 103 insertions(+), 107 deletions(-) diff --git a/openhands/agenthub/supervisor_agent/agent.py b/openhands/agenthub/supervisor_agent/agent.py index 1c79e61e0c87..61ef4bed8fc0 100644 --- a/openhands/agenthub/supervisor_agent/agent.py +++ b/openhands/agenthub/supervisor_agent/agent.py @@ -40,82 +40,91 @@ def __init__(self, llm: LLM, config: AgentConfig): # Set up logger self.logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) # Set the logging level + self.llm_config = llm.config def step(self, state: State) -> Action: - """Checks to see if current step is completed, returns AgentFinishAction if True. - Otherwise, delegates the task to the next agent in the pipeline. - - Parameters: - - state (State): The current state given the previous actions and observations - - Returns: - - AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned' - - AgentDelegateAction: The next agent to delegate the task to - """ self.logger.debug('Starting step with state: %s', state) - # Example logic for breaking down tasks and delegating + self.logger.debug('LLM config: %s', self.llm_config) + if not self.sub_goals: - self.logger.debug('No sub-goals found, breaking down task.') - task, _ = state.get_current_user_intent() - self.sub_goals = self.break_down_task(task) - self.logger.debug('Sub-goals: %s', self.sub_goals) - # If the LLM returns an empty list, reject the action - if self.sub_goals is None or self.sub_goals == []: - return AgentRejectAction() + self.initialize_sub_goals(state) if self.current_delegate == '': - self.logger.debug("Current delegate is empty, assigning 'manager'.") - # First subgoal as the current delegate is empty - self.current_delegate = 'manager' - return AgentDelegateAction( - agent='ManagerAgent', - inputs={'task': json.dumps(self.sub_goals[self.current_goal_index])}, + self.current_delegate = 'CodeActAgent' + return self.delegate_to_agent( + 'CodeActAgent', self.construct_task_details(self.prepare_current_task()) ) - elif self.current_delegate == 'manager': - self.logger.debug("Current delegate is 'manager'.") - last_observation = state.history.get_last_observation() - - if not isinstance(last_observation, AgentDelegateObservation): - raise Exception('Last observation is not an AgentDelegateObservation') - - if last_observation.outputs.get('action', '') == 'reject': - self.logger.debug('No summary found, creating adjustment prompt.') - reason = getattr(last_observation, 'reason', '') - # Ensure reason is a string - prompt = self.create_adjustment_prompt(reason) - # Get the sub-goals from the language model using the generated prompt - self.sub_goals = self.get_sub_goals_from_llm(prompt) - # Add the summary to the current sub-goal - current_task = copy.deepcopy(self.sub_goals[self.current_goal_index]) - current_task['summary'] = ( - f'Summary from previous milestones: {self.summary}' - ) - return AgentDelegateAction( - agent='ManagerAgent', inputs={'task': 
json.dumps(current_task)} - ) - else: - # Append the current milestone and summary to the agent's summary - summary = last_observation.outputs.get('summary', '') - self.append_to_summary( - self.sub_goals[self.current_goal_index]['task'], summary - ) - self.current_goal_index += 1 - - if self.current_goal_index < len(self.sub_goals): - # Add the summary to the current sub-goal - current_task = copy.deepcopy( - self.sub_goals[self.current_goal_index] - ) - current_task['summary'] = ( - f'Summary from previous milestones: {self.summary}' - ) - - return AgentDelegateAction( - agent='ManagerAgent', inputs={'task': json.dumps(current_task)} - ) + + elif self.current_delegate == 'CodeActAgent': + return self.handle_code_act_agent(state) + + return AgentFinishAction() + + def initialize_sub_goals(self, state: State): + self.logger.debug('No sub-goals found, breaking down task.') + self.task, _ = state.get_current_user_intent() + self.sub_goals = self.break_down_task(self.task) + self.logger.debug('Sub-goals: %s', self.sub_goals) + if not self.sub_goals: + return AgentRejectAction() + + def delegate_to_agent(self, agent_name: str, task: str) -> AgentDelegateAction: + self.logger.debug(f'Delegating to agent: {agent_name}') + + return AgentDelegateAction(agent=agent_name, inputs={'task': task}) + + def handle_code_act_agent(self, state: State) -> Action: + self.logger.debug("Current delegate is 'CodeActAgent'.") + last_observation = state.history.get_last_observation() + + if not isinstance(last_observation, AgentDelegateObservation): + raise Exception('Last observation is not an AgentDelegateObservation') + + if last_observation.outputs.get('action', '') == 'reject': + return self.handle_rejection(last_observation) + + return self.handle_success(last_observation) + + def handle_rejection( + self, last_observation: AgentDelegateObservation + ) -> AgentDelegateAction: + self.logger.debug('No summary found, creating adjustment prompt.') + reason = getattr(last_observation, 'reason', '') + prompt = self.create_adjustment_prompt(reason) + self.sub_goals = self.get_sub_goals_from_llm(prompt) + current_task = self.prepare_current_task() + return self.delegate_to_agent( + 'CodeActAgent', self.construct_task_details(current_task) + ) + + def handle_success(self, last_observation: AgentDelegateObservation) -> Action: + summary = last_observation.outputs.get('summary', '') + self.append_to_summary(summary) + self.current_goal_index += 1 + + if self.current_goal_index < len(self.sub_goals): + current_task = self.prepare_current_task() + task_details = self.construct_task_details(current_task) + return self.delegate_to_agent('CodeActAgent', task_details) return AgentFinishAction() + def prepare_current_task(self) -> Dict[str, str]: + current_task = copy.deepcopy(self.sub_goals[self.current_goal_index]) + current_task['summary'] = self.summary if self.summary else '' + return current_task + + def construct_task_details(self, current_task: Dict[str, str]) -> str: + task_details = ( + f"Task: {self.task}\n\n" + f"Next Subtask: {current_task['task']}\n" + f"Suggested Approach: {current_task['suggested_approach']}\n" + f"Important Details: {current_task['important_details']}" + ) + if self.summary: + task_details = f'Progress so far: {self.summary}\n\n' + task_details + return task_details + def break_down_task(self, task: str) -> List[Dict[str, str]]: # Generate the initial prompt for breaking down the task prompt = get_initial_prompt(task) @@ -151,6 +160,6 @@ def get_sub_goals_from_llm(self, prompt: str) -> 
List[Dict[str, str]]: ) return json.loads(response['choices'][0]['message']['content']) - def append_to_summary(self, milestone_name: str, summary: str): + def append_to_summary(self, summary: str): """Appends the milestone name and summary to the agent's summary state.""" - self.summary += f'Milestone: {milestone_name}\nSummary: {summary}\n\n' + self.summary += f'{summary}\n\n' diff --git a/openhands/agenthub/supervisor_agent/prompt.py b/openhands/agenthub/supervisor_agent/prompt.py index 5cdf5b76d6e2..e4e0eaedfa22 100644 --- a/openhands/agenthub/supervisor_agent/prompt.py +++ b/openhands/agenthub/supervisor_agent/prompt.py @@ -6,8 +6,9 @@ # General Description general_description = """ -You are a strategic manager AI in a software development team. You MUST think CAREFULLY how to complete the task assigned to you. -You MUST think on a HIGHER LEVEL view always. +You are a strategic planner AI in a software development team. You have a team of agents +who will complete the tasks you give them. Each agent is an expert in a specific area. +You MUST think CAREFULLY how to complete the task assigned to you. You've been given the following task: %(task)s @@ -44,15 +45,10 @@ "suggested_approach": "", "important_details": "" }, - { - "task": "", - "suggested_approach": "", - "important_details": "" - }, ] The tasks MUST be generated in order, they MUST NOT depend on future tasks or previous tasks. They MUST be independent. -You MUST generate at least 1 task. +You MUST generate at least 1 task. The last task MUST be the implementation task. You WILL NOT need a test file. For example: User prompt: @@ -73,35 +69,20 @@ "suggested_approach": "Investigate the current SQLFluff CLI documentation and source code to understand how verbosity levels are currently implemented. Identify if there are any existing flags or settings that can be adjusted to reduce verbosity.", "important_details": "Focus on the 'fix' command and any related verbosity settings. Document any findings that could be useful for implementing a quiet mode." }, - { - "task": "Design a quiet mode feature for SQLFluff CLI", - "suggested_approach": "Based on the research findings, design a new feature that allows users to enable a quiet mode. This mode should minimize output to only essential information such as return status and number of fixes applied.", - "important_details": "Ensure the design is compatible with existing CLI options and does not interfere with other functionalities." - }, { "task": "Implement the quiet mode feature", "suggested_approach": "Modify the SQLFluff CLI codebase to add the new quiet mode feature. Implement the necessary changes in the code to support this feature and ensure it can be activated via a command-line flag.", "important_details": "Write unit tests to verify that the quiet mode works as expected and does not affect other CLI functionalities." - }, - { - "task": "Test the quiet mode feature", - "suggested_approach": "Conduct thorough testing of the new quiet mode feature in various scenarios, including its use in a pre-commit hook. Ensure that it behaves as expected and provides the desired level of output reduction.", - "important_details": "Test with different verbosity levels to ensure compatibility and check for any edge cases that might cause unexpected behavior." - }, - { - "task": "Document the new feature", - "suggested_approach": "Update the SQLFluff CLI documentation to include information about the new quiet mode feature. 
Provide examples of how to use it and explain its benefits.", - "important_details": "Ensure the documentation is clear and easy to understand for users who may not be familiar with the technical details." } ] """ adjustment_prompt = """ - This is the current active plan that your subordinates are working on: + This is the current active plan that your agents are working on: %(milestones)s - And this is the current subtask that your subordinates are working on: + And this is the current subtask that your agents are working on: ## Current subtask subtask: %(milestone_task)s Suggested Approach: %(milestone_suggested_approach)s @@ -130,19 +111,15 @@ def get_initial_prompt(task: str) -> str: - """Gets the prompt for the planner agent. - - Formatted with the most recent action-observation pairs, current task, and hint based on last action - - Parameters: - - state (State): The state of the current agent - - Returns: with historical values - """ - return (general_description + initial_prompt) % { + formatted_prompt = (general_description + initial_prompt) % { 'task': task, } + # Add instruction to not include json formatting + formatted_prompt += '\n\nIMPORTANT: Do not include ```json at the start or ``` at the end of your response. Just return the raw JSON list.' + + return formatted_prompt + def adjust_milestones( milestones: List[Dict], @@ -167,8 +144,8 @@ def adjust_milestones( milestone_suggested_approach = subtask['suggested_approach'] milestone_important_details = subtask['important_details'] - # Use the extracted values in the string formatting - return (general_description + adjustment_prompt) % { + # Get the formatted prompt + formatted_prompt = (general_description + adjustment_prompt) % { 'milestones': json.dumps(milestones), 'reason': reason, 'summary': summary, @@ -177,3 +154,8 @@ def adjust_milestones( 'milestone_suggested_approach': milestone_suggested_approach, 'milestone_important_details': milestone_important_details, } + + # Add instruction to not include json formatting + formatted_prompt += '\n\nIMPORTANT: Do not include ```json at the start or ``` at the end of your response. Just return the raw JSON list.' 
+ + return formatted_prompt diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py index 55ca61ddddee..139ba06ebc17 100644 --- a/openhands/controller/agent_controller.py +++ b/openhands/controller/agent_controller.py @@ -1,5 +1,6 @@ import asyncio import copy +import logging import traceback from typing import Type @@ -63,6 +64,7 @@ class AgentController: parent: 'AgentController | None' = None delegate: 'AgentController | None' = None _pending_action: Action | None = None + logger: logging.Logger def __init__( self, @@ -98,6 +100,7 @@ def __init__( self.id = sid self.agent = agent self.headless_mode = headless_mode + self.logger = logging.getLogger(f'AgentController-{sid}') # subscribe to the event stream self.event_stream = event_stream diff --git a/openhands/events/action/agent.py b/openhands/events/action/agent.py index f49f573ed698..eedac830422b 100644 --- a/openhands/events/action/agent.py +++ b/openhands/events/action/agent.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import Any +from typing import Any, Dict, Optional from openhands.core.schema import ActionType from openhands.events.action.action import Action @@ -74,6 +74,7 @@ class AgentDelegateAction(Action): inputs: dict thought: str = '' action: str = ActionType.DELEGATE + llm_config: Optional[Dict[str, Any]] = None @property def message(self) -> str: diff --git a/openhands/runtime/builder/docker.py b/openhands/runtime/builder/docker.py index 09f94f103dff..9c5850982633 100644 --- a/openhands/runtime/builder/docker.py +++ b/openhands/runtime/builder/docker.py @@ -70,6 +70,7 @@ def build( f'--build-arg=OPENHANDS_RUNTIME_BUILD_TIME={datetime.datetime.now().isoformat()}', f'--tag={target_image_hash_name}', '--load', + '--platform=linux/amd64', ] # Include the platform argument only if platform is specified diff --git a/openhands/runtime/utils/edit.py b/openhands/runtime/utils/edit.py index 4ed5c0edafc6..350683175750 100644 --- a/openhands/runtime/utils/edit.py +++ b/openhands/runtime/utils/edit.py @@ -101,7 +101,7 @@ class FileEditRuntimeMixin(FileEditRuntimeInterface): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - llm_config = self.config.get_llm_config() + llm_config = self.config.get_llm_config_from_agent(self.config.default_agent) if llm_config.draft_editor is None: llm_config.draft_editor = copy.deepcopy(llm_config) From f1d317c7004461f412728f20fd0f53dae70a9f68 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Fri, 25 Oct 2024 19:01:00 +0200 Subject: [PATCH 11/18] Some progress --- openhands/agenthub/__init__.py | 2 + openhands/agenthub/searcher_agent/__init__.py | 4 + .../agenthub/searcher_agent/action_parser.py | 158 ++++++++ openhands/agenthub/searcher_agent/agent.py | 175 ++++++++ openhands/agenthub/searcher_agent/prompt.py | 69 ++++ openhands/agenthub/supervisor_agent/agent.py | 182 ++++----- openhands/agenthub/supervisor_agent/prompt.py | 383 +++++++++++++----- 7 files changed, 764 insertions(+), 209 deletions(-) create mode 100644 openhands/agenthub/searcher_agent/__init__.py create mode 100644 openhands/agenthub/searcher_agent/action_parser.py create mode 100644 openhands/agenthub/searcher_agent/agent.py create mode 100644 openhands/agenthub/searcher_agent/prompt.py diff --git a/openhands/agenthub/__init__.py b/openhands/agenthub/__init__.py index 489ecc7aaead..96c766124989 100644 --- a/openhands/agenthub/__init__.py +++ b/openhands/agenthub/__init__.py @@ -14,6 +14,7 @@ delegator_agent, dummy_agent, planner_agent, + 
searcher_agent, supervisor_agent, ) @@ -24,6 +25,7 @@ 'delegator_agent', 'dummy_agent', 'browsing_agent', + 'searcher_agent', 'supervisor_agent', ] diff --git a/openhands/agenthub/searcher_agent/__init__.py b/openhands/agenthub/searcher_agent/__init__.py new file mode 100644 index 000000000000..1f4b7d50c642 --- /dev/null +++ b/openhands/agenthub/searcher_agent/__init__.py @@ -0,0 +1,4 @@ +from openhands.agenthub.searcher_agent.agent import SearcherAgent +from openhands.controller.agent import Agent + +Agent.register('SearcherAgent', SearcherAgent) diff --git a/openhands/agenthub/searcher_agent/action_parser.py b/openhands/agenthub/searcher_agent/action_parser.py new file mode 100644 index 000000000000..54ad267cad8b --- /dev/null +++ b/openhands/agenthub/searcher_agent/action_parser.py @@ -0,0 +1,158 @@ +import re + +from openhands.controller.action_parser import ( + ActionParser, + ResponseParser, +) +from openhands.events.action import ( + Action, + AgentFinishAction, + CmdRunAction, + IPythonRunCellAction, + MessageAction, +) + + +class SearcherAgentResponseParser(ResponseParser): + """Parser action: + - CmdRunAction(command) - bash command to run + - IPythonRunCellAction(code) - IPython code to run + - MessageAction(content) - Message action to run (e.g. ask for clarification) + - AgentFinishAction() - end the interaction + """ + + def __init__(self): + # Need pay attention to the item order in self.action_parsers + super().__init__() + self.action_parsers = [ + SearcherAgentActionParserFinish(), + SearcherAgentActionParserCmdRun(), + SearcherAgentActionParserIPythonRunCell(), + ] + self.default_parser = SearcherAgentActionParserMessage() + + def parse(self, response) -> Action: + action_str = self.parse_response(response) + return self.parse_action(action_str) + + def parse_response(self, response) -> str: + action = response.choices[0].message.content + if action is None: + return '' + for lang in ['bash', 'ipython', 'browse']: + # special handling for DeepSeek: it has stop-word bug and returns + if f'' not in action: + action = action.replace(f'') + + if f'' in action and f'' not in action: + action += f'' + if '' not in action: + action += '' + return action + + def parse_action(self, action_str: str) -> Action: + for action_parser in self.action_parsers: + if action_parser.check_condition(action_str): + return action_parser.parse(action_str) + return self.default_parser.parse(action_str) + + +class SearcherAgentActionParserFinish(ActionParser): + """Parser action: + - AgentFinishAction() - end the interaction + """ + + def __init__( + self, + ): + self.finish_command = None + + def check_condition(self, action_str: str) -> bool: + self.finish_command = re.search(r'.*', action_str, re.DOTALL) + return self.finish_command is not None + + def parse(self, action_str: str) -> Action: + assert ( + self.finish_command is not None + ), 'self.finish_command should not be None when parse is called' + output = action_str.replace(self.finish_command.group(0), '').strip() + outputs = {'output': output} + return AgentFinishAction(outputs=outputs) + + +class SearcherAgentActionParserCmdRun(ActionParser): + """Parser action: + - CmdRunAction(command) - bash command to run + - AgentFinishAction() - end the interaction + """ + + def __init__( + self, + ): + self.bash_command = None + + def check_condition(self, action_str: str) -> bool: + self.bash_command = re.search( + r'(.*?)', action_str, re.DOTALL + ) + return self.bash_command is not None + + def parse(self, action_str: str) -> Action: + 
assert ( + self.bash_command is not None + ), 'self.bash_command should not be None when parse is called' + thought = action_str.replace(self.bash_command.group(0), '').strip() + # a command was found + command_group = self.bash_command.group(1).strip() + if command_group.strip() == 'exit': + return AgentFinishAction(thought=thought) + return CmdRunAction(command=command_group, thought=thought) + + +class SearcherAgentActionParserIPythonRunCell(ActionParser): + """Parser action: + - IPythonRunCellAction(code) - IPython code to run + """ + + def __init__( + self, + ): + self.python_code = None + self.jupyter_kernel_init_code: str = 'from agentskills import *' + + def check_condition(self, action_str: str) -> bool: + self.python_code = re.search( + r'(.*?)', action_str, re.DOTALL + ) + return self.python_code is not None + + def parse(self, action_str: str) -> Action: + assert ( + self.python_code is not None + ), 'self.python_code should not be None when parse is called' + code_group = self.python_code.group(1).strip() + thought = action_str.replace(self.python_code.group(0), '').strip() + return IPythonRunCellAction( + code=code_group, + thought=thought, + kernel_init_code=self.jupyter_kernel_init_code, + ) + + +class SearcherAgentActionParserMessage(ActionParser): + """Parser action: + - MessageAction(content) - Message action to run (e.g. ask for clarification) + """ + + def __init__( + self, + ): + pass + + def check_condition(self, action_str: str) -> bool: + # We assume the LLM is GOOD enough that when it returns pure natural language + # it wants to talk to the user + return True + + def parse(self, action_str: str) -> Action: + return MessageAction(content=action_str, wait_for_response=True) diff --git a/openhands/agenthub/searcher_agent/agent.py b/openhands/agenthub/searcher_agent/agent.py new file mode 100644 index 000000000000..f987c0708310 --- /dev/null +++ b/openhands/agenthub/searcher_agent/agent.py @@ -0,0 +1,175 @@ +import logging + +from openhands.agenthub.searcher_agent.action_parser import SearcherAgentResponseParser +from openhands.agenthub.searcher_agent.prompt import get_prompt +from openhands.controller.agent import Agent +from openhands.controller.state.state import State +from openhands.core.config import AgentConfig +from openhands.core.config.llm_config import LLMConfig +from openhands.core.message import Message, TextContent +from openhands.events.action import Action, AgentFinishAction +from openhands.events.action.commands import CmdRunAction, IPythonRunCellAction +from openhands.events.action.message import MessageAction +from openhands.events.observation.commands import ( + CmdOutputObservation, + IPythonRunCellObservation, +) +from openhands.events.observation.error import ErrorObservation +from openhands.events.observation.observation import Observation +from openhands.events.observation.reject import UserRejectObservation +from openhands.llm.llm import LLM + + +class SearcherAgent(Agent): + VERSION = '1.0' + """ + The Searcher Agent is an agent that searches the codebase for relevant information. 
+ """ + + action_parser = SearcherAgentResponseParser() + + def __init__(self, llm: LLM, config: AgentConfig): + """Initialize the Searcher Agent with an LLM + + Parameters: + - llm (LLM): The llm to be used by this agent + - config (AgentConfig): The configuration for this agent + """ + # TODO: Remove this once we have a real LLM config + llm_config = LLMConfig( + model='deepseek/deepseek-chat', api_key='REDACTED', temperature=0.0 + ) + llm = LLM(llm_config) + # TODO: Remove this once we have a real AgentConfig + config = AgentConfig(llm_config='deepseek') + super().__init__(llm, config) + # Set up logger + self.logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.DEBUG) # Set the logging level + + def step(self, state: State) -> Action: + """Performs one step using the Searcher Agent. + This includes gathering info about the codebase and summarizing relevant information. + + Parameters: + - state (State): used to get updated info + + Returns: + - Action: The next action to take + """ + # Check if we should exit + latest_user_message = state.history.get_last_user_message() + if latest_user_message and latest_user_message.strip() == '/exit': + return AgentFinishAction() + + # Prepare messages for LLM + messages = [] + + # Add system and initial messages + task: str = state.inputs.get('task', '') + suggested_approach: str = state.inputs.get('suggested_approach', '') + messages.extend( + [ + Message( + role='system', + content=[TextContent(text=get_prompt(task, suggested_approach))], + ) + ] + ) + + # Add history messages + for event in state.history.get_events(): + if isinstance(event, Action): + message = self.get_action_message(event) + elif isinstance(event, Observation): + message = self.get_observation_message(event) + else: + raise ValueError(f'Unknown event type: {type(event)}') + + if message: + # Handle consecutive messages from same role + if messages and messages[-1].role == message.role: + messages[-1].content.extend(message.content) + else: + messages.append(message) + + # Get response from LLM + params = { + 'messages': self.llm.format_messages_for_llm(messages), + 'stop': [ + '', + '', + '', + ], + } + + response = self.llm.completion(**params) + + # Parse and return the next action + return self.action_parser.parse(response) + + def get_action_message(self, action: Action) -> Message | None: + """Convert an Action to a Message for the LLM conversation. + + Parameters: + - action (Action): The action to convert + + Returns: + - Message | None: The converted message, or None if action type is not supported + """ + if isinstance(action, CmdRunAction): + return Message( + role='assistant', + content=[ + TextContent( + text=f'{action.thought}\n\n{action.command}\n' + ) + ], + ) + elif isinstance(action, IPythonRunCellAction): + return Message( + role='assistant', + content=[ + TextContent( + text=f'{action.thought}\n\n{action.code}\n' + ) + ], + ) + elif isinstance(action, MessageAction): + return Message( + role='user' if action.source == 'user' else 'assistant', + content=[TextContent(text=action.content)], + ) + elif isinstance(action, AgentFinishAction) and action.source == 'agent': + return Message(role='assistant', content=[TextContent(text=action.thought)]) + return None + + def get_observation_message(self, obs: Observation) -> Message | None: + """Convert an Observation to a Message for the LLM conversation. 
+ + Parameters: + - obs (Observation): The observation to convert + + Returns: + - Message | None: The converted message, or None if observation type is not supported + """ + obs_prefix = 'OBSERVATION:\n' + if isinstance(obs, CmdOutputObservation): + text = obs_prefix + obs.content + text += ( + f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]' + ) + return Message(role='user', content=[TextContent(text=text)]) + elif isinstance(obs, IPythonRunCellObservation): + text = obs_prefix + obs.content + return Message(role='user', content=[TextContent(text=text)]) + elif isinstance(obs, ErrorObservation): + text = obs_prefix + obs.content + text += '\n[Error occurred in processing last action]' + return Message(role='user', content=[TextContent(text=text)]) + elif isinstance(obs, UserRejectObservation): + text = obs_prefix + obs.content + text += '\n[Last action has been rejected by the user]' + return Message(role='user', content=[TextContent(text=text)]) + else: + raise ValueError(f'Unknown observation type: {type(obs)}') diff --git a/openhands/agenthub/searcher_agent/prompt.py b/openhands/agenthub/searcher_agent/prompt.py new file mode 100644 index 000000000000..bfc9bc612647 --- /dev/null +++ b/openhands/agenthub/searcher_agent/prompt.py @@ -0,0 +1,69 @@ +# General Description, the goal is to devise a manager that is able to iterate if the solution has not been found yet. +# In order to successfully fix an issue there are two phases: +# 1. Exploring the codebase, finding the root cause of the issue. +# 2. Implementing the solution. +# Then the manager needs to check if the issue has been fixed, if not, it needs to iterate. +general_description = """ +You are a detail-oriented AI, an expert in searching through files and code. +You are also an expert in summarising code and its purpose. +As a detail-oriented AI, you MUST always read more and more code until you are sure you have found +all the information you need. + +Your goal is to gather information about the codebase to help the programmer fix the issue. +Here is the task you are trying to complete: +%(task)s + +IMPORTANT: YOU SHOULD NEVER TRY TO IMPLEMENT A SOLUTION. YOUR ONLY GOAL IS TO GATHER INFORMATION. +As an expert in searching through files and code, you have been equipped with a set of tools +that will help you gather information about the codebase: +- You can execute bash commands wrapped with , e.g. ls . +- If a bash command returns exit code `-1`, this means the process is not yet finished. +- You must then send a second . The second can be empty + (which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process, + or it can contain the text `ctrl+c` to interrupt the process. +- For commands that may run indefinitely, the output should be redirected to a file and the command run + in the background, e.g. python3 app.py > server.log 2>&1 & +- If a command execution result says "Command timed out. Sending SIGINT to the process", + you should retry running the command in the background. + +You should ONLY `run` commands that have no side-effects, like `ls` and `grep`. + +Your manager gave you a suggested approach that you should follow: +%(suggested_approach)s + +Follow the suggested approach to gather information about the codebase. 
+When you think you have gathered enough information, generate a JSON with the following format: + +[ + { + "summary": "", + "location_of_the_file": "", + "functions_of_interest": [ + { + "name": "", + "summary": "", + "calls_to_this_function": [""], + "is_called_by_these_functions": [""] + }, + ] + } +] + + +IMPORTANT: Every entry in the JSON MUST be relevant to the task. +IMPORTANT: The JSON MUST be contained inside and tags. +IMPORTANT: You MUST have at least one file in the response. + +""" + + +def get_prompt(task: str, suggested_approach: str) -> str: + formatted_prompt = (general_description) % { + 'task': task, + 'suggested_approach': suggested_approach, + } + + # Add instruction to not include json formatting + formatted_prompt += '\n\nIMPORTANT: Do not include ```json at the start or ``` at the end of your response. Just return the raw JSON list.' + + return formatted_prompt diff --git a/openhands/agenthub/supervisor_agent/agent.py b/openhands/agenthub/supervisor_agent/agent.py index 61ef4bed8fc0..034a3ba1b7e1 100644 --- a/openhands/agenthub/supervisor_agent/agent.py +++ b/openhands/agenthub/supervisor_agent/agent.py @@ -1,15 +1,15 @@ -import copy import logging -from typing import Dict, List +from typing import Any, Dict, List, Literal, Union from openhands.agenthub.supervisor_agent.prompt import ( - adjust_milestones, - get_initial_prompt, + TASK_TYPE_ISSUE, + get_prompt, ) from openhands.controller.agent import Agent from openhands.controller.state.state import State from openhands.core.config import AgentConfig from openhands.core.message import Message, TextContent +from openhands.core.schema.action import ActionType from openhands.core.utils import json from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction from openhands.events.action.agent import AgentRejectAction @@ -25,10 +25,13 @@ class SupervisorAgent(Agent): """ current_delegate: str = '' - sub_goals: List[Dict[str, str]] = [] - current_goal_index: int = 0 - summary: str = '' + suggested_approaches: List[Dict[str, List[str]]] = [] + suggested_approach_index: int = -1 # -1 Because we increment it before using it + results: Dict[str, List[Any]] = {'search': [], 'code': []} + condensed_information: str = '' + does_it_needs_a_test: str = '' task: str = '' + phase: Literal['search', 'summary', 'code'] = 'search' def __init__(self, llm: LLM, config: AgentConfig): """Initialize the Supervisor Agent with an LLM @@ -46,120 +49,91 @@ def step(self, state: State) -> Action: self.logger.debug('Starting step with state: %s', state) self.logger.debug('LLM config: %s', self.llm_config) - if not self.sub_goals: - self.initialize_sub_goals(state) + if not self.suggested_approaches: + self.suggested_approaches = self.get_suggested_approaches(state) + self.suggested_approach_index += 1 - if self.current_delegate == '': - self.current_delegate = 'CodeActAgent' + last_observation = state.history.get_last_observation() + if ( + isinstance(last_observation, AgentDelegateObservation) + and last_observation.outputs.get('action', '') == ActionType.FINISH + ): + self.results[self.phase].append(last_observation.outputs.get('output', '')) + + if len(self.results[self.phase]) < len(self.suggested_approaches): + # Delegate to the SearcherAgent as we need to gather more information return self.delegate_to_agent( - 'CodeActAgent', self.construct_task_details(self.prepare_current_task()) + 'SearcherAgent', + self.task, + self.suggested_approaches[self.suggested_approach_index].get( + 'suggested_approach', [] + ), ) - elif 
self.current_delegate == 'CodeActAgent': - return self.handle_code_act_agent(state) + if self.phase == 'search': + # We don't change the phase until we have the condensed information + condensed_information = self.ask_llm( + self.task, '2', json.dumps(self.results['search']) + )[0] + if condensed_information.get('summary', '') != '': + self.phase = 'summary' + self.condensed_information = condensed_information.get('summary', '') + else: + suggested_approach: str | list[str] = condensed_information.get( + 'suggested_approach', [] + ) + self.results['search'].append(suggested_approach) + return self.delegate_to_agent( + 'SearcherAgent', self.task, suggested_approach + ) + + if self.phase == 'summary': + # Now we have to judge if this issue requires a test or not before fixing it + does_it_needs_a_test = self.ask_llm( + self.task, 'code', self.condensed_information + )[0] + if does_it_needs_a_test.get('suggested_approach', '') == TASK_TYPE_ISSUE: + self.phase = 'code' + else: + self.phase = 'code' + + # WIP: Implement the code phase return AgentFinishAction() - def initialize_sub_goals(self, state: State): - self.logger.debug('No sub-goals found, breaking down task.') + def get_suggested_approaches(self, state: State): + self.logger.debug('No suggested approaches found, breaking down task.') self.task, _ = state.get_current_user_intent() - self.sub_goals = self.break_down_task(self.task) - self.logger.debug('Sub-goals: %s', self.sub_goals) - if not self.sub_goals: + suggested_approaches = self.ask_llm(self.task, 'search') + self.logger.debug('Suggested approaches: %s', self.suggested_approaches) + if not suggested_approaches: return AgentRejectAction() + return suggested_approaches - def delegate_to_agent(self, agent_name: str, task: str) -> AgentDelegateAction: - self.logger.debug(f'Delegating to agent: {agent_name}') - - return AgentDelegateAction(agent=agent_name, inputs={'task': task}) - - def handle_code_act_agent(self, state: State) -> Action: - self.logger.debug("Current delegate is 'CodeActAgent'.") - last_observation = state.history.get_last_observation() - - if not isinstance(last_observation, AgentDelegateObservation): - raise Exception('Last observation is not an AgentDelegateObservation') - - if last_observation.outputs.get('action', '') == 'reject': - return self.handle_rejection(last_observation) - - return self.handle_success(last_observation) - - def handle_rejection( - self, last_observation: AgentDelegateObservation + def delegate_to_agent( + self, agent_name: str, task: str, suggested_approach: Union[str, List[str]] ) -> AgentDelegateAction: - self.logger.debug('No summary found, creating adjustment prompt.') - reason = getattr(last_observation, 'reason', '') - prompt = self.create_adjustment_prompt(reason) - self.sub_goals = self.get_sub_goals_from_llm(prompt) - current_task = self.prepare_current_task() - return self.delegate_to_agent( - 'CodeActAgent', self.construct_task_details(current_task) - ) - - def handle_success(self, last_observation: AgentDelegateObservation) -> Action: - summary = last_observation.outputs.get('summary', '') - self.append_to_summary(summary) - self.current_goal_index += 1 - - if self.current_goal_index < len(self.sub_goals): - current_task = self.prepare_current_task() - task_details = self.construct_task_details(current_task) - return self.delegate_to_agent('CodeActAgent', task_details) - - return AgentFinishAction() - - def prepare_current_task(self) -> Dict[str, str]: - current_task = copy.deepcopy(self.sub_goals[self.current_goal_index]) - 
current_task['summary'] = self.summary if self.summary else '' - return current_task - - def construct_task_details(self, current_task: Dict[str, str]) -> str: - task_details = ( - f"Task: {self.task}\n\n" - f"Next Subtask: {current_task['task']}\n" - f"Suggested Approach: {current_task['suggested_approach']}\n" - f"Important Details: {current_task['important_details']}" + self.logger.debug(f'Delegating to agent: {agent_name}') + # Join the list of strings with newlines if it's a list + approach = ( + '\n'.join(suggested_approach) + if isinstance(suggested_approach, list) + else suggested_approach ) - if self.summary: - task_details = f'Progress so far: {self.summary}\n\n' + task_details - return task_details - - def break_down_task(self, task: str) -> List[Dict[str, str]]: - # Generate the initial prompt for breaking down the task - prompt = get_initial_prompt(task) - # Get the sub-goals from the language model using the generated prompt - return self.get_sub_goals_from_llm(prompt) - - def should_interrupt(self, observation) -> bool: - # Logic to determine if the task should be interrupted - return False # Placeholder - - def summarize_history(self, history) -> str: - # Logic to summarize the history - return 'summary' # Placeholder - - def provide_guidance(self, state: State) -> Action: - # Logic to provide high-level guidance - return AgentFinishAction() # Placeholder - - def create_adjustment_prompt(self, reason: str) -> str: - return adjust_milestones( - self.sub_goals, - self.sub_goals[self.current_goal_index], - reason, - self.summary, - self.task, + return AgentDelegateAction( + agent=agent_name, inputs={'task': task, 'suggested_approach': approach} ) - def get_sub_goals_from_llm(self, prompt: str) -> List[Dict[str, str]]: + def ask_llm( + self, task: str, phase: str, search_results: str = '' + ) -> List[Dict[str, str]]: + prompt = get_prompt(task, phase, search_results) + return self.get_response(prompt) + + def get_response(self, prompt: str) -> List[Dict[str, str]]: content = [TextContent(text=prompt)] message = Message(role='user', content=content) response = self.llm.completion( messages=self.llm.format_messages_for_llm(message) ) return json.loads(response['choices'][0]['message']['content']) - - def append_to_summary(self, summary: str): - """Appends the milestone name and summary to the agent's summary state.""" - self.summary += f'{summary}\n\n' diff --git a/openhands/agenthub/supervisor_agent/prompt.py b/openhands/agenthub/supervisor_agent/prompt.py index e4e0eaedfa22..4d8e68a92df9 100644 --- a/openhands/agenthub/supervisor_agent/prompt.py +++ b/openhands/agenthub/supervisor_agent/prompt.py @@ -1,57 +1,153 @@ -from typing import Dict, List - -from openhands.core.utils import json - HISTORY_SIZE = 20 -# General Description +# General Description, the goal is to devise a manager that is able to iterate if the solution has not been found yet. +# In order to successfully fix an issue there are two phases: +# 1. Exploring the codebase, finding the root cause of the issue. +# 2. Implementing the solution. +# Then the manager needs to check if the issue has been fixed, if not, it needs to iterate. general_description = """ You are a strategic planner AI in a software development team. You have a team of agents -who will complete the tasks you give them. Each agent is an expert in a specific area. -You MUST think CAREFULLY how to complete the task assigned to you. +who will complete the tasks you give them. 
Each agent is an expert in a specific area, +but it can only focus on one very specific sub-task at a time. -You've been given the following task: +Your goal is to complete the following task: %(task)s -As a strategic manager, you create a plan with different sub-tasks and delegate the tasks to your team. -At your disposal, you have a team of agents who will complete tasks for you. However, those agents only focus on the details. -They CANNOT see the big picture. -They need you to define self-contained tasks, that are easy for them to understand and complete. +This task is very complex, it requires careful planning and thinking. +In order to properly complete the task, there are two phases: +- Search: exploring the codebase, finding the relevant details. (e.g. what is the root cause of the issue?) +- Summary: summarising the information you have gathered. +- Code: implementing the solution. (e.g. how to fix the issue?) +As a strategic manager, your goal is to create a suggested approach for phase %(phase)s. + +## Detailed Suggested Approaches +Generate several detailed suggested approaches that will be used by your agents to complete the task. +Each agent will be assigned one of the suggested approaches and will bring you back feedback. +So, be creative and think of as many different approaches as possible. +You are trying to HELP the agents complete the task, you MUST be AS DETAILED AS POSSIBLE. """ -# Initial Prompt -initial_prompt = """ -## Plan -Your goal is to create a high-level plan, a list of subtasks that will bring you closer to the completion of the task. Remember to think -CAREFULLY about how to complete the task. With each subtask, you MUST provide a "suggested approach". -Think, step by step, how you would complete the subtask. Then provide that as the suggested approach. -Try to be as detailed as possible, your goal is to HELP the agent finish the subtask as soon as possible. -You MAY provide a list of "important details" for each subtask. These are details that the agent MUST consider when completing the subtask. +condense_information_prompt = """ +Previously, your agents were tasked to gather information about the codebase. +They have now returned their findings. + +As a strategic manager, your job is to look CAREFULLY at the information they have gathered. +You need to make sure you have a good understanding of the codebase, and the potential solutions +to the task. + +## Information Gathered +%(search_results)s + +## Summary +Do you think you have enough information to complete the task? +If not, you need to request more information from the agents. +Return a list of 1 JSON describing what extra information you would need and the suggested approach to gather that information. +[ + { + "suggested_approach": [""] + } +] +If you have enough information, you need to summarise the information you have gathered. +How would you explain this to a new joiner to the team? +Where would you point them to? +Provide a detailed step by step guide. +Remember, the agents DON'T have access to the internet. Every task must be conducted OFFLINE. +The agents have cloned the repo, so they can open files, browse the code, interact with it... +In the information gathered, there might be some repeated information, or some information +that is actually not relevant. +You need to be able to distinguish what is relevant, and what is not. +In the information you have gathered, there might be file names, function names, class names. 
You MUST include +them in the summary, so the agents know where to look. +Generate a list of 1 JSON with the following format: +[ + { + "summary": [""] + } +] + +IMPORTANT: Be VERY VERY VERY SPECIFIC. +IMPORTANT: Include the file names, function names, class names, code blocks, in the step by step guide. +IMPORTANT: Generate as many steps as possible. +""" + +# Constants for task type choices +TASK_TYPE_ISSUE = 'yes, the task is an issue that needs to be replicated' +TASK_TYPE_FEATURE = 'no, the task is a new feature that needs to be implemented' + +does_it_needs_a_test_prompt = ( + """ +As a strategic manager, you need to judge if the task is an issue that needs to be replicated first +or if it is a new feature that just needs to be implemented. + +Your agents have already gathered information about the codebase. -ONLY generate tasks that are necessary to complete the task. +## Information Gathered +%(search_results)s + +Think CAREFULLY before answering. +What do you think is the best course of action? +IMPORTANT: You MUST return a list of 1 JSON with the following format: +[ + { + "suggested_approach": [""] + } +] + +IMPORTANT: You MUST choose one of the two options. +""" +) +initial_prompt = """ You MUST ONLY generate a list of JSONs: [ { - "task": "", - "suggested_approach": "", - "important_details": "" + "suggested_approach": [""] }, { - "task": "", - "suggested_approach": "", - "important_details": "" + "suggested_approach": [""] }, ] -The tasks MUST be generated in order, they MUST NOT depend on future tasks or previous tasks. They MUST be independent. -You MUST generate at least 1 task. The last task MUST be the implementation task. You WILL NOT need a test file. +Suggested approaches MUST be independent. +You MUST generate at least 1 suggested approach. +IMPORTANT: the agents DON'T have access to the internet. Every task must be conducted OFFLINE. +The agents have cloned the repo, so they can open files, browse the code, interact with it... +The goal of phase 1, exploring the codebase, finding the relevant details is ONLY to collect information. +Be as HELPFUL and DETAILED as possible. +Use the suggested approach to guide the agents in their exploration of the codebase. +They MUST interact with the environment: +- Open as many files as needed to gather as much information as possible. +- Read every piece of code that might be relevant to the task, summarise what does it do. +- Decide which functions are important to the task, understand how they are used and how they are called. + +Remember that the agents can use a Python environment with , e.g.: + +print("Hello World!") + + +They can execute bash commands wrapped with , e.g. ls . +If a bash command returns exit code `-1`, this means the process is not yet finished. +They must then send a second . The second can be empty +(which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process, +or it can contain the text `ctrl+c` to interrupt the process. + +For commands that may run indefinitely, the output should be redirected to a file and the command run +in the background, e.g. python3 app.py > server.log 2>&1 & +If a command execution result says "Command timed out. Sending SIGINT to the process", +the assistant should retry running the command in the background. -For example: -User prompt: +Be VERY VERY SPECIFIC. 
+ +---- START OF EXAMPLE ---- + +## TASK " Enable quiet mode/no-verbose in CLI for use in pre-commit hook There seems to be only an option to increase the level of verbosity when using @@ -61,98 +157,175 @@ long list of fixes that are being applied to the SQL files, which can get quite verbose. " -Your response: +## YOUR RESPONSE: [ - { - "task": "Research SQLFluff CLI verbosity options", - "suggested_approach": "Investigate the current SQLFluff CLI documentation and source code to understand how verbosity levels are currently implemented. Identify if there are any existing flags or settings that can be adjusted to reduce verbosity.", - "important_details": "Focus on the 'fix' command and any related verbosity settings. Document any findings that could be useful for implementing a quiet mode." - }, - { - "task": "Implement the quiet mode feature", - "suggested_approach": "Modify the SQLFluff CLI codebase to add the new quiet mode feature. Implement the necessary changes in the code to support this feature and ensure it can be activated via a command-line flag.", - "important_details": "Write unit tests to verify that the quiet mode works as expected and does not affect other CLI functionalities." - } + { + "suggested_approach": [ + "1. Open the SQLFluff codebase and navigate to the CLI module, likely located in 'src/sqlfluff/cli/'.", + "2. Locate the file responsible for parsing command-line arguments, such as 'commands.py' or 'cli.py'.", + "3. Examine how the '--verbose' flag is implemented in the code.", + "4. Identify if there is an existing '--quiet' or '--no-verbose' option.", + "5. Understand how verbosity levels are set and managed within the CLI code.", + "6. Look for any variables or settings that control the default verbosity level.", + "7. Determine how the '--verbose' flag increases verbosity and see if a similar mechanism can decrease verbosity.", + "8. Note down any functions or methods that output information to the console.", + "9. Identify how these functions can be controlled via verbosity levels.", + "10. Summarize findings and consider how to implement a '--quiet' flag." + ] + }, + { + "suggested_approach": [ + "1. Investigate the logging configuration in SQLFluff, possibly located in 'src/sqlfluff/core/logger.py' or similar.", + "2. Understand how logging levels are set (e.g., DEBUG, INFO, WARNING, ERROR).", + "3. Examine if the logging levels are affected by CLI arguments.", + "4. Identify where in the code the logging configuration is initialized based on user input.", + "5. Check if there is a way to adjust the logging level via a CLI option.", + "6. Determine if adding a '--quiet' flag can set the logging level to WARNING or ERROR to suppress INFO messages.", + "7. Note the changes needed in the logging setup to support a quiet mode.", + "8. Identify all logging statements that may need to respect the new logging level.", + "9. Consider the impact on existing functionality and ensure that critical messages are still displayed.", + "10. Summarize how logging can be adjusted to implement a quiet mode." + ] + }, + { + "suggested_approach": [ + "1. Analyze how output to the console is handled throughout the codebase.", + "2. Identify the functions used for outputting messages, such as 'click.echo', 'print', or custom wrapper functions.", + "3. Trace where these output functions are called in the code, especially during 'sqlfluff fix' execution.", + "4. Determine if there is a centralized output function or if output is scattered across multiple functions.", + "5. 
Assess whether output functions can be modified to check a verbosity level before printing.", + "6. Consider creating or modifying a wrapper function that respects a verbosity or quiet setting.", + "7. Identify any messages that should always be displayed, regardless of verbosity settings (e.g., errors).", + "8. Note the locations in the code where changes need to be made to control output.", + "9. Evaluate the feasibility of implementing a quiet mode by adjusting output functions.", + "10. Summarize the steps required to control output at the source." + ] + }, + { + "suggested_approach": [ + "1. Explore the configuration options available in SQLFluff by examining the configuration parser code, possibly in 'src/sqlfluff/core/config.py'.", + "2. Look for existing configuration parameters related to verbosity or output control.", + "3. Determine how configuration files (like '.sqlfluff') are parsed and applied.", + "4. Assess if a new configuration option can be introduced to control verbosity levels.", + "5. Identify how this configuration option can be read and applied during runtime.", + "6. Check if the CLI options can override configuration file settings for verbosity.", + "7. Map out the code changes required to implement and support a new configuration option.", + "8. Ensure that the new configuration integrates smoothly with existing settings.", + "9. Consider user documentation and how users would be informed about the new option.", + "10. Summarize the process of adding a verbosity control via configuration files." + ] + }, + { + "suggested_approach": [ + "1. Examine the implementation of the 'sqlfluff fix' command to understand its workflow.", + "2. Identify where the command generates output and how that output is formatted.", + "3. Determine if 'sqlfluff fix' has different output modes or formats based on context.", + "4. Check if the command detects when it's running in a pre-commit hook or similar environment.", + "5. Consider if output suppression can be contextually applied when running in certain environments.", + "6. Identify any existing mechanisms for output control based on execution context.", + "7. Explore how the 'black' formatter handles output suppression in pre-commit hooks.", + "8. Analyze if similar techniques can be applied within SQLFluff's codebase.", + "9. Note any dependencies or external factors that influence output generation.", + "10. Summarize how context-aware output control can be implemented." + ] + } ] -""" - -adjustment_prompt = """ - - This is the current active plan that your agents are working on: - %(milestones)s - - And this is the current subtask that your agents are working on: - ## Current subtask - subtask: %(milestone_task)s - Suggested Approach: %(milestone_suggested_approach)s - Important Details: %(milestone_important_details)s - However, it seems that the current subtask is not being completed successfully. - Because of the following reason: %(reason)s - You have the following contextual information that has been gathered up to this point. - This information MIGHT help you adjust the plan: - %(summary)s +---- END OF EXAMPLE ---- - ## Task - As a strategic manager, you must reflect on the failed subtask and decide on the necessary adjustments. Consider the following: - 1. Analyze the reason for failure and determine if the suggested approach or important details need modification. - 2. Decide if the failed subtask should be split into smaller, more manageable tasks. - 3. 
Consider if new plan need to be added to address any gaps in the plan. - 4. Update the remaining plan to ensure the overall plan remains feasible and effective. +--- START OF EXAMPLE 2 --- - You MUST NOT change the task you were given. +## TASK +" +ModelChain.prepare_inputs can succeed with missing dhi From the docstring for `ModelChain.prepare_inputs()` +I believe the method should fail if `weather` does not have a `dhi` column. The validation checks for `'ghi'` twice, +but not `'dhi`' https://github.com/pvlib/pvlib-python/blob/11c356f9a89fc88b4d3ff368ce1aae170a97ebd7/pvlib/modelchain.py#L1136 +" - You MUST make changes to the current subtask or to the ones AFTER. In NO case you can change the ones BEFORE. - Generate ONLY a list of JSONs. Do NOT generate any markdown or comments. - """ +## YOUR RESPONSE: +[ + { + "suggested_approach": [ + "1. Open the file pvlib/modelchain.py and locate the ModelChain.prepare_inputs method. Carefully read through the method's code, focusing on the section where it validates the weather DataFrame columns, specifically around line 1136.", + "2. Identify the validation checks for the weather DataFrame. Note whether it checks for the presence of 'dhi' or mistakenly checks for 'ghi' twice.", + "3. Examine the docstring of ModelChain.prepare_inputs to understand the expected behavior when dhi is missing from the weather data.", + "4. Investigate any helper functions called within prepare_inputs that handle irradiance data, such as methods for inferring missing components.", + "5. Review the unit tests related to prepare_inputs in pvlib/tests/test_modelchain.py to see if cases with missing dhi are covered.", + "6. Use the Python environment to simulate calling prepare_inputs with weather data missing the dhi column and observe the outcome.", + "", + "import pvlib", + "from pvlib import modelchain, location, pvsystem", + "import pandas as pd", + "mc = modelchain.ModelChain(pvsystem.PVSystem(), location.Location(32.2, -110.9))", + "weather = pd.DataFrame({'ghi': [1000], 'dni': [800]})", + "mc.prepare_inputs(weather)", + "", + "7. Document any discrepancies between the code and the documentation, and note any unexpected behaviors." + ] + }, + { + "suggested_approach": [ + "1. Generate a flowchart of the prepare_inputs method to understand its logic and how it processes the weather DataFrame.", + "2. Open pvlib/modelchain.py and trace each step within prepare_inputs, paying attention to how it handles missing data.", + "3. Look for any conditional statements that manage cases where dhi is not provided and see if alternative calculations are performed or if an error is raised.", + "4. Explore related methods like complete_irradiance or irradiance.get_total_irradiance to see how missing components are handled.", + "5. Test different weather DataFrame scenarios in the Python environment to observe how prepare_inputs behaves with various missing columns.", + "", + "import pvlib", + "from pvlib import modelchain, location, pvsystem", + "import pandas as pd", + "mc = modelchain.ModelChain(pvsystem.PVSystem(), location.Location(32.2, -110.9))", + "# Weather data missing 'dhi'", + "weather_missing_dhi = pd.DataFrame({'ghi': [1000], 'dni': [800]})", + "mc.prepare_inputs(weather_missing_dhi)", + "# Weather data missing 'ghi'", + "weather_missing_ghi = pd.DataFrame({'dhi': [200], 'dni': [800]})", + "mc.prepare_inputs(weather_missing_ghi)", + "", + "6. Record the outcomes and any exceptions raised to determine if the method behaves as intended." 
+ ] + }, + { + "suggested_approach": [ + "1. Analyze the git commit history for modelchain.py to identify when the validation issue was introduced.", + "", + "cd pvlib-python", + "git log -L 1136,1140 /modelchain.py", + "", + "2. Review the changes in each commit affecting the validation checks in prepare_inputs.", + "3. Open the relevant commits and examine the differences in the validation code.", + "4. Check for any related issues or pull requests in the repository's local clone that discuss missing dhi validation.", + "5. Look into the test coverage reports (if available locally) to see if the validation logic is adequately tested.", + "6. Summarize findings on whether the issue is a recent regression or an existing oversight." + ] + } +] -def get_initial_prompt(task: str) -> str: - formatted_prompt = (general_description + initial_prompt) % { - 'task': task, - } +--- END OF EXAMPLE 2 --- - # Add instruction to not include json formatting - formatted_prompt += '\n\nIMPORTANT: Do not include ```json at the start or ``` at the end of your response. Just return the raw JSON list.' +--- YOUR TURN --- - return formatted_prompt +## TASK +%(task)s +## YOUR RESPONSE: +""" -def adjust_milestones( - milestones: List[Dict], - subtask: Dict[str, str], - reason: str, - summary: str, - task: str, -) -> str: - """Adjusts the milestones based on a failed subtask and its reason. - Parameters: - - milestones (List[Dict]): The current list of milestones. - - subtask (Dict): The subtask that was not completed successfully. - - reason (str): The reason provided for the failure. - - summary (str): A summary of everything up to this point. - - task (str): The user's task. +def get_prompt(task: str, phase: str, search_results: str = '') -> str: + if phase == 'search': + base_prompt = general_description + initial_prompt + elif phase == 'summary': + base_prompt = general_description + condense_information_prompt - Returns: A prompt for the strategic manager agent to self-reflect and adjust the milestones. 
- """ - # Extract values from the subtask dictionary - milestone_task = subtask['task'] - milestone_suggested_approach = subtask['suggested_approach'] - milestone_important_details = subtask['important_details'] - - # Get the formatted prompt - formatted_prompt = (general_description + adjustment_prompt) % { - 'milestones': json.dumps(milestones), - 'reason': reason, - 'summary': summary, + formatted_prompt = base_prompt % { 'task': task, - 'milestone_task': milestone_task, - 'milestone_suggested_approach': milestone_suggested_approach, - 'milestone_important_details': milestone_important_details, + 'phase': phase, + 'search_results': search_results, } # Add instruction to not include json formatting From 04c56c65449b1793cab879a2e5d54d056e7db37c Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Mon, 28 Oct 2024 18:48:14 +0100 Subject: [PATCH 12/18] fix --- openhands/runtime/builder/docker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openhands/runtime/builder/docker.py b/openhands/runtime/builder/docker.py index 2c5a965b1dd0..5a22302d61d5 100644 --- a/openhands/runtime/builder/docker.py +++ b/openhands/runtime/builder/docker.py @@ -70,7 +70,6 @@ def build( f'--build-arg=OPENHANDS_RUNTIME_BUILD_TIME={datetime.datetime.now().isoformat()}', f'--tag={target_image_hash_name}', '--load', - '--platform=linux/amd64', ] # Include the platform argument only if platform is specified From 399f19ebf539e4cb44763e168b2ab25751ce7016 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Thu, 31 Oct 2024 11:14:16 -0700 Subject: [PATCH 13/18] MAS --- openhands/agenthub/__init__.py | 2 + .../agenthub/codeact_agent/codeact_agent.py | 11 + .../agenthub/searcher_agent/action_parser.py | 19 +- openhands/agenthub/searcher_agent/agent.py | 162 +++++---- openhands/agenthub/searcher_agent/prompt.py | 99 +++++- openhands/agenthub/supervisor_agent/agent.py | 99 ++++-- openhands/agenthub/tester_agent/__init__.py | 4 + .../agenthub/tester_agent/action_parser.py | 158 +++++++++ openhands/agenthub/tester_agent/agent.py | 201 +++++++++++ openhands/agenthub/tester_agent/prompt.py | 329 ++++++++++++++++++ 10 files changed, 962 insertions(+), 122 deletions(-) create mode 100644 openhands/agenthub/tester_agent/__init__.py create mode 100644 openhands/agenthub/tester_agent/action_parser.py create mode 100644 openhands/agenthub/tester_agent/agent.py create mode 100644 openhands/agenthub/tester_agent/prompt.py diff --git a/openhands/agenthub/__init__.py b/openhands/agenthub/__init__.py index 96c766124989..1ec266ce7501 100644 --- a/openhands/agenthub/__init__.py +++ b/openhands/agenthub/__init__.py @@ -16,6 +16,7 @@ planner_agent, searcher_agent, supervisor_agent, + tester_agent, ) __all__ = [ @@ -27,6 +28,7 @@ 'browsing_agent', 'searcher_agent', 'supervisor_agent', + 'tester_agent', ] for agent in all_microagents.values(): diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index d1f67eae9c2c..4dbb1503dcc0 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -10,6 +10,7 @@ from openhands.controller.agent import Agent from openhands.controller.state.state import State from openhands.core.config import AgentConfig +from openhands.core.config.llm_config import LLMConfig from openhands.core.logger import openhands_logger as logger from openhands.core.message import ImageContent, Message, TextContent from openhands.events.action import ( @@ -81,6 +82,16 @@ def __init__( Parameters: - llm (LLM): The llm to 
be used by this agent """ + + llm_config = LLMConfig( + model='litellm_proxy/claude-3-5-sonnet-20241022', + api_key='REDACTED', + temperature=0.0, + base_url='https://llm-proxy.app.all-hands.dev', + ) + llm = LLM(llm_config) + # TODO: Remove this once we have a real AgentConfig + config = AgentConfig(llm_config='o1-mini') super().__init__(llm, config) self.reset() diff --git a/openhands/agenthub/searcher_agent/action_parser.py b/openhands/agenthub/searcher_agent/action_parser.py index 54ad267cad8b..46846641b8df 100644 --- a/openhands/agenthub/searcher_agent/action_parser.py +++ b/openhands/agenthub/searcher_agent/action_parser.py @@ -39,15 +39,12 @@ def parse_response(self, response) -> str: action = response.choices[0].message.content if action is None: return '' - for lang in ['bash', 'ipython', 'browse']: - # special handling for DeepSeek: it has stop-word bug and returns + for lang in ['bash', 'ipython']: if f'' not in action: action = action.replace(f'') if f'' in action and f'' not in action: action += f'' - if '' not in action: - action += '' return action def parse_action(self, action_str: str) -> Action: @@ -68,14 +65,16 @@ def __init__( self.finish_command = None def check_condition(self, action_str: str) -> bool: - self.finish_command = re.search(r'.*', action_str, re.DOTALL) + self.finish_command = re.search( + r'(.*?)', action_str, re.DOTALL + ) return self.finish_command is not None def parse(self, action_str: str) -> Action: assert ( self.finish_command is not None ), 'self.finish_command should not be None when parse is called' - output = action_str.replace(self.finish_command.group(0), '').strip() + output = self.finish_command.group(1).strip() outputs = {'output': output} return AgentFinishAction(outputs=outputs) @@ -114,9 +113,7 @@ class SearcherAgentActionParserIPythonRunCell(ActionParser): - IPythonRunCellAction(code) - IPython code to run """ - def __init__( - self, - ): + def __init__(self): self.python_code = None self.jupyter_kernel_init_code: str = 'from agentskills import *' @@ -127,9 +124,7 @@ def check_condition(self, action_str: str) -> bool: return self.python_code is not None def parse(self, action_str: str) -> Action: - assert ( - self.python_code is not None - ), 'self.python_code should not be None when parse is called' + assert self.python_code is not None code_group = self.python_code.group(1).strip() thought = action_str.replace(self.python_code.group(0), '').strip() return IPythonRunCellAction( diff --git a/openhands/agenthub/searcher_agent/agent.py b/openhands/agenthub/searcher_agent/agent.py index f987c0708310..195f3823eed7 100644 --- a/openhands/agenthub/searcher_agent/agent.py +++ b/openhands/agenthub/searcher_agent/agent.py @@ -7,25 +7,35 @@ from openhands.core.config import AgentConfig from openhands.core.config.llm_config import LLMConfig from openhands.core.message import Message, TextContent -from openhands.events.action import Action, AgentFinishAction -from openhands.events.action.commands import CmdRunAction, IPythonRunCellAction +from openhands.events.action import Action, AgentFinishAction, IPythonRunCellAction +from openhands.events.action.commands import CmdRunAction from openhands.events.action.message import MessageAction -from openhands.events.observation.commands import ( - CmdOutputObservation, - IPythonRunCellObservation, -) +from openhands.events.observation import IPythonRunCellObservation +from openhands.events.observation.commands import CmdOutputObservation from openhands.events.observation.error import ErrorObservation from 
openhands.events.observation.observation import Observation from openhands.events.observation.reject import UserRejectObservation from openhands.llm.llm import LLM +from openhands.runtime.plugins.agent_skills import AgentSkillsRequirement +from openhands.runtime.plugins.jupyter import JupyterRequirement +from openhands.runtime.plugins.requirement import PluginRequirement +# WIP: Make this agent be able to detect when to stop and automatically stop (or make the supervisor able to stop the agent). class SearcherAgent(Agent): VERSION = '1.0' """ The Searcher Agent is an agent that searches the codebase for relevant information. """ + sandbox_plugins: list[PluginRequirement] = [ + # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since + # AgentSkillsRequirement provides a lot of Python functions, + # and it needs to be initialized before Jupyter for Jupyter to use those functions. + AgentSkillsRequirement(), + JupyterRequirement(), + ] + action_parser = SearcherAgentResponseParser() def __init__(self, llm: LLM, config: AgentConfig): @@ -47,67 +57,6 @@ def __init__(self, llm: LLM, config: AgentConfig): self.logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG) # Set the logging level - def step(self, state: State) -> Action: - """Performs one step using the Searcher Agent. - This includes gathering info about the codebase and summarizing relevant information. - - Parameters: - - state (State): used to get updated info - - Returns: - - Action: The next action to take - """ - # Check if we should exit - latest_user_message = state.history.get_last_user_message() - if latest_user_message and latest_user_message.strip() == '/exit': - return AgentFinishAction() - - # Prepare messages for LLM - messages = [] - - # Add system and initial messages - task: str = state.inputs.get('task', '') - suggested_approach: str = state.inputs.get('suggested_approach', '') - messages.extend( - [ - Message( - role='system', - content=[TextContent(text=get_prompt(task, suggested_approach))], - ) - ] - ) - - # Add history messages - for event in state.history.get_events(): - if isinstance(event, Action): - message = self.get_action_message(event) - elif isinstance(event, Observation): - message = self.get_observation_message(event) - else: - raise ValueError(f'Unknown event type: {type(event)}') - - if message: - # Handle consecutive messages from same role - if messages and messages[-1].role == message.role: - messages[-1].content.extend(message.content) - else: - messages.append(message) - - # Get response from LLM - params = { - 'messages': self.llm.format_messages_for_llm(messages), - 'stop': [ - '', - '', - '', - ], - } - - response = self.llm.completion(**params) - - # Parse and return the next action - return self.action_parser.parse(response) - def get_action_message(self, action: Action) -> Message | None: """Convert an Action to a Message for the LLM conversation. @@ -162,6 +111,13 @@ def get_observation_message(self, obs: Observation) -> Message | None: return Message(role='user', content=[TextContent(text=text)]) elif isinstance(obs, IPythonRunCellObservation): text = obs_prefix + obs.content + splitted = text.split('\n') + for i, line in enumerate(splitted): + if '![image](data:image/png;base64,' in line: + splitted[i] = ( + '![image](data:image/png;base64, ...) 
already displayed to user' + ) + text = '\n'.join(splitted) return Message(role='user', content=[TextContent(text=text)]) elif isinstance(obs, ErrorObservation): text = obs_prefix + obs.content @@ -173,3 +129,75 @@ def get_observation_message(self, obs: Observation) -> Message | None: return Message(role='user', content=[TextContent(text=text)]) else: raise ValueError(f'Unknown observation type: {type(obs)}') + + def step(self, state: State) -> Action: + """Performs one step using the SearcherAgent. + This includes gathering info on previous steps and prompting the model to make a command to execute. + + Parameters: + - state (State): used to get updated info + + Returns: + - CmdRunAction(command) - bash command to run + - IPythonRunCellAction(code) - IPython code to run + - MessageAction(content) - Message action to run (e.g. ask for clarification) + - AgentFinishAction() - end the interaction + """ + + # prepare what we want to send to the LLM + messages = self._get_messages(state) + params = { + 'messages': self.llm.format_messages_for_llm(messages), + 'stop': [ + '', + '', + ], + } + + response = self.llm.completion(**params) + + return self.action_parser.parse(response) + + def _get_messages(self, state: State) -> list[Message]: + # Get task and suggested approach from state inputs + task = state.inputs.get('task', '') + suggested_approach = state.inputs.get('suggested_approach', '') + + messages: list[Message] = [ + Message( + role='system', + content=[ + TextContent( + text=get_prompt(task, suggested_approach), + cache_prompt=self.llm.is_caching_prompt_active(), + ) + ], + ), + ] + + for event in state.history.get_events(): + # create message from event + if isinstance(event, Action): + message = self.get_action_message(event) + elif isinstance(event, Observation): + message = self.get_observation_message(event) + else: + raise ValueError(f'Unknown event type: {type(event)}') + + # add regular message + if message: + # handle error if the message is the SAME role as the previous message + if messages and messages[-1].role == message.role: + messages[-1].content.extend(message.content) + else: + messages.append(message) + + # Add caching to the last 2 user messages + if self.llm.is_caching_prompt_active(): + user_turns_processed = 0 + for message in reversed(messages): + if message.role == 'user' and user_turns_processed < 2: + message.content[-1].cache_prompt = True + user_turns_processed += 1 + + return messages diff --git a/openhands/agenthub/searcher_agent/prompt.py b/openhands/agenthub/searcher_agent/prompt.py index bfc9bc612647..6479cda7eab6 100644 --- a/openhands/agenthub/searcher_agent/prompt.py +++ b/openhands/agenthub/searcher_agent/prompt.py @@ -4,21 +4,21 @@ # 2. Implementing the solution. # Then the manager needs to check if the issue has been fixed, if not, it needs to iterate. general_description = """ -You are a detail-oriented AI, an expert in searching through files and code. -You are also an expert in summarising code and its purpose. +The assistant is a detail-oriented AI, an expert in searching through files and code. +The assistant is also an expert in summarising code and its purpose. As a detail-oriented AI, you MUST always read more and more code until you are sure you have found all the information you need. -Your goal is to gather information about the codebase to help the programmer fix the issue. +The assistant's goal is to gather information about the codebase to help the programmer fix the issue. 
Here is the task you are trying to complete: %(task)s -IMPORTANT: YOU SHOULD NEVER TRY TO IMPLEMENT A SOLUTION. YOUR ONLY GOAL IS TO GATHER INFORMATION. +IMPORTANT: THE ASSISTANT SHOULD NEVER TRY TO IMPLEMENT A SOLUTION. THE ASSISTANT'S ONLY GOAL IS TO GATHER INFORMATION. As an expert in searching through files and code, you have been equipped with a set of tools that will help you gather information about the codebase: -- You can execute bash commands wrapped with , e.g. ls . +- The assistant can execute bash commands wrapped with , e.g. ls . - If a bash command returns exit code `-1`, this means the process is not yet finished. -- You must then send a second . The second can be empty +- The assistant must then send a second . The second can be empty (which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process, or it can contain the text `ctrl+c` to interrupt the process. - For commands that may run indefinitely, the output should be redirected to a file and the command run @@ -26,9 +26,85 @@ - If a command execution result says "Command timed out. Sending SIGINT to the process", you should retry running the command in the background. -You should ONLY `run` commands that have no side-effects, like `ls` and `grep`. +The assistant should ONLY `run` commands that have no side-effects, like `ls` and `grep`. -Your manager gave you a suggested approach that you should follow: +The assistant can use a Python environment with , e.g.: + +print("Hello World!") + + +The assistant can install Python packages using the %%pip magic command in an IPython environment by using the following syntax: %%pip install [package needed] and should always import packages and define variables before starting to use them. + +Apart from the standard Python library, the assistant can also use the following functions (already imported) in environment: +open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None: + Opens the file at the given path in the editor. IF the file is to be edited, first use `scroll_down` repeatedly to read the full file! + If line_number is provided, the window will be moved to include that line. + It only shows the first 100 lines by default! `context_lines` is the max number of lines to be displayed, up to 100. Use `scroll_up` and `scroll_down` to view more content up or down. + Args: + path: str: The path to the file to open, preferred absolute path. + line_number: int | None = 1: The line number to move to. Defaults to 1. + context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100. + +goto_line(line_number: int) -> None: + Moves the window to show the specified line number. + Args: + line_number: int: The line number to move to. + +scroll_down() -> None: + Moves the window down by 100 lines. + Args: + None + +scroll_up() -> None: + Moves the window up by 100 lines. + Args: + None + +search_dir(search_term: str, dir_path: str = './') -> None: + Searches for search_term in all files in dir. If dir is not provided, searches in the current directory. + Args: + search_term: str: The term to search for. + dir_path: str: The path to the directory to search. + +search_file(search_term: str, file_path: str | None = None) -> None: + Searches for search_term in file. If file is not provided, searches in the current open file. + Args: + search_term: str: The term to search for.
+ file_path: str | None: The path to the file to search. + +find_file(file_name: str, dir_path: str = './') -> None: + Finds all files with the given name in the specified directory. + Args: + file_name: str: The name of the file to find. + dir_path: str: The path to the directory to search. + +parse_pdf(file_path: str) -> None: + Parses the content of a PDF file and prints it. + Args: + file_path: str: The path to the file to open. + +parse_docx(file_path: str) -> None: + Parses the content of a DOCX file and prints it. + Args: + file_path: str: The path to the file to open. + +parse_latex(file_path: str) -> None: + Parses the content of a LaTex file and prints it. + Args: + file_path: str: The path to the file to open. + +parse_pptx(file_path: str) -> None: + Parses the content of a pptx file and prints it. + Args: + file_path: str: The path to the file to open. + + +IMPORTANT: +- `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits! +- Indentation is important and code that is not indented correctly will fail and require fixing before it can be run. +- Any code issued should be less than 50 lines to avoid context being cut off! + +The assistant's manager gave you a suggested approach that you should follow: %(suggested_approach)s Follow the suggested approach to gather information about the codebase. @@ -52,13 +128,14 @@ IMPORTANT: Every entry in the JSON MUST be relevant to the task. IMPORTANT: The JSON MUST be contained inside and tags. -IMPORTANT: You MUST have at least one file in the response. - +IMPORTANT: The assistant MUST have at least one file in the response. +IMPORTANT: THE ASSISTANT MUST NOT modify the codebase or ADD any new files.
""" def get_prompt(task: str, suggested_approach: str) -> str: - formatted_prompt = (general_description) % { + # Escape any % characters in the input strings + formatted_prompt = general_description % { 'task': task, 'suggested_approach': suggested_approach, } diff --git a/openhands/agenthub/supervisor_agent/agent.py b/openhands/agenthub/supervisor_agent/agent.py index 034a3ba1b7e1..0c0ba4e83b3d 100644 --- a/openhands/agenthub/supervisor_agent/agent.py +++ b/openhands/agenthub/supervisor_agent/agent.py @@ -8,8 +8,8 @@ from openhands.controller.agent import Agent from openhands.controller.state.state import State from openhands.core.config import AgentConfig +from openhands.core.config.llm_config import LLMConfig from openhands.core.message import Message, TextContent -from openhands.core.schema.action import ActionType from openhands.core.utils import json from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction from openhands.events.action.agent import AgentRejectAction @@ -29,8 +29,9 @@ class SupervisorAgent(Agent): suggested_approach_index: int = -1 # -1 Because we increment it before using it results: Dict[str, List[Any]] = {'search': [], 'code': []} condensed_information: str = '' - does_it_needs_a_test: str = '' + does_it_needs_a_test: bool = False task: str = '' + test_command: str = '' phase: Literal['search', 'summary', 'code'] = 'search' def __init__(self, llm: LLM, config: AgentConfig): @@ -39,6 +40,12 @@ def __init__(self, llm: LLM, config: AgentConfig): Parameters: - llm (LLM): The llm to be used by this agent """ + llm_config = LLMConfig( + model='openai/o1-mini', api_key='REDACTED', temperature=1.0 + ) + llm = LLM(llm_config) + # TODO: Remove this once we have a real AgentConfig + config = AgentConfig(llm_config='o1-mini') super().__init__(llm, config) # Set up logger self.logger = logging.getLogger(__name__) @@ -49,18 +56,16 @@ def step(self, state: State) -> Action: self.logger.debug('Starting step with state: %s', state) self.logger.debug('LLM config: %s', self.llm_config) - if not self.suggested_approaches: + if len(self.suggested_approaches) == 0: self.suggested_approaches = self.get_suggested_approaches(state) self.suggested_approach_index += 1 last_observation = state.history.get_last_observation() - if ( - isinstance(last_observation, AgentDelegateObservation) - and last_observation.outputs.get('action', '') == ActionType.FINISH - ): + # At first the history is empty, so we proceed to the SearchAgent + if isinstance(last_observation, AgentDelegateObservation): self.results[self.phase].append(last_observation.outputs.get('output', '')) - if len(self.results[self.phase]) < len(self.suggested_approaches): + if self.suggested_approach_index < len(self.suggested_approaches): # Delegate to the SearcherAgent as we need to gather more information return self.delegate_to_agent( 'SearcherAgent', @@ -71,33 +76,57 @@ def step(self, state: State) -> Action: ) if self.phase == 'search': - # We don't change the phase until we have the condensed information condensed_information = self.ask_llm( - self.task, '2', json.dumps(self.results['search']) - )[0] - if condensed_information.get('summary', '') != '': - self.phase = 'summary' - self.condensed_information = condensed_information.get('summary', '') - else: - suggested_approach: str | list[str] = condensed_information.get( - 'suggested_approach', [] - ) - self.results['search'].append(suggested_approach) - return self.delegate_to_agent( - 'SearcherAgent', self.task, suggested_approach - ) + self.task, 
'summary', self.results[self.phase] + ) + if condensed_information and len(condensed_information) > 0: + first_result = condensed_information[0] + if first_result.get('summary', '') != '': + self.phase = 'summary' + self.condensed_information = first_result.get('summary', '') + else: + suggested_approach: str | list[str] = first_result.get( + 'suggested_approach', [] + ) + self.results['search'].append(suggested_approach) + return self.delegate_to_agent( + 'SearcherAgent', self.task, suggested_approach + ) if self.phase == 'summary': - # Now we have to judge if this issue requires a test or not before fixing it - does_it_needs_a_test = self.ask_llm( - self.task, 'code', self.condensed_information - )[0] - if does_it_needs_a_test.get('suggested_approach', '') == TASK_TYPE_ISSUE: - self.phase = 'code' - else: + if not self.does_it_needs_a_test: + test_check = self.ask_llm(self.task, 'code', self.condensed_information) + first_check = ( + test_check[0] if test_check and len(test_check) > 0 else {} + ) + self.does_it_needs_a_test = ( + first_check.get('suggested_approach', '') == TASK_TYPE_ISSUE + ) self.phase = 'code' - - # WIP: Implement the code phase + if self.does_it_needs_a_test: + self.current_delegate = 'TesterAgent' + return AgentDelegateAction( + agent='TesterAgent', + inputs={ + 'task': self.task, + 'summary': self.condensed_information, + }, + ) + if self.phase == 'code': + if ( + self.does_it_needs_a_test + and last_observation is not None + and isinstance(last_observation, AgentDelegateObservation) + ): + self.test_command = last_observation.outputs.get('output', '') + return AgentDelegateAction( + agent='CoderAgent', + inputs={ + 'task': self.task, + 'summary': self.condensed_information, + 'test_command': self.test_command, + }, + ) return AgentFinishAction() @@ -114,6 +143,7 @@ def delegate_to_agent( self, agent_name: str, task: str, suggested_approach: Union[str, List[str]] ) -> AgentDelegateAction: self.logger.debug(f'Delegating to agent: {agent_name}') + self.current_delegate = agent_name # Join the list of strings with newlines if it's a list approach = ( '\n'.join(suggested_approach) @@ -125,8 +155,11 @@ def delegate_to_agent( ) def ask_llm( - self, task: str, phase: str, search_results: str = '' + self, task: str, phase: str, search_results: Union[str, List[str]] = '' ) -> List[Dict[str, str]]: + # Format search_results as one item per line if it's a list + if isinstance(search_results, list): + search_results = '\n'.join(search_results) prompt = get_prompt(task, phase, search_results) return self.get_response(prompt) @@ -136,4 +169,6 @@ def get_response(self, prompt: str) -> List[Dict[str, str]]: response = self.llm.completion( messages=self.llm.format_messages_for_llm(message) ) + if isinstance(response, list): + return json.loads(response[0]['message']['content']) return json.loads(response['choices'][0]['message']['content']) diff --git a/openhands/agenthub/tester_agent/__init__.py b/openhands/agenthub/tester_agent/__init__.py new file mode 100644 index 000000000000..54be665abd42 --- /dev/null +++ b/openhands/agenthub/tester_agent/__init__.py @@ -0,0 +1,4 @@ +from openhands.agenthub.tester_agent.agent import TesterAgent +from openhands.controller.agent import Agent + +Agent.register('TesterAgent', TesterAgent) diff --git a/openhands/agenthub/tester_agent/action_parser.py b/openhands/agenthub/tester_agent/action_parser.py new file mode 100644 index 000000000000..8abc7c353916 --- /dev/null +++ b/openhands/agenthub/tester_agent/action_parser.py @@ -0,0 +1,158 @@ +import 
re + +from openhands.controller.action_parser import ( + ActionParser, + ResponseParser, +) +from openhands.events.action import ( + Action, + AgentFinishAction, + CmdRunAction, + IPythonRunCellAction, + MessageAction, +) + + +class TesterAgentResponseParser(ResponseParser): + """Parser action: + - CmdRunAction(command) - bash command to run + - IPythonRunCellAction(code) - IPython code to run + - MessageAction(content) - Message action to run (e.g. ask for clarification) + - AgentFinishAction() - end the interaction + """ + + def __init__(self): + # Need pay attention to the item order in self.action_parsers + super().__init__() + self.action_parsers = [ + TesterAgentActionParserFinish(), + TesterAgentActionParserCmdRun(), + TesterAgentActionParserIPythonRunCell(), + ] + self.default_parser = TesterAgentActionParserMessage() + + def parse(self, response) -> Action: + action_str = self.parse_response(response) + return self.parse_action(action_str) + + def parse_response(self, response) -> str: + action = response.choices[0].message.content + if action is None: + return '' + for lang in ['bash', 'ipython', 'browse']: + # special handling for DeepSeek: it has stop-word bug and returns + if f'' not in action: + action = action.replace(f'') + + if f'' in action and f'' not in action: + action += f'' + if '' not in action: + action += '' + return action + + def parse_action(self, action_str: str) -> Action: + for action_parser in self.action_parsers: + if action_parser.check_condition(action_str): + return action_parser.parse(action_str) + return self.default_parser.parse(action_str) + + +class TesterAgentActionParserFinish(ActionParser): + """Parser action: + - AgentFinishAction() - end the interaction + """ + + def __init__( + self, + ): + self.finish_command = None + + def check_condition(self, action_str: str) -> bool: + self.finish_command = re.search(r'.*', action_str, re.DOTALL) + return self.finish_command is not None + + def parse(self, action_str: str) -> Action: + assert ( + self.finish_command is not None + ), 'self.finish_command should not be None when parse is called' + output = self.finish_command.group(1).strip() + outputs = {'output': output} + return AgentFinishAction(outputs=outputs) + + +class TesterAgentActionParserCmdRun(ActionParser): + """Parser action: + - CmdRunAction(command) - bash command to run + - AgentFinishAction() - end the interaction + """ + + def __init__( + self, + ): + self.bash_command = None + + def check_condition(self, action_str: str) -> bool: + self.bash_command = re.search( + r'(.*?)', action_str, re.DOTALL + ) + return self.bash_command is not None + + def parse(self, action_str: str) -> Action: + assert ( + self.bash_command is not None + ), 'self.bash_command should not be None when parse is called' + thought = action_str.replace(self.bash_command.group(0), '').strip() + # a command was found + command_group = self.bash_command.group(1).strip() + if command_group.strip() == 'exit': + return AgentFinishAction(thought=thought) + return CmdRunAction(command=command_group, thought=thought) + + +class TesterAgentActionParserIPythonRunCell(ActionParser): + """Parser action: + - IPythonRunCellAction(code) - IPython code to run + """ + + def __init__( + self, + ): + self.python_code = None + self.jupyter_kernel_init_code: str = 'from agentskills import *' + + def check_condition(self, action_str: str) -> bool: + self.python_code = re.search( + r'(.*?)', action_str, re.DOTALL + ) + return self.python_code is not None + + def parse(self, action_str: str) 
-> Action: + assert ( + self.python_code is not None + ), 'self.python_code should not be None when parse is called' + code_group = self.python_code.group(1).strip() + thought = action_str.replace(self.python_code.group(0), '').strip() + return IPythonRunCellAction( + code=code_group, + thought=thought, + kernel_init_code=self.jupyter_kernel_init_code, + ) + + +class TesterAgentActionParserMessage(ActionParser): + """Parser action: + - MessageAction(content) - Message action to run (e.g. ask for clarification) + """ + + def __init__( + self, + ): + pass + + def check_condition(self, action_str: str) -> bool: + # We assume the LLM is GOOD enough that when it returns pure natural language + # it wants to talk to the user + return True + + def parse(self, action_str: str) -> Action: + return MessageAction(content=action_str, wait_for_response=True) diff --git a/openhands/agenthub/tester_agent/agent.py b/openhands/agenthub/tester_agent/agent.py new file mode 100644 index 000000000000..3dc449f43430 --- /dev/null +++ b/openhands/agenthub/tester_agent/agent.py @@ -0,0 +1,201 @@ +import logging + +from openhands.agenthub.tester_agent.action_parser import TesterAgentResponseParser +from openhands.agenthub.tester_agent.prompt import get_prompt +from openhands.controller.agent import Agent +from openhands.controller.state.state import State +from openhands.core.config import AgentConfig +from openhands.core.config.llm_config import LLMConfig +from openhands.core.message import Message, TextContent +from openhands.events.action import Action, AgentFinishAction +from openhands.events.action.commands import CmdRunAction, IPythonRunCellAction +from openhands.events.action.message import MessageAction +from openhands.events.observation.commands import ( + CmdOutputObservation, + IPythonRunCellObservation, +) +from openhands.events.observation.error import ErrorObservation +from openhands.events.observation.observation import Observation +from openhands.events.observation.reject import UserRejectObservation +from openhands.llm.llm import LLM + + +class TesterAgent(Agent): + VERSION = '1.0' + """ + The Tester Agent is an agent that tries to replicate the issue. + """ + + action_parser = TesterAgentResponseParser() + + def __init__(self, llm: LLM, config: AgentConfig): + """Initialize the Tester Agent with an LLM + + Parameters: + - llm (LLM): The llm to be used by this agent + - config (AgentConfig): The configuration for this agent + """ + # TODO: Remove this once we have a real LLM config + llm_config = LLMConfig( + model='deepseek/deepseek-chat', api_key='REDACTED', temperature=0.0 + ) + llm = LLM(llm_config) + # TODO: Remove this once we have a real AgentConfig + config = AgentConfig(llm_config='deepseek') + super().__init__(llm, config) + # Set up logger + self.logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.DEBUG) # Set the logging level + + def get_action_message(self, action: Action) -> Message | None: + """Convert an Action to a Message for the LLM conversation. 
+ + Parameters: + - action (Action): The action to convert + + Returns: + - Message | None: The converted message, or None if action type is not supported + """ + if isinstance(action, CmdRunAction): + return Message( + role='assistant', + content=[ + TextContent( + text=f'{action.thought}\n\n{action.command}\n' + ) + ], + ) + elif isinstance(action, IPythonRunCellAction): + return Message( + role='assistant', + content=[ + TextContent( + text=f'{action.thought}\n\n{action.code}\n' + ) + ], + ) + elif isinstance(action, MessageAction): + return Message( + role='user' if action.source == 'user' else 'assistant', + content=[TextContent(text=action.content)], + ) + elif isinstance(action, AgentFinishAction) and action.source == 'agent': + return Message(role='assistant', content=[TextContent(text=action.thought)]) + return None + + def get_observation_message(self, obs: Observation) -> Message | None: + """Convert an Observation to a Message for the LLM conversation. + + Parameters: + - obs (Observation): The observation to convert + + Returns: + - Message | None: The converted message, or None if observation type is not supported + """ + obs_prefix = 'OBSERVATION:\n' + if isinstance(obs, CmdOutputObservation): + text = obs_prefix + obs.content + text += ( + f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]' + ) + return Message(role='user', content=[TextContent(text=text)]) + elif isinstance(obs, IPythonRunCellObservation): + text = obs_prefix + obs.content + return Message(role='user', content=[TextContent(text=text)]) + elif isinstance(obs, ErrorObservation): + text = obs_prefix + obs.content + text += '\n[Error occurred in processing last action]' + return Message(role='user', content=[TextContent(text=text)]) + elif isinstance(obs, UserRejectObservation): + text = obs_prefix + obs.content + text += '\n[Last action has been rejected by the user]' + return Message(role='user', content=[TextContent(text=text)]) + else: + raise ValueError(f'Unknown observation type: {type(obs)}') + + def step(self, state: State) -> Action: + """Performs one step using the Tester Agent. + This includes gathering info on previous steps and prompting the model to make a command to execute. + + Parameters: + - state (State): used to get updated info + + Returns: + - CmdRunAction(command) - bash command to run + - IPythonRunCellAction(code) - IPython code to run + - MessageAction(content) - Message action to run (e.g. 
ask for clarification) + - AgentFinishAction() - end the interaction + """ + # if we're done, go back + latest_user_message = state.history.get_last_user_message() + if latest_user_message and latest_user_message.strip() == '/exit': + return AgentFinishAction() + + # prepare what we want to send to the LLM + messages = self._get_messages(state) + params = { + 'messages': self.llm.format_messages_for_llm(messages), + 'stop': [ + '', + '', + ], + } + + response = self.llm.completion(**params) + + return self.action_parser.parse(response) + + def _get_messages(self, state: State) -> list[Message]: + task = state.inputs.get('task', '') + summary = state.inputs.get('summary', '') + + messages: list[Message] = [ + Message( + role='system', + content=[ + TextContent( + text=get_prompt(task, summary), + cache_prompt=self.llm.is_caching_prompt_active(), # Cache system prompt + ) + ], + ), + ] + + for event in state.history.get_events(): + # create a regular message from an event + if isinstance(event, Action): + message = self.get_action_message(event) + elif isinstance(event, Observation): + message = self.get_observation_message(event) + else: + raise ValueError(f'Unknown event type: {type(event)}') + + # add regular message + if message: + # handle error if the message is the SAME role as the previous message + if messages and messages[-1].role == message.role: + messages[-1].content.extend(message.content) + else: + messages.append(message) + + # Add caching to the last 2 user messages + if self.llm.is_caching_prompt_active(): + user_turns_processed = 0 + for message in reversed(messages): + if message.role == 'user' and user_turns_processed < 2: + message.content[ + -1 + ].cache_prompt = True # Last item inside the message content + user_turns_processed += 1 + + # Add environment reminder to the latest user message + latest_user_message = next( + (m for m in reversed(messages) if m.role == 'user'), + None, + ) + + if latest_user_message: + reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with .' + latest_user_message.content.append(TextContent(text=reminder_text)) + + return messages diff --git a/openhands/agenthub/tester_agent/prompt.py b/openhands/agenthub/tester_agent/prompt.py new file mode 100644 index 000000000000..7523bc7d16f3 --- /dev/null +++ b/openhands/agenthub/tester_agent/prompt.py @@ -0,0 +1,329 @@ +# General Description, the goal is to devise a manager that is able to iterate if the solution has not been found yet. +# In order to successfully fix an issue there are two phases: +# 1. Exploring the codebase, finding the root cause of the issue. +# 2. Implementing the solution. +# Then the manager needs to check if the issue has been fixed, if not, it needs to iterate. +general_description = """ +You are a QA Engineer, an expert in testing software. +You are given an issue and your goal is to understand how to replicate the issue. + +Here is the issue you are trying to replicate: +%(task)s + +Some other agents have already gathered information about the codebase. +You can use this information to understand the codebase and replicate the issue. +%(summary)s + +IMPORTANT: YOU SHOULD NEVER TRY TO IMPLEMENT A SOLUTION. YOUR ONLY GOAL IS TO REPLICATE THE ISSUE. +As an expert in testing software, you have been equipped with a set of tools +that will help you replicate the issue: +- You can execute bash commands wrapped with , e.g. ls . 
+- If a bash command returns exit code `-1`, this means the process is not yet finished. +- You must then send a second . The second can be empty + (which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process, + or it can contain the text `ctrl+c` to interrupt the process. +- For commands that may run indefinitely, the output should be redirected to a file and the command run + in the background, e.g. python3 app.py > server.log 2>&1 & +- If a command execution result says "Command timed out. Sending SIGINT to the process", + you should retry running the command in the background. + +You have access to a python interpreter wrapped with . +e.g.: + +print("Hello World!") + + +You can install Python packages using the %pip magic command in an IPython environment by using the following syntax: %pip install [package needed] and should always import packages and define variables before starting to use them. + +Apart from the standard Python library, you can also use the following functions (already imported) in environment: +open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None: + Opens the file at the given path in the editor. IF the file is to be edited, first use `scroll_down` repeatedly to read the full file! + If line_number is provided, the window will be moved to include that line. + It only shows the first 100 lines by default! `context_lines` is the max number of lines to be displayed, up to 100. Use `scroll_up` and `scroll_down` to view more content up or down. + Args: + path: str: The path to the file to open, preferred absolute path. + line_number: int | None = 1: The line number to move to. Defaults to 1. + context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100. + +goto_line(line_number: int) -> None: + Moves the window to show the specified line number. + Args: + line_number: int: The line number to move to. + +scroll_down() -> None: + Moves the window down by 100 lines. + Args: + None + +scroll_up() -> None: + Moves the window up by 100 lines. + Args: + None + +search_dir(search_term: str, dir_path: str = './') -> None: + Searches for search_term in all files in dir. If dir is not provided, searches in the current directory. + Args: + search_term: str: The term to search for. + dir_path: str: The path to the directory to search. + +search_file(search_term: str, file_path: str | None = None) -> None: + Searches for search_term in file. If file is not provided, searches in the current open file. + Args: + search_term: str: The term to search for. + file_path: str | None: The path to the file to search. + +find_file(file_name: str, dir_path: str = './') -> None: + Finds all files with the given name in the specified directory. + Args: + file_name: str: The name of the file to find. + dir_path: str: The path to the directory to search. + +parse_pdf(file_path: str) -> None: + Parses the content of a PDF file and prints it. + Args: + file_path: str: The path to the file to open. + +parse_docx(file_path: str) -> None: + Parses the content of a DOCX file and prints it. + Args: + file_path: str: The path to the file to open. + +parse_latex(file_path: str) -> None: + Parses the content of a LaTex file and prints it. + Args: + file_path: str: The path to the file to open. + +parse_pptx(file_path: str) -> None: + Parses the content of a pptx file and prints it. 
+ Args: + file_path: str: The path to the file to open. + + +IMPORTANT: +- `open_file` only returns the first 100 lines of the file by default! +- Indentation is important and code that is not indented correctly will fail and require fixing before it can be run. +- Any code issued should be less than 50 lines to avoid context being cut off! + + +Create a test that, when executed, will replicate the issue. +Responses should be concise. +You should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block. +Include ONLY ONE , , or per response, unless you are finished with the task or need more input or action from the user in order to proceed. +If you are finished with the task you MUST include in your response. +IMPORTANT: Execute code using , , or whenever possible. +IMPORTANT: You MUST NOT edit or remove any files, but you are allowed to add new files. Code executed in WON'T BE SAVED. +You should utilize full file paths and the `pwd` command to prevent path-related errors. +You MUST NOT apologize to the user or thank the user after running commands or editing files. You should only address the user in response to an explicit message from the user, or to ask for more information. +You MUST NOT push any changes to GitHub unless explicitly requested to do so. + +When you have successfully managed to replicate the issue in a test, +you MUST generate a NEW test file that when executed will replicate the issue. +Then, you MUST run the test and verify that the issue is reproducible. +If it is not reproducible, remove the new test file and think of a different approach to replicate the issue. +If it is reproducible, you MUST output the command to run the test file wrapped with tags. + +--- START OF EXAMPLE --- + +USER: I need to test if there's an issue with the ModelChain when processing multiple inputs in parallel. + +YOU: I'll create a test to verify the ModelChain's behavior with parallel processing. + + +import torch +from openhands.models import ModelChain +import concurrent.futures + +def test_model_chain_parallel(): + # Create a simple model chain + model = ModelChain( + backbone="resnet18", + num_classes=10 + ) + + # Generate dummy input data + batch_size = 4 + input_data = torch.randn(batch_size, 3, 224, 224) + + # Process inputs in parallel + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + futures = [] + for i in range(batch_size): + futures.append( + executor.submit(model.forward, input_data[i:i+1]) + ) + + # Get results + results = [future.result() for future in futures] + + return results + +# Run the test +results = test_model_chain_parallel() +print(f"Processed {len(results)} inputs in parallel") + + +USER: +OBSERVATION: +RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use 'spawn' start method + +YOU: Ah, I see the issue is reproducible. Let me create a test file that demonstrates this CUDA initialization problem.
+ + +test_code = '''import torch +import torch.multiprocessing as mp +import pytest +from openhands.models import ModelChain + +def process_input(model, input_tensor): + return model(input_tensor) + +def test_model_chain_multiprocess(): + # Force CUDA if available + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + # Create model and move to device + model = ModelChain( + backbone="resnet18", + num_classes=10 + ).to(device) + + # Create dummy input + input_tensor = torch.randn(1, 3, 224, 224).to(device) + + # Try parallel processing + mp.set_start_method('fork') # This will cause the CUDA error + processes = [] + + for _ in range(2): + p = mp.Process(target=process_input, args=(model, input_tensor)) + p.start() + processes.append(p) + + for p in processes: + p.join()''' + +with open('test_model_chain_cuda.py', 'w') as f: + f.write(test_code) + + +YOU: I've created a test file that reproduces the CUDA initialization issue. Let me run it to confirm it works. + + +python3 -m pytest test_model_chain_cuda.py -v + + +USER: +OBSERVATION: +RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use 'spawn' start method + +YOU: Perfect, I've successfully replicated the issue and created a test file that demonstrates the problem. + + +python3 -m pytest test_model_chain_cuda.py -v + + +--- END OF EXAMPLE --- + +--- START OF EXAMPLE 2 --- + +USER: I need to test if the CLI's quiet mode is working properly. The issue is that there's no way to reduce verbosity below the default level. + +YOU: I'll create a test to verify the CLI's verbosity behavior. + + +import subprocess +import tempfile + +def test_cli_verbosity(): + # Create a temporary file with some content to test + with tempfile.NamedTemporaryFile(mode='w', suffix='.sql') as temp_file: + temp_file.write("SELECT * FROM table") # Extra spaces to trigger formatting + temp_file.flush() + + # Run command with default verbosity + default_result = subprocess.run( + ['sqlfluff', 'fix', temp_file.name], + capture_output=True, + text=True + ) + + # Run command with --quiet flag (which doesn't exist yet, this should fail) + quiet_result = subprocess.run( + ['sqlfluff', 'fix', '--quiet', temp_file.name], + capture_output=True, + text=True + ) + + print("Default verbosity output length:", len(default_result.stdout.split('\n'))) + print("Default output:", default_result.stdout) + print("\nQuiet mode output length:", len(quiet_result.stdout.split('\n'))) + print("Quiet output:", quiet_result.stdout) + +test_cli_verbosity() + + +USER: +OBSERVATION: +FileNotFoundError: [Errno 2] No such file or directory: 'sqlfluff' + +YOU: Let me create a test file that demonstrates this verbosity issue with our CLI tool. 
+ + +test_code = '''import subprocess +import tempfile +import pytest + +def test_cli_verbosity_levels(): + # Create a temporary file with content that needs formatting + with tempfile.NamedTemporaryFile(mode='w', suffix='.sql') as temp_file: + temp_file.write("SELECT * FROM table") # Extra spaces to trigger formatting + temp_file.flush() + + # Test default verbosity + default_cmd = ['sqlfluff', 'fix', temp_file.name] + default_output = subprocess.run(default_cmd, capture_output=True, text=True) + + # Test with quiet flag (should fail as feature doesn't exist) + quiet_cmd = ['sqlfluff', 'fix', '--quiet', temp_file.name] + quiet_output = subprocess.run(quiet_cmd, capture_output=True, text=True) + + # Verify that default output is more verbose + assert len(default_output.stdout.split('\\n')) > 1, "Default output should be verbose" + + # This assertion should fail since --quiet flag doesn't exist + with pytest.raises(subprocess.CalledProcessError): + subprocess.run(quiet_cmd, check=True, capture_output=True, text=True)''' + +with open('test_cli_verbosity.py', 'w') as f: + f.write(test_code) + + +YOU: Let me run the test to verify it demonstrates the missing quiet mode functionality. + + +python3 -m pytest test_cli_verbosity.py -v + + +USER: +OBSERVATION: +FileNotFoundError: [Errno 2] No such file or directory: 'sqlfluff' + +YOU: Perfect, I've created a test file that demonstrates the lack of quiet mode functionality. The test fails because sqlfluff is not installed, but the test case correctly shows that the CLI tool lacks the ability to reduce verbosity below the default level. + + +python3 -m pytest test_cli_verbosity.py -v + + +--- END OF EXAMPLE 2 --- + +IMPORTANT: YOU MUST ONLY GENERATE A TAG IN YOUR RESPONSE with only ONE command to run the test file. 
+ +""" + + +def get_prompt(task: str, summary: str) -> str: + formatted_prompt = (general_description) % { + 'task': task, + 'summary': summary, + } + return formatted_prompt From 500112aab722b454b17358a759c8624deb4c36f7 Mon Sep 17 00:00:00 2001 From: AlexCuadron Date: Tue, 5 Nov 2024 14:17:32 -0800 Subject: [PATCH 14/18] merge --- .github/ISSUE_TEMPLATE/bug_template.yml | 2 + .github/workflows/ghcr-build.yml | 6 +- .github/workflows/openhands-resolver.yml | 2 + .gitignore | 1 + README.md | 9 +- .../usage/how-to/evaluation-harness.md | 5 +- .../usage/how-to/evaluation-harness.md | 4 +- .../current/usage/how-to/headless-mode.md | 1 - .../usage/how-to/evaluation-harness.md | 4 +- docs/modules/usage/how-to/gui-mode.md | 9 + docs/modules/usage/runtimes.md | 2 - evaluation/EDA/run_infer.py | 7 +- evaluation/README.md | 1 - evaluation/agent_bench/run_infer.py | 5 +- evaluation/aider_bench/run_infer.py | 3 +- evaluation/biocoder/run_infer.py | 3 +- evaluation/bird/run_infer.py | 5 +- evaluation/browsing_delegation/run_infer.py | 3 +- evaluation/discoverybench/README.md | 37 ++ .../discoverybench/eval_utils/README.md | 7 + .../discoverybench/eval_utils/__init__.py | 0 .../eval_utils/eval_w_subhypo_gen.py | 538 ++++++++++++++++++ .../discoverybench/eval_utils/lm_utils.py | 64 +++ .../eval_utils/openai_helpers.py | 190 +++++++ .../eval_utils/openai_semantic_gen_prompts.py | 151 +++++ .../eval_utils/response_parser.py | 52 ++ evaluation/discoverybench/run_infer.py | 492 ++++++++++++++++ .../discoverybench/scripts/run_infer.sh | 46 ++ evaluation/gaia/run_infer.py | 5 +- evaluation/gorilla/run_infer.py | 5 +- evaluation/gpqa/run_infer.py | 5 +- evaluation/humanevalfix/run_infer.py | 3 +- evaluation/integration_tests/run_infer.py | 17 +- .../tests/t06_github_pr_browsing.py | 44 ++ evaluation/logic_reasoning/run_infer.py | 5 +- evaluation/miniwob/run_infer.py | 53 +- evaluation/mint/run_infer.py | 9 +- evaluation/ml_bench/run_infer.py | 3 +- evaluation/scienceagentbench/Dockerfile | 17 + .../scienceagentbench/Dockerfile.evaluator | 25 + evaluation/scienceagentbench/README.md | 54 ++ evaluation/scienceagentbench/post_proc.py | 30 + evaluation/scienceagentbench/run_infer.py | 292 ++++++++++ .../scienceagentbench/scripts/run_infer.sh | 49 ++ evaluation/swe_bench/run_infer.py | 26 +- evaluation/swe_bench/scripts/run_infer.sh | 11 + evaluation/toolqa/run_infer.py | 5 +- evaluation/utils/shared.py | 46 +- evaluation/webarena/run_infer.py | 5 +- frontend/.eslintrc | 2 +- frontend/.gitignore | 7 +- .../components/chat/chat-interface.test.tsx | 4 +- .../file-explorer/FileExplorer.test.tsx | 13 +- .../utils/extractModelAndProvider.test.ts | 1 - .../utils/organizeModelsAndProviders.test.ts | 1 - frontend/package-lock.json | 105 +++- frontend/package.json | 5 +- frontend/playwright.config.ts | 79 +++ frontend/src/api/open-hands.ts | 135 +---- frontend/src/api/open-hands.types.ts | 5 + frontend/src/assets/arrow-send.svg | 2 +- .../assets/branding/all-hands-logo-spark.svg | 2 +- frontend/src/assets/branding/github-logo.svg | 2 +- frontend/src/assets/clip.svg | 2 +- frontend/src/assets/clipboard.svg | 2 +- frontend/src/assets/default-user.svg | 2 +- frontend/src/assets/docs.svg | 2 +- frontend/src/assets/external-link.svg | 2 +- frontend/src/assets/lightbulb.svg | 2 +- frontend/src/assets/loading-outer.svg | 2 +- frontend/src/assets/message.svg | 2 +- frontend/src/assets/new-project.svg | 2 +- frontend/src/assets/refresh.svg | 2 +- frontend/src/components/AgentStatusBar.tsx | 27 +- .../analytics-consent-form-modal.tsx 
| 42 ++ .../src/components/buttons/ModalButton.tsx | 3 + frontend/src/components/chat-interface.tsx | 2 +- frontend/src/components/chat/message.d.ts | 3 +- frontend/src/components/error-message.tsx | 35 +- frontend/src/components/feedback-form.tsx | 10 +- .../components/file-explorer/FileExplorer.tsx | 82 ++- .../src/components/file-explorer/TreeNode.tsx | 19 +- .../github-repositories-suggestion-box.tsx | 94 +++ .../modals/AccountSettingsModal.tsx | 13 + .../modals/confirmation-modals/BaseModal.tsx | 8 +- .../modals/connect-to-github-modal.tsx | 1 + frontend/src/context/socket.tsx | 16 +- frontend/src/entry.client.tsx | 15 +- frontend/src/i18n/translation.json | 18 + frontend/src/mocks/handlers.ts | 33 +- .../_oh._index/github-repo-selector.tsx | 10 +- frontend/src/routes/_oh._index/route.tsx | 104 ++-- .../_oh.app._index/code-editor-component.tsx | 33 +- frontend/src/routes/_oh.app._index/route.tsx | 57 +- frontend/src/routes/_oh.app.tsx | 33 +- frontend/src/routes/_oh.tsx | 28 +- frontend/src/routes/oauth.github.callback.tsx | 4 +- frontend/src/routes/set-consent.ts | 9 + frontend/src/routes/settings.ts | 3 + frontend/src/services/actions.ts | 32 +- frontend/src/services/api.ts | 34 +- frontend/src/services/auth.ts | 21 +- frontend/src/state/chatSlice.ts | 6 +- frontend/src/state/statusSlice.ts | 6 +- frontend/src/types/Message.tsx | 10 +- frontend/src/types/core/observations.ts | 3 + frontend/src/utils/download-workspace.ts | 7 +- frontend/src/utils/get-valid-fallback-host.ts | 19 - .../utils/suggestions/non-repo-suggestions.ts | 4 +- frontend/src/utils/user-is-authenticated.ts | 18 +- frontend/test-utils.tsx | 7 +- frontend/tests/fixtures/project.zip | 0 frontend/tests/redirect.spec.ts | 61 ++ frontend/tsconfig.json | 2 +- frontend/vite.config.ts | 2 + openhands/__init__.py | 14 +- openhands/agenthub/__init__.py | 4 - .../agenthub/browsing_agent/browsing_agent.py | 4 +- .../agenthub/codeact_agent/codeact_agent.py | 32 +- .../codeact_agent/function_calling.py | 153 ++++- .../codeact_swe_agent/codeact_swe_agent.py | 6 +- openhands/agenthub/delegator_agent/agent.py | 8 +- openhands/agenthub/dummy_agent/agent.py | 2 +- openhands/agenthub/micro/agent.py | 8 +- openhands/agenthub/planner_agent/prompt.py | 4 +- openhands/agenthub/searcher_agent/__init__.py | 4 - .../agenthub/searcher_agent/action_parser.py | 153 ----- openhands/agenthub/searcher_agent/agent.py | 203 ------- openhands/agenthub/searcher_agent/prompt.py | 146 ----- openhands/agenthub/supervisor_agent/agent.py | 8 +- openhands/agenthub/tester_agent/__init__.py | 4 - .../agenthub/tester_agent/action_parser.py | 158 ----- openhands/agenthub/tester_agent/agent.py | 201 ------- openhands/agenthub/tester_agent/prompt.py | 329 ----------- openhands/controller/agent_controller.py | 303 ++++++---- openhands/controller/state/state.py | 51 +- openhands/controller/stuck.py | 2 +- openhands/core/cli.py | 59 +- openhands/core/config/agent_config.py | 4 +- openhands/core/config/app_config.py | 2 - openhands/core/loop.py | 50 ++ openhands/core/main.py | 54 +- openhands/core/message.py | 2 + openhands/events/action/browse.py | 4 +- openhands/events/action/message.py | 2 +- openhands/events/event.py | 1 + openhands/events/observation/__init__.py | 3 +- openhands/events/observation/browse.py | 48 +- openhands/events/observation/error.py | 15 +- openhands/events/stream.py | 102 +++- openhands/llm/llm.py | 192 ++++--- openhands/memory/__init__.py | 3 +- openhands/memory/history.py | 224 -------- openhands/runtime/action_execution_server.py | 3 +- 
openhands/runtime/base.py | 62 +- openhands/runtime/browser/browser_env.py | 6 +- openhands/runtime/builder/docker.py | 2 +- openhands/runtime/builder/remote.py | 35 +- openhands/runtime/impl/e2b/e2b_runtime.py | 4 +- openhands/runtime/impl/e2b/sandbox.py | 4 +- .../impl/eventstream/eventstream_runtime.py | 252 ++++---- openhands/runtime/impl/modal/modal_runtime.py | 6 +- .../runtime/impl/remote/remote_runtime.py | 356 +++++------- .../plugins/agent_skills/file_editor/impl.py | 6 +- openhands/runtime/utils/bash.py | 8 +- openhands/runtime/utils/edit.py | 9 +- openhands/runtime/utils/request.py | 46 +- openhands/runtime/utils/runtime_build.py | 4 +- .../utils/runtime_templates/Dockerfile.j2 | 1 + openhands/runtime/utils/tenacity_stop.py | 5 +- openhands/security/analyzer.py | 3 +- openhands/security/invariant/analyzer.py | 1 + openhands/server/github.py | 128 +++++ openhands/server/listen.py | 113 ++-- openhands/server/middleware.py | 4 +- openhands/server/session/agent_session.py | 44 +- openhands/server/session/manager.py | 6 +- openhands/server/session/session.py | 80 ++- openhands/server/sheets_client.py | 68 +++ poetry.lock | 22 +- pyproject.toml | 7 +- tests/runtime/test_stress_remote_runtime.py | 231 ++++++++ tests/unit/test_agent_controller.py | 93 ++- tests/unit/test_codeact_agent.py | 2 +- tests/unit/test_is_stuck.py | 379 ++++++------ tests/unit/test_llm.py | 3 + tests/unit/test_memory.py | 2 +- tests/unit/test_micro_agents.py | 14 +- tests/unit/test_prompt_caching.py | 92 +-- 189 files changed, 5161 insertions(+), 3177 deletions(-) create mode 100644 evaluation/discoverybench/README.md create mode 100644 evaluation/discoverybench/eval_utils/README.md create mode 100644 evaluation/discoverybench/eval_utils/__init__.py create mode 100644 evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py create mode 100644 evaluation/discoverybench/eval_utils/lm_utils.py create mode 100644 evaluation/discoverybench/eval_utils/openai_helpers.py create mode 100644 evaluation/discoverybench/eval_utils/openai_semantic_gen_prompts.py create mode 100644 evaluation/discoverybench/eval_utils/response_parser.py create mode 100644 evaluation/discoverybench/run_infer.py create mode 100755 evaluation/discoverybench/scripts/run_infer.sh create mode 100644 evaluation/integration_tests/tests/t06_github_pr_browsing.py create mode 100644 evaluation/scienceagentbench/Dockerfile create mode 100644 evaluation/scienceagentbench/Dockerfile.evaluator create mode 100644 evaluation/scienceagentbench/README.md create mode 100644 evaluation/scienceagentbench/post_proc.py create mode 100644 evaluation/scienceagentbench/run_infer.py create mode 100755 evaluation/scienceagentbench/scripts/run_infer.sh create mode 100644 frontend/playwright.config.ts create mode 100644 frontend/src/components/analytics-consent-form-modal.tsx create mode 100644 frontend/src/components/github-repositories-suggestion-box.tsx create mode 100644 frontend/src/routes/set-consent.ts delete mode 100644 frontend/src/utils/get-valid-fallback-host.ts create mode 100644 frontend/tests/fixtures/project.zip create mode 100644 frontend/tests/redirect.spec.ts delete mode 100644 openhands/agenthub/searcher_agent/__init__.py delete mode 100644 openhands/agenthub/searcher_agent/action_parser.py delete mode 100644 openhands/agenthub/searcher_agent/agent.py delete mode 100644 openhands/agenthub/searcher_agent/prompt.py delete mode 100644 openhands/agenthub/tester_agent/__init__.py delete mode 100644 openhands/agenthub/tester_agent/action_parser.py delete 
mode 100644 openhands/agenthub/tester_agent/agent.py delete mode 100644 openhands/agenthub/tester_agent/prompt.py create mode 100644 openhands/core/loop.py delete mode 100644 openhands/memory/history.py create mode 100644 openhands/server/github.py create mode 100644 openhands/server/sheets_client.py create mode 100644 tests/runtime/test_stress_remote_runtime.py diff --git a/.github/ISSUE_TEMPLATE/bug_template.yml b/.github/ISSUE_TEMPLATE/bug_template.yml index ad618f82e5a1..7a6a0ba244f6 100644 --- a/.github/ISSUE_TEMPLATE/bug_template.yml +++ b/.github/ISSUE_TEMPLATE/bug_template.yml @@ -31,6 +31,8 @@ body: options: - Docker command in README - Development workflow + - app.all-hands.dev + - Other default: 0 - type: input diff --git a/.github/workflows/ghcr-build.yml b/.github/workflows/ghcr-build.yml index 25d05b9a0ca7..a7398961da3c 100644 --- a/.github/workflows/ghcr-build.yml +++ b/.github/workflows/ghcr-build.yml @@ -401,7 +401,7 @@ jobs: exit 1 update_pr_description: name: Update PR Description - if: github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork + if: github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork && github.actor != 'dependabot[bot]' needs: [ghcr_build_runtime] runs-on: ubuntu-latest steps: @@ -424,9 +424,9 @@ jobs: -p 3000:3000 \ -v /var/run/docker.sock:/var/run/docker.sock \ --add-host host.docker.internal:host-gateway \ - -e SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:$SHORT_SHA-nikolaik \ + -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:$SHORT_SHA-nikolaik \ --name openhands-app-$SHORT_SHA \ - ghcr.io/all-hands-ai/runtime:$SHORT_SHA" + docker.all-hands.dev/all-hands-ai/openhands:$SHORT_SHA" PR_BODY=$(gh pr view $PR_NUMBER --json body --jq .body) diff --git a/.github/workflows/openhands-resolver.yml b/.github/workflows/openhands-resolver.yml index fa253905e1f5..1e5360afba0b 100644 --- a/.github/workflows/openhands-resolver.yml +++ b/.github/workflows/openhands-resolver.yml @@ -3,6 +3,8 @@ name: Resolve Issues with OpenHands on: issues: types: [labeled] + pull_request: + types: [labeled] jobs: call-openhands-resolver: diff --git a/.gitignore b/.gitignore index a4bc03c4eeb1..0cc7d149d781 100644 --- a/.gitignore +++ b/.gitignore @@ -174,6 +174,7 @@ evaluation/bird/data evaluation/gaia/data evaluation/gorilla/data evaluation/toolqa/data +evaluation/scienceagentbench/benchmark # frontend diff --git a/README.md b/README.md index 39e9e746edfc..e67bd0599478 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ CodeCov MIT License
- Join our Slack community + Join our Slack community Join our Discord community Credits
@@ -40,7 +40,7 @@ system requirements and more information. ```bash docker pull docker.all-hands.dev/all-hands-ai/runtime:0.12-nikolaik -docker run -it --rm --pull=always \ +docker run -it --pull=always \ -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.12-nikolaik \ -v /var/run/docker.sock:/var/run/docker.sock \ -p 3000:3000 \ @@ -59,7 +59,8 @@ works best, but you have [many options](https://docs.all-hands.dev/modules/usage You can also [connect OpenHands to your local filesystem](https://docs.all-hands.dev/modules/usage/runtimes), run OpenHands in a scriptable [headless mode](https://docs.all-hands.dev/modules/usage/how-to/headless-mode), -or interact with it via a [friendly CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode). +interact with it via a [friendly CLI](https://docs.all-hands.dev/modules/usage/how-to/cli-mode), +or run it on tagged issues with [a github action](https://github.com/All-Hands-AI/OpenHands-resolver). Visit [Installation](https://docs.all-hands.dev/modules/usage/installation) for more information and setup instructions. @@ -92,7 +93,7 @@ For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md). Whether you're a developer, a researcher, or simply enthusiastic about OpenHands, we'd love to have you in our community. Let's make software engineering better together! -- [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2oikve2hu-UDxHeo8nsE69y6T7yFX_BA) - Here we talk about research, architecture, and future development. +- [Slack workspace](https://join.slack.com/t/openhands-ai/shared_invite/zt-2tom0er4l-JeNUGHt_AxpEfIBstbLPiw) - Here we talk about research, architecture, and future development. - [Discord server](https://discord.gg/ESHStjSjD4) - This is a community-run server for general discussion, questions, and feedback. ## 📈 Progress diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index d027a0ead929..3f191053998f 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -161,7 +161,7 @@ Pour créer un workflow d'évaluation pour votre benchmark, suivez ces étapes : instruction=instruction, test_result=evaluation_result, metadata=metadata, - history=state.history.compatibility_for_eval_history_pairs(), + history=compatibility_for_eval_history_pairs(state.history), metrics=state.metrics.get() if state.metrics else None, error=state.last_error if state and state.last_error else None, ) @@ -260,7 +260,7 @@ def codeact_user_response(state: State | None) -> str: # vérifier si l'agent a essayé de parler à l'utilisateur 3 fois, si oui, faire savoir à l'agent qu'il peut abandonner user_msgs = [ event - for event in state.history.get_events() + for event in state.history if isinstance(event, MessageAction) and event.source == 'user' ] if len(user_msgs) >= 2: @@ -279,4 +279,3 @@ Cette fonction fait ce qui suit : 3. Si l'agent a fait plusieurs tentatives, il lui donne la possibilité d'abandonner En utilisant cette fonction, vous pouvez garantir un comportement cohérent sur plusieurs exécutions d'évaluation et empêcher l'agent de rester bloqué en attendant une entrée humaine. 
- diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index a50bb18502e2..eb99a30ea3fd 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -158,7 +158,7 @@ OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它工 instruction=instruction, test_result=evaluation_result, metadata=metadata, - history=state.history.compatibility_for_eval_history_pairs(), + history=compatibility_for_eval_history_pairs(state.history), metrics=state.metrics.get() if state.metrics else None, error=state.last_error if state and state.last_error else None, ) @@ -257,7 +257,7 @@ def codeact_user_response(state: State | None) -> str: # 检查代理是否已尝试与用户对话 3 次,如果是,让代理知道它可以放弃 user_msgs = [ event - for event in state.history.get_events() + for event in state.history if isinstance(event, MessageAction) and event.source == 'user' ] if len(user_msgs) >= 2: diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md index 8beacdd208b6..bfcca8386ebe 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md @@ -58,4 +58,3 @@ docker run -it \ ghcr.io/all-hands-ai/openhands:0.11 \ python -m openhands.core.main -t "write a bash script that prints hi" ``` - diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md index 622f7e5607ba..e4d1e5d15bc7 100644 --- a/docs/modules/usage/how-to/evaluation-harness.md +++ b/docs/modules/usage/how-to/evaluation-harness.md @@ -158,7 +158,7 @@ To create an evaluation workflow for your benchmark, follow these steps: instruction=instruction, test_result=evaluation_result, metadata=metadata, - history=state.history.compatibility_for_eval_history_pairs(), + history=compatibility_for_eval_history_pairs(state.history), metrics=state.metrics.get() if state.metrics else None, error=state.last_error if state and state.last_error else None, ) @@ -257,7 +257,7 @@ def codeact_user_response(state: State | None) -> str: # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up user_msgs = [ event - for event in state.history.get_events() + for event in state.history if isinstance(event, MessageAction) and event.source == 'user' ] if len(user_msgs) >= 2: diff --git a/docs/modules/usage/how-to/gui-mode.md b/docs/modules/usage/how-to/gui-mode.md index 8726922574a8..df5a070c01e5 100644 --- a/docs/modules/usage/how-to/gui-mode.md +++ b/docs/modules/usage/how-to/gui-mode.md @@ -19,6 +19,15 @@ OpenHands provides a user-friendly Graphical User Interface (GUI) mode for inter 3. Enter the corresponding `API Key` for your chosen provider. 4. Click "Save" to apply the settings. +### GitHub Token Setup + +OpenHands automatically exports a `GITHUB_TOKEN` to the shell environment if it is available. This can happen in two ways: + +1. Locally (OSS): The user directly inputs their GitHub token. +2. Online (SaaS): The token is obtained through GitHub OAuth authentication. + +When you reach the `/app` route, the app checks if a token is present. 
If it finds one, it sets it in the environment for the agent to use. + ### Advanced Settings 1. Toggle `Advanced Options` to access additional settings. diff --git a/docs/modules/usage/runtimes.md b/docs/modules/usage/runtimes.md index 92fa04b009ad..3c227ffaf74d 100644 --- a/docs/modules/usage/runtimes.md +++ b/docs/modules/usage/runtimes.md @@ -60,7 +60,6 @@ docker run # ... -e SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.app.all-hands.dev" \ -e SANDBOX_API_KEY="your-all-hands-api-key" \ -e SANDBOX_KEEP_REMOTE_RUNTIME_ALIVE="true" \ - -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.11-nikolaik \ # ... ``` @@ -75,5 +74,4 @@ docker run # ... -e RUNTIME=modal \ -e MODAL_API_TOKEN_ID="your-id" \ -e MODAL_API_TOKEN_SECRET="your-secret" \ - -e SANDBOX_RUNTIME_CONTAINER_IMAGE=docker.all-hands.dev/all-hands-ai/runtime:0.11-nikolaik \ ``` diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py index 2c896939a751..fb5df3b44f01 100644 --- a/evaluation/EDA/run_infer.py +++ b/evaluation/EDA/run_infer.py @@ -8,6 +8,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -34,7 +35,7 @@ def codeact_user_response_eda(state: State) -> str: # retrieve the latest model message from history if state.history: - model_guess = state.history.get_last_agent_message() + model_guess = state.get_last_agent_message() assert game is not None, 'Game is not initialized.' msg = game.generate_user_response(model_guess) @@ -139,7 +140,7 @@ def process_instance( if state is None: raise ValueError('State should not be None.') - final_message = state.history.get_last_agent_message() + final_message = state.get_last_agent_message() logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}') test_result = game.reward() @@ -148,7 +149,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git a/evaluation/README.md b/evaluation/README.md index 7eb59c7b8d5a..8be0822875f8 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -84,4 +84,3 @@ all the preprocessing/evaluation/analysis scripts. - Raw data and experimental records should not be stored within this repo. - For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization. - Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo. 
- diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py index d6fcc62e0798..acdf60fe4850 100644 --- a/evaluation/agent_bench/run_infer.py +++ b/evaluation/agent_bench/run_infer.py @@ -16,6 +16,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -242,7 +243,7 @@ def process_instance( raw_ans = '' # retrieve the last agent message or thought - for event in state.history.get_events(reverse=True): + for event in reversed(state.history): if event.source == 'agent': if isinstance(event, AgentFinishAction): raw_ans = event.thought @@ -271,7 +272,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) metrics = state.metrics.get() if state.metrics else None diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py index fa1bb9534a83..cddc4bfe7db9 100644 --- a/evaluation/aider_bench/run_infer.py +++ b/evaluation/aider_bench/run_infer.py @@ -15,6 +15,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -250,7 +251,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) metrics = state.metrics.get() if state.metrics else None # Save the output diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py index 4535ccba4e4e..5ab4b3b88313 100644 --- a/evaluation/biocoder/run_infer.py +++ b/evaluation/biocoder/run_infer.py @@ -13,6 +13,7 @@ EvalMetadata, EvalOutput, codeact_user_response, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -299,7 +300,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) test_result['generated'] = test_result['metadata']['1_copy_change_code'] diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py index adb498cd2eb1..248dbb66181c 100644 --- a/evaluation/bird/run_infer.py +++ b/evaluation/bird/run_infer.py @@ -16,6 +16,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -46,7 +47,7 @@ def codeact_user_response(state: State) -> str: # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up user_msgs = [ event - for event in state.history.get_events() + for event in state.history if isinstance(event, MessageAction) and event.source == 'user' ] if 
len(user_msgs) > 2: @@ -431,7 +432,7 @@ def execute_sql(db_path, sql): # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git a/evaluation/browsing_delegation/run_infer.py b/evaluation/browsing_delegation/run_infer.py index c9fe2ebd18bc..5c1ab8c062e3 100644 --- a/evaluation/browsing_delegation/run_infer.py +++ b/evaluation/browsing_delegation/run_infer.py @@ -9,6 +9,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -89,7 +90,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # find the last delegate action last_delegate_action = None diff --git a/evaluation/discoverybench/README.md b/evaluation/discoverybench/README.md new file mode 100644 index 000000000000..a0d8994709df --- /dev/null +++ b/evaluation/discoverybench/README.md @@ -0,0 +1,37 @@ +# DiscoveryBench with OpenHands + +[DiscoveryBench](https://github.com/allenai/discoverybench/) [(Paper)](https://arxiv.org/abs/2407.01725v1) contains 264 tasks collected across 6 diverse domains, such as biology, economics, and sociology. It incorporates discovery workflows from published papers to approximate the real-world challenges faced by researchers. + +

+ [Image: DiscoveryBench Background]

+ + +## Setup Environment and LLM Configuration + +1. Please follow instructions mentioned [here](https://github.com/openlocus/OpenHands/blob/discoverybench-openhands-integration/evaluation/README.md#setup) to setup OpenHands development environment and LLMs locally + +2. Execute the bash script to start DiscoveryBench Evaluation + +``` +./evaluation/discoverybench/scripts/run_infer.sh [YOUR MODEL CONFIG] +``` +Replace `[YOUR MODEL CONFIG]` with any model the model that you have set up in `config.toml` + + +## Run Inference on DiscoveryBench Instances + +When the `run_infer.sh` script is started, it will automatically pull the latest DiscoveryBench instances & set up the agent environment. The OpenHands agent is invoked to process the task within this environment, producing a hypothesis. We then evaluate it against the “gold” hypothesis provided by DiscoveryBench. The evaluation result, along with the agent chat history is logged to `output.jsonl` under `evaluation_outputs`. + + +``` +./evaluation/discoverybench/scripts/run_infer.sh [MODEL_CONFIG] [GIT_COMMIT] [AGENT] [EVAL_LIMIT] [NUM_WORKERS] +``` + +- `MODEL_CONFIG`: Name of the model you want to evaluate with +- `GIT_COMMIT`: This should be the git commit hash or release tag for OpenHands, e.g., HEAD or a specific tag like 0.6.2. +- `AGENT`: Use CoderActAgent, right now it only supports that. +- `EVAL_LIMIT`: Number of samples to evaluate. +- `NUM_WORKERS`: Number of workers to parallelize the evaluation process. diff --git a/evaluation/discoverybench/eval_utils/README.md b/evaluation/discoverybench/eval_utils/README.md new file mode 100644 index 000000000000..13c98ebaa8d2 --- /dev/null +++ b/evaluation/discoverybench/eval_utils/README.md @@ -0,0 +1,7 @@ +## DiscoveryBench Evaluation Utils + +- **`eval_w_subhypo_gen.py`**: Implements the DiscoveryBench logic for evaluating agent-generated hypotheses. +- **`lm_utils.py`**: Provides utility functions necessary for the evaluation process. +- **`openai_helpers.py`**: Includes helper functions for OpenAI-related tasks. +- **`openai_semantic_gen_prompts.py`**: Contains prompts used for semantic generation. +- **`response_parser.py`**: Handles the parsing of agent-generated hypotheses. 
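Taken together, these utilities are chained at scoring time: the agent's final message is parsed into a hypothesis and a workflow summary, which are then judged against the gold hypothesis by an LLM. A rough sketch of that wiring, assuming `OPENAI_API_KEY` is set for the judge calls (argument values are placeholders; the actual call sites are in `evaluation/discoverybench/run_infer.py` later in this patch):

```python
from evaluation.discoverybench.eval_utils.eval_w_subhypo_gen import (
    run_eval_gold_vs_gen_NL_hypo_workflow,
)
from evaluation.discoverybench.eval_utils.response_parser import (
    extract_gen_hypo_from_logs,
)


def score_agent_output(
    final_message: str, query: str, gold_hypo: str, dataset_meta: dict
) -> dict:
    # split the agent's final message into hypothesis, workflow summary, and parse errors
    gen_hypo, gen_workflow, error = extract_gen_hypo_from_logs(final_message)
    # LLM-judged comparison of generated vs. gold hypothesis (context, variables, relations)
    eval_rec = run_eval_gold_vs_gen_NL_hypo_workflow(
        query=query,
        gold_hypo=gold_hypo,
        gold_workflow='',
        gen_hypo=gen_hypo,
        gen_workflow=gen_workflow,
        dataset_meta=dataset_meta,
        llm_used='gpt-4-1106-preview',
        dataset_type='real',
    )
    eval_rec['parse_error'] = error
    return eval_rec
```

The final score in the returned record is `recall_context * mean_accuracy_score`, as computed at the end of `run_eval_gold_vs_gen_NL_hypo_workflow` below.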
diff --git a/evaluation/discoverybench/eval_utils/__init__.py b/evaluation/discoverybench/eval_utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py new file mode 100644 index 000000000000..a80df8279cfb --- /dev/null +++ b/evaluation/discoverybench/eval_utils/eval_w_subhypo_gen.py @@ -0,0 +1,538 @@ +import json +import logging + +from openai import OpenAI + +from .lm_utils import run_chatgpt_query_multi_turn +from .openai_helpers import get_response + +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def get_score_from_answer(type, answer): + if type == 'context': + answer = answer.replace('Answer:', '').strip() + if answer.startswith('A)'): + return 1.0 + elif answer.startswith('B)'): + return 0.0 + return -1.0 + + elif type == 'var': + try: + var_json = json.loads(answer) + # print(f"var_json:{var_json}") + p = 0.0 + r = 0.0 + f1 = 0.0 + if var_json['sizeB']: + p = var_json['intersection'] / var_json['sizeB'] + if var_json['sizeA']: + r = var_json['intersection'] / var_json['sizeA'] + if p > 0.0 and r > 0.0: + f1 = (2 * p * r) / (p + r) + else: + f1 = 0.0 + eval_rec = { + 'p': p, + 'r': r, + 'f1': f1, + 'sizeA': var_json['sizeA'], + 'sizeB': var_json['sizeB'], + 'intersection': var_json['intersection'], + 'explanation': var_json['explanation'], + } + print(f'var_eval: {eval_rec}') + return eval_rec + except Exception: # COMMENT: added Exception + return {'p': -1.0, 'r': -1.0, 'f1': -1.0} + elif type == 'rel': + print(answer) + rel_json = json.loads(answer) + answer_str = rel_json['answer'].strip() + if answer_str.startswith('A') or 'very similar' in answer_str: + return 1.0 + elif ( + answer_str.startswith('B') or 'similar but general than HypoA' in answer_str + ): + return 0.5 + elif answer_str.startswith('C') or 'different' in answer_str: + return 0.0 + return -1.0 + return -1.0 + + +def ask_dimension_question( + query, + gold_hypo, + gold_workflow, + gen_hypo, + gen_workflow, + dataset_meta, + llm_used, + dimension, + dataset_type, + use_column_metadata=True, +): + dimension_question = '' + answer = '' + score = 0.0 + if dimension == 'var': + score = {'p': -1.0, 'r': -1.0, 'f1': -1.0} + num_tokens = 256 + num_retries = 1 + json_response = False + + messages = [ + { + 'role': 'system', + 'content': 'You are an AI assistant that helps evaluate a data-driven hypothesis. You are a helpful assistant who is not talkative. You only respond with the exact answer to a query without additional conversation.', + }, + ] + if dimension == 'context': + dimension_question = """\ + Question: Is HypoB defined in the same context as HypoA? + (Context refers to assumptions/stratification under which the hypotheses are defined.) + Options: A) same B) different + What is your answer?""" + elif dimension == 'var': + dimension_question = """\ + Question: For both HypoA and HypoB, what are the different variables found in the hypotheses? \ + Return your answer as a JSON object in the following format: + ```json + {{ + "sizeA": num of variables used in HypoA + "sizeB": num of variables used in HypoB + "intersection": num of variables common in HypoA and HypoB. 
Use *fuzzy matching* to determine intersection, accounting for paraphrases or slightly different surface forms + "explanation": a short text explanation about the variables + }}``` + Answer:""" + num_tokens = 512 + num_retries = 1 + json_response = True + elif dimension == 'rel': + dimension_question = """\ + Question: Does HypoB exhibit the same relation as HypoA? + Compare using following example hierarchy of relationships (based on specificity): \ + "there exists a relationship" > "positive relationship" > "positive AND (linear OR quadratic)" > "positive AND linear". + Options: A) very similar B) similar but general than HypoA C) different + Return your answer as a JSON object in the following format: + ```json + {{ + "answer": one of the options from A) very similar B) similar but general than HypoA C) different + "explanation": a short text explanation about the relationship comparison + }}``` + Answer:""" + num_tokens = 512 + num_retries = 1 + json_response = True + + datasets_json = prepare_dataset_metadata_json( + dataset_meta, dataset_type=dataset_type, use_column_metadata=use_column_metadata + ) + + dimension_question_str = f"""\ + You are going to compare two natural-language hypotheses HypoA and HypoB accompanied with optional workflows: WorkflowA for HypoA and WorkflowB for HypoB. \ + Both the hypotheses answer the natural language query "QUERY" over the dataset(s) described by dataset description(s) and column description(s) below. \ + Compare HypoA and HypoB in terms of three aspects: Contexts, Variables, and Relations. \ + E.g., for the hypothesis "From 1995 to 2009, the number of sandhill cranes around the tundra (Indigilka River) surged by an astounding ~10X": + * Contexts refer to stratification of the data under which the given hypothesis is True. E.g., "For all women", "From 1995 to 2009". + * Variables refer to the set of variables (either dependent or independent) that are mentioned in the hypothesis. E.g., number of sandhill cranes, location. + * Relations refer to the form of relation between the variables. E.g., "surged by ~10x". + + Answer following questions for a given pair of hypotheses, HypoA and HypoB, along with an explanation grounded on the QUERY and the DATASET(S). 
+ + Here is the metadata for the task: + ```json + {{ + "datasets": {datasets_json}, + "query": {query}, + "HypoA": {gold_hypo}, + "WorkflowA": {gold_workflow}, + "HypoB": {gen_hypo}, + "WorkflowB": {gen_workflow} + }} + ``` + + {dimension_question}""" + + messages.append({'role': 'user', 'content': dimension_question_str}) + for retry in range(num_retries): + response = run_chatgpt_query_multi_turn( + messages=messages, + model_name=llm_used, + max_tokens=num_tokens, + temperature=0, # 0 for greedy best decoding + json_response=json_response, + ) + if response is not None: # COMMENT: changed from != to is not + break + + if response is not None: # COMMENT: changed from != to is not + answer = response.choices[0].message.content.strip() + score = get_score_from_answer(type=dimension, answer=answer) + + return dimension_question, answer, score + + +def prepare_dataset_metadata_json(dataset_meta, dataset_type, use_column_metadata=True): + if dataset_meta is None: # COMMENT: changed from == to is None + return [ + { + 'dataset_description': '', + 'columns': [], + } + ] + datasets_json = [] + if dataset_type == 'real': + for d in dataset_meta['datasets']: + datasets_json.append( + { + 'dataset_description': d['description'], + 'columns': [ + {'name': col['name'], 'description': col['description']} + for col in d['columns']['raw'] + ] + if use_column_metadata + else [], + } + ) + else: + for d in dataset_meta['datasets']: + datasets_json.append( + { + 'dataset_description': d['description'], + 'columns': [ + {'name': col['name'], 'description': col['description']} + for col in d['columns'] + ] + if use_column_metadata + else [], + } + ) + return datasets_json + + +def get_sub_hypotheses( + query, + hypo, + workflow, + dataset_meta, + llm_used, + dataset_type, + use_column_metadata=True, +): + client = OpenAI() + extraction_prompt = """\ + Given a set of dataset columns, a ground-truth hypothesis, and the analysis workflow used, your task is to extract three dimensions that define the hypothesis: Context, Variables, and Relations. \ + Here are the definitions for these dimensions: + - Contexts: Boundary conditions that limit the scope of a hypothesis. E.g., “for men over \ + the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then extract the context from the dataset_descrption. + - Variables: Known concepts that interact in a meaningful way under a given context to \ + produce the hypothesis. E.g., gender, age, income, or "None" if there is no interacting variable. + - Relations: Interactions between a given set of variables under a given context to produce \ + the hypothesis. E.g., “quadratic relationship”, “inversely proportional”, piecewise conditionals, \ + or "None" if there is no interacting relationship. + Make sure to only use the information present in the hypothesis and the workflow. Do not add any new information. \ + For each dimension, be specific, and do not omit any important details. + + Here is the metadata for the task: + ```json + { + "datasets": %s, + "hypothesis": "%s", + "workflow": "%s" + } + ``` + + Return your answer as a JSON object in the following format: + ```json + { + "sub_hypo": [ + { + "text": the hypothesis in natural language, + "context": a short text description of the context of the hypothesis, + "variables": a list of columns involved in the hypothesis, + "relations": a short text description of the relationship between the variables of the hypothesis + }, + ... 
+ ] + }``` + """ + datasets_json = prepare_dataset_metadata_json( + dataset_meta, dataset_type, use_column_metadata=use_column_metadata + ) + _prompt = extraction_prompt % (datasets_json, hypo, workflow) + sub_hypo_json = get_response(client, _prompt, model=llm_used, max_retry=1) + + if sub_hypo_json is not None: # COMMENT: changed from != to is not + # print(f"full hypothesis: {hypo}") + print(f'sub_hypo_json: {sub_hypo_json}') + else: + sub_hypo_json = { + 'sub_hypo': [], + } + + sub_hypo_json['full_hypo'] = hypo + + return sub_hypo_json + + +def match_context_with_gpt( + gold_hyp, gold_context, pred_hyp, pred_context, model='gpt-3.5-turbo' +): + prompt = f"""\ + Given a gold hypothesis, a gold context, a predicted hypothesis, and a predicted context, your task is \ + to determine if the predicted context semantically matches the ground-truth context. \ + Here is the definition for Context: Boundary conditions that limit the scope of a sub-hypothesis. E.g., “for men over the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then the context is derived from the dataset_descrption. \ + Here is the definition for Context: Boundary conditions that limit the scope of a sub-hypothesis. E.g., “for men over the age of 30”, “in Asia and Europe”. If the context applies to the full dataset, then the context is derived from the dataset_descrption. \ + If the predicted context matches the gold context, return true, otherwise return false. + If both gold and predicted hypotheses are defined over the context of the full dataset, then also return true. + If both gold and predicted hypotheses are defined over the context of the full dataset, then also return true. + + Here is the metadata for the task: + ```json + {{ + "gold_hypothesis": "{gold_hyp}", + "gold_context": "{gold_context}", + "predicted_hypothesis": "{pred_hyp}", + "predicted_context": "{pred_context}" + }} + ``` + + Return your answer as a JSON object in the following format: + ```json + {{ + "match": true or false + }} + ```""" + + client = OpenAI() + output = get_response(client, prompt, model=model) + return output.get('match', False) + + +def is_matching_context(gold_hyp, gold_context, pred_hyp, pred_context, llm_used): + if gold_context == pred_context: + return True + if 'None' in [gold_context, pred_context]: + return False + return match_context_with_gpt( + gold_hyp, gold_context, pred_hyp, pred_context, model=llm_used + ) + + +def run_eval_gold_vs_gen_NL_subhypo( + query, + gold_hypo, + gold_workflow, + gen_hypo, + gen_workflow, + dataset_meta, + llm_used, + context_score, + dataset_type, + use_column_metadata=True, +): + # GPT-4 based evaluation to evaluate generated hypothesis in terms of context, variables, relation + + eval_rec = { + 'query': query, + 'HypoA': gold_hypo, + 'WorkflowA': gold_workflow, + 'HypoB': gen_hypo, + 'WorkflowB': gen_workflow, + } + + for dimension in ['var', 'rel']: + question, answer, score = ask_dimension_question( + query, + gold_hypo, + gold_workflow, + gen_hypo, + gen_workflow, + dataset_meta, + llm_used, + dimension=dimension, + dataset_type=dataset_type, + use_column_metadata=use_column_metadata, + ) + + eval_rec[dimension] = {'question': question, 'answer': answer, 'score': score} + + eval_rec['context'] = context_score + eval_rec['accuracy_score'] = ( + 1.0 + * eval_rec['context']['score'] + * eval_rec['var']['score']['f1'] + * eval_rec['rel']['score'] + ) + + return eval_rec + + +def run_eval_gold_vs_gen_NL_hypo_workflow( + query, + gold_hypo, + gold_workflow, + 
gen_hypo, + gen_workflow, + dataset_meta, + llm_used, + dataset_type, + use_column_metadata=True, +): + # Input: Dataset Metadata, Query, Gold {Hg, Wg}, Predicted {Hp, Wp} + # Output: eval_rec json includes final_score + + # Procedure: + # Dataset Metadata, Query, Gold {Hg, Wg}, Pred {Hg, Wg} + # Gold: [Hg1, Hg2] (compute on the fly) Hg1 is a NL form of subhypothesis + # Predicted: [Hp1, Hp2] (compute on the fly) + + # Compute Intersection: [(Hg_i, Hp_j), …] # tuples of (gold,pred) that matched with context (do this w/o explicit extraction) + # # filter so that a gold context and a predicted context are only attached to one tuple + # Compute recall_context (programmatically) + + # r_v_list = [] + # For (Hg_i, Hp_j) in the intersection: + # With Hg_i, Hp_j in NL, ask GPT4 → #variables and #intersection and a paragraph explanation and programmatically calculate f1_v + # Hg_i, Hp_j in NL, ask GPT4 → matching score (0, 0.5 or 1) : A) very similar B) similar but general than HypoA C) different + explanation + # r_v_list ← f1_v * score_r + # accuracy_score = mean(r_v_list) + # score = [ recall_context * mean over predicted context(context_score * var_score *rel_score )] + + # recall_context = 1.0 # COMMENT: never used + eval_rec = { + 'query': query, + 'HypoA': gold_hypo, + 'WorkflowA': gold_workflow, + 'HypoB': gen_hypo, + 'WorkflowB': gen_workflow, + } + + gold_sub_hypo_json = get_sub_hypotheses( + query=query, + hypo=gold_hypo, + workflow=gold_workflow, + dataset_meta=dataset_meta, + llm_used=llm_used, + dataset_type=dataset_type, + use_column_metadata=use_column_metadata, + ) + if len(gold_sub_hypo_json['sub_hypo']) == 0: + gold_sub_hypo_json['sub_hypo'] = [ + { + 'text': gold_hypo, + 'context': 'None', + 'variables': [], + 'relations': '', + 'explanation': 'unable to segment', + } + ] + print(f'gold_sub_hypo_json: {gold_sub_hypo_json}') + + gen_sub_hypo_json = get_sub_hypotheses( + query=query, + hypo=gen_hypo, + workflow=gen_workflow, + dataset_meta=dataset_meta, + llm_used=llm_used, + dataset_type=dataset_type, + use_column_metadata=use_column_metadata, + ) + if len(gen_sub_hypo_json['sub_hypo']) == 0: + gen_sub_hypo_json['sub_hypo'] = [ + { + 'text': gen_hypo, + 'context': 'None', + 'variables': [], + 'relations': '', + 'explanation': 'unable to segment', + } + ] + print(f'gen_sub_hypo_json: {gen_sub_hypo_json}') + + eval_rec['gold_sub_hypo'] = gold_sub_hypo_json + eval_rec['gen_sub_hypo'] = gen_sub_hypo_json + + gold_subh_covered = [] + gen_subh_to_gold_subh = dict() + gen_gold_subh_to_context = dict() + + for p_id, gen_subh in enumerate(gen_sub_hypo_json['sub_hypo']): + gen_subh_to_gold_subh[p_id] = -1 + + for g_id, gold_subh in enumerate(gold_sub_hypo_json['sub_hypo']): + if g_id in gold_subh_covered: + continue + + # match context + context_bool = is_matching_context( + gold_subh['text'], + gold_subh.get('context', ''), + gen_subh['text'], + gen_subh.get('context', ''), + llm_used, + ) + if context_bool: + context_score = 1.0 + else: + context_score = 0.0 + + if context_score == 1.0: # match only when context_score = 1.0 + gen_subh_to_gold_subh[p_id] = g_id + gold_subh_covered.append(g_id) + gen_gold_subh_to_context[f'P{p_id}||G{g_id}'] = { + 'question': f"""Comapring: GoldH: {gold_subh["text"]}, GoldC: {gold_subh['context']}\nGenH: {gen_subh['text']}, GenC: {gen_subh['context']}""", + 'answer': context_bool, + 'score': context_score, + } + break + + print(f'gen_subh_to_gold_subh: {gen_subh_to_gold_subh}') + eval_rec['gen_subh_to_gold_subh'] = gen_subh_to_gold_subh + 
eval_rec['gold_subh_covered'] = gold_subh_covered + matched_gold_gen_subh_evals = dict() + sum_accuracy_score = 0.0 + for p_id, g_id in gen_subh_to_gold_subh.items(): + if g_id >= 0: + key = f'P{p_id}||G{g_id}' + context_score = gen_gold_subh_to_context[key] + subh_eval_rec = run_eval_gold_vs_gen_NL_subhypo( + query, + gold_hypo, + gold_workflow, + gen_hypo, + gen_workflow, + dataset_meta, + llm_used, + context_score, + dataset_type=dataset_type, + use_column_metadata=use_column_metadata, + ) + sum_accuracy_score += subh_eval_rec['accuracy_score'] + matched_gold_gen_subh_evals[key] = subh_eval_rec + + eval_rec['matched_gold_gen_subh_evals'] = matched_gold_gen_subh_evals + eval_rec['recall_context'] = ( + len(gold_subh_covered) / len(gold_sub_hypo_json['sub_hypo']) + if len(gold_sub_hypo_json['sub_hypo']) + else 0.0 + ) + mean_accuracy_score = ( + sum_accuracy_score / len(gen_subh_to_gold_subh) + if len(gen_subh_to_gold_subh) + else 0.0 + ) + eval_rec['mean_accuracy_score'] = mean_accuracy_score + final_score = eval_rec['recall_context'] * mean_accuracy_score + eval_rec['final_score'] = final_score + print(f'eval_rec: {json.dumps(eval_rec, indent=2)}') + + return eval_rec diff --git a/evaluation/discoverybench/eval_utils/lm_utils.py b/evaluation/discoverybench/eval_utils/lm_utils.py new file mode 100644 index 000000000000..10486ee82294 --- /dev/null +++ b/evaluation/discoverybench/eval_utils/lm_utils.py @@ -0,0 +1,64 @@ +import os +import sys +import time + +from openai import OpenAI +from tenacity import ( + retry, + stop_after_attempt, # type: ignore + wait_random_exponential, # type: ignore +) + +if sys.version_info >= (3, 8): + from typing import Literal +else: + from typing_extensions import Literal + + +Model = Literal['gpt-4', 'gpt-3.5-turbo', 'text-davinci-003'] + +OpenAI.api_key = os.getenv('OPENAI_API_KEY') +OPENAI_GEN_HYP = { + 'temperature': 0, + 'max_tokens': 250, + 'top_p': 1.0, + 'frequency_penalty': 0, + 'presence_penalty': 0, +} + + +@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6)) +def run_chatgpt_query_multi_turn( + messages, + model_name='gpt-4-turbo', # pass "gpt4" for more recent model output + max_tokens=256, + temperature=0.0, + json_response=False, +): + response = None + num_retries = 3 + retry = 0 + while retry < num_retries: + retry += 1 + try: + client = OpenAI() + + if json_response: + response = client.chat.completions.create( + model=model_name, + response_format={'type': 'json_object'}, + messages=messages, + **OPENAI_GEN_HYP, + ) + else: + response = client.chat.completions.create( + model=model_name, messages=messages, **OPENAI_GEN_HYP + ) + break + + except Exception as e: + print(e) + print('GPT error. Retrying in 2 seconds...') + time.sleep(2) + + return response diff --git a/evaluation/discoverybench/eval_utils/openai_helpers.py b/evaluation/discoverybench/eval_utils/openai_helpers.py new file mode 100644 index 000000000000..95ab23cf9c2e --- /dev/null +++ b/evaluation/discoverybench/eval_utils/openai_helpers.py @@ -0,0 +1,190 @@ +import json + + +def OPENAI_TOPIC_GEN_MESSAGES(n=10): + return [ + { + 'role': 'system', + 'content': 'You are a helpful assistant who is not talkative. You only respond with the exact answer to a query without additional conversation.', + }, + { + 'role': 'user', + 'content': f'Given `n`, come up with a list of `n` distinct topics and their descriptions. The topics can be absolutely anything. Be as creative as possible. Return your answer as a JSON object. 
\n\nFor example, for `n`=3, a valid answer might be:\n```json\n{{"topics": [\n {{"id": 1, "topic": "cooking", "description": "Related to recipes, ingredients, chefs, etc."}},\n {{"id": 2, "topic": "sports", "description": "Related to players, stadiums, trophies, etc."}},\n {{"id": 3, "topic": "antiquing", "description": "Related to unique items, history, etc."}}\n]}}```\n\nNow, give me a list for `n`={n}. Remember, pick diverse topics from everything possible. No consecutive topics should be broadly similar. Directly respond with the answer JSON object.', + }, + ] + + +OPENAI_GEN_HYP = { + 'temperature': 1.0, + 'max_tokens': 4096, + 'top_p': 1.0, + 'frequency_penalty': 0, + 'presence_penalty': 0, +} + + +def OPENAI_SEMANTICS_GEN_MESSAGES(dependent, relationship, domain, domain_desc): + return [ + { + 'role': 'system', + 'content': 'You are a helpful assistant who is not talkative. You only respond with the exact answer to a query without additional conversation.', + }, + { + 'role': 'user', + 'content': f'Given the true relationship in a dataset and a given domain, your task is to come up with an interpretation of some real-world concepts that the relationship could be modeling from the provided domain. It\'s okay to be wrong, but suggest something reasonable. Try as much as possible to make sure that the TARGET is actually derivable from the other variables. Give your answer as a JSON object. Here\'s an example:\n\nRelationship for x2 = "(96.4 * x1 ** 3) + (88.72 * x5 ** 2) + (81.96 * x6 ** -2) + (28.13 * x3) + (97.0) + (0 * x4)"\nDomain="Sales"\nDomain description="Related to product distribution, revenues, marketing, etc."\n\nBased on this, the following real-world concepts might be applicable:\n```json\n{{\n "dependent": "x2",\n "relationship": "(96.4 * x1 ** 3) + (88.72 * x5 ** 2) + (81.96 * x6 ** -2) + (28.13 * x3) + (97.0) + (0 * x4)",\n "domain": "Sales",\n "trends": {{\n "x1": "Positive, cubic factor",\n "x2": "TARGET",\n "x3": "Positive, linear factor",\n "x4": "No relation",\n "x5": "Positive quadratic factor",\n "x6": "Positive, inverse quadratic factor"\n }},\n "interpretation": {{\n "x2": {{"description": "Volume of product sales by area", "name": "sales_area", "is_target": true}},\n "x1": {{"description": "Population by area", "name": "pop_area"}},\n "x3": {{"description": "Advertising spending", "name": "ad_spend"}},\n "x4": {{"description": "Gender ratio of marketing team", "name": "gdr_ratio_mkt_team"}},\n "x5": {{"description": "Intensity of marketing campaign", "name": "mkt_intensity"}}\n }},\n "x6": {{"description": "Distance to distribution center", "name": "dist_to_distr_ctr"}}\n}}```\n\nHere\'s a new test question:\nRelationship for {dependent} = "{relationship}"\nDomain = "{domain}"\nDomain description="{domain_desc}"\n\nRespond only with the answer JSON. Make sure that you do not forget to include the TARGET variable in the interpretation object.', + }, + ] + + +def OPENAI_SEMANTICS_GEN_W_MAP_MESSAGES( + dependent, relationship, domain, domain_desc, mapping +): + return [ + { + 'role': 'system', + 'content': 'You are a helpful assistant who is not talkative. You only respond with the exact answer to a query without additional conversation.', + }, + { + 'role': 'user', + 'content': f'Given a partial mapping from variables to real-world concepts and a true relationship in a dataset, your task is to come up with an interpretation of real-world concepts for the variables without any assigned mapping (those starting with x). Suggest something reasonable. 
The dependent variable must be derivable only from the other variables in the dependent relationship. Give your answer as a JSON object. Here\'s an example:\n\nExample partial mapping and relationship:\n```json\n{{\n "domain": "Sales",\n "domain_description": "Related to product distribution, revenues, marketing, etc.",\n "variable_mapping": {{\n "x1": {{"description": "Population by area", "name": "pop_area"}},\n "x2": {{"description": "Volume of product sales by area", "name": "sales_area"}},\n "x4": {{"description": "Gender ratio of marketing team", "name": "gdr_ratio_mkt_team"}},\n "x6": {{"description": "Distance to distribution center", "name": "dist_to_distr_ctr"}}\n }},\n "dependent_variable": "sales_area",\n "dependent_relationship": "(96.4 * pop_area ** 3) + (88.72 * x5 ** 2) + (81.96 * dist_to_distr_ctr ** -2) + (28.13 * x3) + (97.0)"\n}}```\nBased on this, an example answer would be:\n```json\n{{\n "dependent_variable": "sales_area",\n "missing_mapping": ["x3", "x5"],\n "trends": {{\n "x3": "Positive, linear factor",\n "x5": "Positive quadratic factor"\n }},\n "interpretation": {{\n "x3": {{"description": "Advertising spending", "name": "ad_spend"}},\n "x5": {{"description": "Intensity of marketing campaign", "name": "mkt_intensity"}}\n }}\n}}```\n\nHere\'s a new test question:\n```json\n{{\n "domain": "{domain}",\n "domain_description": "{domain_desc}",\n "variable_mapping": {json.dumps(mapping, indent=2)},\n "dependent_variable": "{dependent}",\n "dependent_relationship": "{relationship}"\n}}```\nRespond only with the answer JSON.', + }, + ] + + +def OPENAI_SEMANTICS_GEN_SUMMARY_MESSAGES(dataset): + return [ + { + 'role': 'system', + 'content': 'You are a helpful assistant who is not talkative. You only respond with the exact answer to a query without additional conversation.', + }, + { + 'role': 'user', + 'content': f'Given the following descriptions of the columns of a dataset, your task is to come up with a natural language overview of the dataset, which should include (1) what the dataset is about, (2) how the data was collected, (3) when the data was collected, and (3) for what purpose the data was collected. Be specific and creative.\n\nExample dataset:\n```json\n{{ \n "dataset": {{ \n "x6": {{"description": "Ancient artifact significance score", "name": "artifact_significance_score", "is_target": true}},\n "x1": {{"description": "Distance to ancient city center", "name": "dist_to_ancient_city_ctr"}},\n "x2": {{"description": "Quantity of discovered relics", "name": "relic_discovery_qty"}},\n "x3": {{"description": "Years since last archaeological expedition", "name": "years_since_exp"}},\n "x4": {{"description": "Number of artifacts in excavation site", "name": "artifact_qty"}},\n "x5": {{"description": "Soil fertility coefficient", "name": "soil_fertility_coef"}},\n "x7": {{"description": "Distance to ancient burial grounds", "name": "dist_to_burial_grounds"}},\n "x8": {{"description": "Population estimate of ancient civilization", "name": "ancient_civilization_pop_estimate"}},\n "x9": {{"description": "Temperature variation in excavation region", "name": "temp_variation"}}\n }}\n}}```\nExample description:\nThis dataset is about archaeological explorations and findings linked to ancient civilizations. The data was collected in the form of field metrics during various archaeological expeditions during the late mid-20th century. 
The purpose of the data collection is to evaluate the significance of ancient artifacts discovered during excavations.\n\nHere is a new test dataset.\n{json.dumps(dataset, indent=2)}\nProvide only the description.', + }, + ] + + +def OPENAI_GEN_HYPO_MESSAGES(dataset): + return [ + { + 'role': 'system', + 'content': 'You are a helpful assistant who is not talkative. You only respond with the exact answer to a query without additional conversation.', + }, + { + 'role': 'user', + 'content': f'Given a dataset with its descriptions and the true functional relationship between its variables, your task is to generate 3 levels of hypotheses for the stated relationship in plain English. The three levels are "broad", "medium" and "narrow". Make sure that the hypotheses sound natural. *Only include concepts for variables that are present in the provided functional relationship.* Give your answer as a JSON.\n\nFor example, an example dataset might be the following:\n```json\n{{\n "domain": "cybersecurity",\n "summary": "This dataset is about measuring cybersecurity threats in a system. The data was collected by monitoring various cybersecurity metrics in a network environment. The purpose of the data collection is to assess and predict potential cybersecurity risks and vulnerabilities.",\n "variables": [\n {{\n "description": "Level of cybersecurity threat",\n "name": "cybersecurity_threat",\n "is_target": true\n }},\n {{\n "description": "Number of failed login attempts",\n "name": "failed_login_attempts"\n }},\n {{\n "description": "Amount of encrypted data",\n "name": "encrypted_data"\n }},\n {{\n "description": "Frequency of software updates",\n "name": "software_updates"\n }},\n {{\n "description": "Number of antivirus software installed",\n "name": "antivirus_software"\n }},\n {{\n "description": "Quality of firewall protection",\n "name": "firewall_quality"\n }}\n ],\n "relationship": {{\n "dependent": "cybersecurity_threat",\n "relation": "-53.5*encrypted_data**2 - 53.85*failed_login_attempts**2 + 67.75*firewall_quality - 92.16 - 36.68/software_updates**3"\n }}\n}}```\nGiven this dataset, the following is a valid answer:\n```json\n{{\n "broad": {{\n "instruction": "Be vague. Only indicate which concepts might be related but not how they are related",\n "hypothesis": "Threat to cybersecurity is influenced by several factors including the amount of encrypted data, the number of failed login attempts, the quality of the firewall, as well as how often the software is updated."\n }},\n "medium": {{\n "instruction": "Be slightly more specific. For each factor, indicate carefully whether it positively or negatively affects the relationship, but do not indicate what the exponent is.",\n "hypothesis": "Cybersecurity threat tends to decrease with the amount of data encryption, the number of failed login attempts, as well as the frequency of software updates to some extent, while improvement in the firewall quality has a positive effect."\n }},\n "narrow": {{\n "instruction": "Be specific. Communicate the concepts, whether there is a positive or negative effect (be careful), and the meaning of the exponent",\n "hypothesis": "The threat to cybersecurity interacts in a complex manner with various factors. As the amount of encrypted data increases, there is a quadratic decrease in threat. Similarly for the number of failed login attempts, there is a negative quadratic relationship. The quality of the firewall protection on the other hand demonstrates a positive and linear relationship. 
Finally, the frequency of software updates has an inverse cubic relationship to the threat."\n }},\n}}\n```\n\nBased on this, provide an answer for the following test dataset:\n```json\n{dataset}```\nRespond only with a JSON.', + }, + ] + + +def create_prompt(usr_msg): + return [ + { + 'role': 'system', + 'content': 'You are a helpful assistant who is not talkative. You only respond with the exact answer to a query without additional conversation.', + }, + {'role': 'user', 'content': usr_msg}, + ] + + +def get_response(client, prompt, max_retry=5, model='gpt-3.5-turbo', verbose=False): + n_try = 0 + while n_try < max_retry: + response = client.chat.completions.create( + model=model, messages=create_prompt(prompt), **OPENAI_GEN_HYP + ) + + # COMMENT: changed from + # response.choices[0].message.content.strip().strip('```json').strip('```') + content = response.choices[0].message.content + cleaned_content = content.split('```json')[1].split('```')[0].strip() + output = cleaned_content + try: + response_json = json.loads(output) + return response_json + except ValueError: + if verbose: + print(f'Bad JSON output:\n\n{output}') + n_try += 1 + if n_try < max_retry: + if verbose: + print('Retrying...') + else: + if verbose: + print('Retry limit reached') + return None + + +def get_code_fix( + client, code, error, max_retry=5, model='gpt-3.5-turbo', verbose=False +): + prompt = f"""\ +Given the following code snippet and error message, provide a single-line fix for the error. \ +Note that the code is going to be executed using python `eval`. \ +The code should be executable and should not produce the error message. Be as specific as possible. + +Here's the code and the error: +{{ + "code": "{code}", + "error": "{error}" +}} + +Return only a JSON object with the fixed code in the following format: +```json +{{ + "fixed_code": "..." +}}""" + response = get_response( + client, prompt, max_retry=max_retry, model=model, verbose=verbose + ) + return response + + +def get_new_hypothesis( + client, target, old, expr, cols, model='gpt-3.5-turbo', verbose=False +): + prompt = f"""\ +Given a target column from a dataset, a pandas expression to derive the column from existing columns, a list of \ +existing columns, and a previously written hypothesis text, carefully check if the hypothesis text is consistent with \ +the pandas expression or not. If it is consistent, simply return the hypothesis as it is. If it is not consistent, \ +provide a new natural language hypothesis that is consistent with the pandas expression using only the provided \ +information. Be specific. + +Here's the information: +```json +{{ + "target_column": "{target}", + "pandas_expression": "{expr}", + "existing_columns": {json.dumps(cols, indent=4)} + "old_hypothesis": "{old}", +}}``` + +Give your answer as a new JSON with the following format: +```json +{{ + "hypothesis": "..." +}}""" + response = get_response(client, prompt, model=model, verbose=verbose) + return response + + +def replace_variable(client, expr, old, new, model='gpt-3.5-turbo', verbose=False): + prompt = f"""\ +Given a pandas "expression", replace mentions of the "old" column with its "new" value such that the resultant \ +expression is equivalent to the original expression. + +Here's the information: +```json +{{ + "expression": "{expr}", + "old": "{old}", + "new": "{new}" +}}``` + +Give your answer as a new JSON with the following format: +```json +{{ + "new_expression": "..." 
+}}""" + response = get_response(client, prompt, model=model, verbose=verbose) + return response diff --git a/evaluation/discoverybench/eval_utils/openai_semantic_gen_prompts.py b/evaluation/discoverybench/eval_utils/openai_semantic_gen_prompts.py new file mode 100644 index 000000000000..a0b5438e4c8a --- /dev/null +++ b/evaluation/discoverybench/eval_utils/openai_semantic_gen_prompts.py @@ -0,0 +1,151 @@ +common_hypothesis_features = [ + '1-2 sentences', + 'surprising finding', + 'includes numeric concepts', + 'includes categorical concepts', + 'includes binary concepts', +] +hypothesis_features = [ + ['requires within-cluster analysis'], + ['requires across-cluster analysis'], + ['corresponds to a polynomial relationship of some columns'], + ['corresponds to a ratio between some columns'], + ['requires temporal analysis'], + ['relationship is based on descriptive statistics of some columns'], + ['requires concepts based on percentage or percentiles'], + ['relationship is only applicable to one cluster in the data and not the others'], +] + +column_features = [ + [ + 'must have one target column', + 'must have quantifiable columns', + 'must have a few categorical columns', + 'make sure the categorical column values do not contain special characters', + 'include a few distractor columns', + ] +] + +common_pandas_features = [ + 'must be executable using python `eval` to create the target column in variable `df` (pandas dataframe)', + "for e.g., df['A']**2 + 3*df['B'] + 9, np.where(df['A'] > 3, 'Yes', 'No'), etc.", + 'variables in pandas_expression must be from the existing columns listed above', + 'variables in pandas_expression must NOT contain the target column itself', +] +pandas_features = [ + ['expression is a quadratic polynomial'], + ['expression is a cubic polynomial'], + ['expression is a ratio of existing columns'], + ['expression is derived through logical combination of existing columns'], + # workflow +] +pandas_features = [common_pandas_features + p for p in pandas_features] + +common_derived_features = [ + '1-2 sentences', + 'includes numeric concepts', + 'includes categorical concepts', + 'includes binary concepts', +] +derived_features = [common_derived_features + h for h in hypothesis_features] +hypothesis_features = [common_hypothesis_features + h for h in hypothesis_features] + +PROMPT_HYP = """\ +Given a dataset topic and description, generate an interesting hypothesis based on \ +the provided instructions. Be creative and come up with an unusual finding. + +```json +{ + "topic": "%s", + "description": "%s", + "hypothesis_features": %s, + "hypothesis": "..." +}``` + +Give your answer as a new JSON with the following format: +```json +{ + "hypothesis": "..." +} +```""" + +PROMPT_COL = """\ +Given a dataset topic, its description, and a true hypothesis that can be determined from it, \ +generate a list of valid columns based on the provided instructions. + +```json +{ + "topic": "%s", + "description": "%s", + "hypothesis": "%s", + "column_instructions": %s, + "columns": [ + { + "col_name": "...", # should be an "_"-separated string + "description": "...", + "data_type": "...", # should be executable using python's `eval` function. 
E.g., str, float, int, bool + "data_range": {...}, # should be either {"min": ..., "max": ...} or {"values": [...]} + "is_distractor": true/false, # boolean indicating whether this is a distractor that could cause confusion during data analysis + "is_target": true/false # boolean indicating whether this is the target variable for the hypothesis; at least one column should be the target + }, + ... + ], + "pandas_instructions": %s, + "pandas_equation_for_hypothesis": { + "target_col": "...", + "target_col_type": "...", + "target_col_range": {...}, + "independent_cols_in_pandas_expression": [], # list of column names that will be used to derive the target column + "pandas_expression": "..." # expression to derive df[target_col] using df[ind_col1], df[ind_col2], etc. + } +}``` + +Give your answer as a new JSON with the "columns" and "pandas_equation_for_hypothesis" keys filled using the following format: +```json +{ + "columns": [...], + "pandas_equation_for_hypothesis": {...} +} +```""" + +PROMPT_DER = """\ +Given a dataset topic, description, a true hypothesis that can be determined from the data, \ +and a target column from the dataset, generate a hypothesis for the target column using new independent columns not present in the existing columns. + +```json +{ + "topic": "%s", + "description": "%s", + "hypothesis": "%s", + "existing_columns": %s, + "target_column": "%s", + "new_to_target_instructions": %s, + "new_to_target_hypothesis": "...", # describe a relationship between new columns that explains the target column + "new_columns_for_target": [ # do not repeat any of the existing columns in the dataset + { + "col_name": "...", # should be an "_"-separated string + "description": "...", + "data_type": "...", # should be executable using python's `eval` function. E.g., str, float, int, bool + "data_range": {...}, # should be either {"min": ..., "max": ...} or {"values": [...]} + }, + ... + ], + "pandas_instructions": %s, + "pandas_equation_for_new_to_target_hypothesis": { + "target_col": "...", + "target_col_type": "...", + "target_col_range": {...}, + "independent_cols_in_pandas_expression": [], # list of column names from new_columns_for_target that will be used to derive target_col + "pandas_expression": "..." # expression to derive df[target_col] using df[ind_col1], df[ind_col2], etc. 
+ } +}``` + +Give your answer as a new JSON with the "new_to_target_hypothesis", "new_columns_for_target", and \ +"pandas_equation_for_new_to_target_hypothesis" keys filled using the following format: +```json +{ + "new_to_target_hypothesis": "...", + "new_columns_for_target": [...], + "pandas_equation_for_new_to_target_hypothesis": {...} +} +```""" diff --git a/evaluation/discoverybench/eval_utils/response_parser.py b/evaluation/discoverybench/eval_utils/response_parser.py new file mode 100644 index 000000000000..b5de82b5df9e --- /dev/null +++ b/evaluation/discoverybench/eval_utils/response_parser.py @@ -0,0 +1,52 @@ +workflow_summary_markers = [ + 'WORKFLOW SUMMARY', + 'WORKFLOW_SUMMARY', + 'WORKFLOW-SUMMARY', + 'Workflow Summary', +] + +final_answer_markers = [ + 'FINAL ANSWER', + 'FINAL_ANSWER', + 'FINAL-ANSWER', + 'Final Answer', + 'Scientific Hypothesis', + 'Hypothesis', +] + +next_agent_markers = [ + 'NEXT AGENT', + 'NEXT-AGENT', + 'NEXT_AGENT', + 'FEEDBACK', +] + + +def extract_between(content, start_markers, end_markers=None): + for marker in start_markers: + if marker in content: + result = content.split(marker, 1)[1] + if end_markers: + for end_marker in end_markers: + if end_marker in result: + result = result.split(end_marker, 1)[0] + return result + return '' + + +def extract_gen_hypo_from_logs(content: str): + error = '' + + gen_workflow = extract_between( + content, workflow_summary_markers, final_answer_markers + ) + + if not gen_workflow: + error += 'No Workflow Summary found in the line. | ' + + gen_hypothesis = extract_between(content, final_answer_markers, next_agent_markers) + + if not gen_hypothesis: + error += 'No Final Answer in the line.' + + return gen_hypothesis, gen_workflow, error diff --git a/evaluation/discoverybench/run_infer.py b/evaluation/discoverybench/run_infer.py new file mode 100644 index 000000000000..72148a64e759 --- /dev/null +++ b/evaluation/discoverybench/run_infer.py @@ -0,0 +1,492 @@ +import asyncio +import json +import os + +import git +import pandas as pd + +from evaluation.discoverybench.eval_utils.eval_w_subhypo_gen import ( + run_eval_gold_vs_gen_NL_hypo_workflow, +) +from evaluation.discoverybench.eval_utils.response_parser import ( + extract_gen_hypo_from_logs, +) +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + codeact_user_response, + compatibility_for_eval_history_pairs, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AgentConfig, + AppConfig, + SandboxConfig, + get_llm_config_arg, + parse_arguments, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +EVALUATION_LLM = 'gpt-4-1106-preview' + +DATA_FILES = {} + +LIBRARIES = [ + 'pandas', + 'numpy', + 'scipy', + 'matplotlib', + 'seaborn', + 'scikit-learn', + 'statsmodels', +] + +AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { + 'CodeActAgent': codeact_user_response, +} + +AGENT_CLS_TO_INST_SUFFIX = { + 'CodeActAgent': 'When you think you have fixed the issue through code changes, please run the following command: exit .\n' +} + + +def get_config( + metadata: EvalMetadata, +) -> AppConfig: + config = AppConfig( + 
default_agent=metadata.agent_class, + run_as_openhands=False, + runtime='eventstream', + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image='python:3.12-bookworm', + enable_auto_lint=True, + use_host_network=False, + ), + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + config.set_llm_config(metadata.llm_config) + agent_config = AgentConfig( + function_calling=False, + codeact_enable_jupyter=True, + codeact_enable_browsing_delegate=True, + ) + config.set_agent_config(agent_config) + return config + + +def get_dv_query_for_real( + datasets, question, domain_knowledge=None, workflow_tags=None +): + """ + Prepare a structured query for the agent to execute on the specified datasets. + + This function constructs a query by compiling metadata from the provided datasets, along with any relevant domain knowledge and workflow tags. + + Args: + datasets: List of datasets + question: Query to be answered + domain_knowledge: Domain knowledge if any + workflow_tags: Workflow tags if any + + Returns: + query_to_dv: Query to be run on the dataset + dataset_meta: Metadata of the dataset + """ + + dataset_meta = '' + for dataset_metadata in datasets: + dataset_meta += 'Dataset name: ' + dataset_metadata['name'] + dataset_meta += 'Dataset description: ' + dataset_metadata['description'] + dataset_meta += '\nBrief description of columns: ' + for col in dataset_metadata['columns']['raw']: + dataset_meta += col['name'] + ': ' + col['description'] + ', ' + + query_to_dv = dataset_meta + + query_to_dv += f'\nQuery: {question}' + + if domain_knowledge: + query_to_dv += ( + '\nAdditionally, we provide some hints that might be useful to solve the task. Domain Knowledge: \n' + + domain_knowledge + + '.\n' + ) + + if workflow_tags: + query_to_dv += 'The meta tags are: ' + workflow_tags + '.\n' + + query_to_dv += ( + 'In the final answer, please write down a scientific hypothesis in ' + 'natural language, derived from the provided dataset, clearly stating the ' + 'context of hypothesis (if any), variables chosen (if any) and ' + 'relationship between those variables (if any) including any statistical significance.' + 'Also generate a summary of the full workflow starting from data loading that led to the final answer as WORKFLOW SUMMARY:' + ) + + # Run the NL query through datavoyager + return query_to_dv, dataset_meta + + +def initialize_runtime(runtime: Runtime, data_files: list[str]): + """ + Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. 
+ """ + logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}") + obs: CmdOutputObservation + + action = CmdRunAction(command='mkdir -p /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + action = CmdRunAction(command='cd /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + for file in data_files: + runtime.copy_to( + file, + '/workspace', + ) + + for lib in LIBRARIES: + action = CmdRunAction(command=f'pip install {lib}') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") + + +def get_last_agent_finish_action(state: State) -> AgentFinishAction: + for event in reversed(state.history): + if isinstance(event, AgentFinishAction): + return event + return None + + +def get_last_message_action(state: State) -> MessageAction: + for event in reversed(state.history): + if isinstance(event, MessageAction): + return event + return None + + +def complete_runtime(state: State): + last_agent_finish_action = get_last_agent_finish_action(state) + last_agent_message_action = get_last_message_action(state) + + if last_agent_finish_action is not None: + final_message_1 = last_agent_finish_action.thought + gen_hypo_1, gen_workflow_1, error_1 = extract_gen_hypo_from_logs( + final_message_1 + ) + else: + gen_hypo_1, gen_workflow_1, error_1 = '', '', '' + + if last_agent_message_action is not None: + final_message_2 = last_agent_message_action.content + gen_hypo_2, gen_workflow_2, error_2 = extract_gen_hypo_from_logs( + final_message_2 + ) + else: + gen_hypo_2, gen_workflow_2, error_2 = '', '', '' + + if gen_hypo_1 and gen_hypo_2: + test_result = { + 'gen_hypo': last_agent_finish_action.thought + if last_agent_finish_action + else last_agent_message_action.content, + 'gen_workflow': '', + 'error': '', + } + return test_result + + test_result = { + 'gen_hypo': gen_hypo_1 if gen_hypo_1 else gen_hypo_2, + 'gen_workflow': gen_workflow_1 if gen_workflow_1 else gen_workflow_2, + 'error': error_1 if error_1 else error_2, + } + + return test_result + + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +): + """ + Process and evaluate a single instance of the dataset. + + This function executes the OpenHands agent + for a specific instance of the dataset. It retrieves + the agent's results and evaluates them against the gold + hypothesis. 
+ + Args: + instance: A single row of the dataset + metadata: Metadata for the evaluation + reset_logger: Whether to reset the logger + + Returns: + output: EvalOutput object + """ + + config = get_config(metadata) + + # use a session id for concurrent evaluation + sid = 'ID_' + str(instance.instance_id) + + # Setup the logger properly, so you can run + # multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir) + else: + logger.info(f'Starting evaluation for instance {instance.instance_id}.') + + problem_statement, dataset_metadata = get_dv_query_for_real( + datasets=instance.datasets, + question=instance.query, + domain_knowledge=instance.domain_knowledge, + workflow_tags=instance.workflow_tags, + ) + + # Prepare instruction + instruction = ( + f'You are a discovery agent who can execute a python code only once to answer a query based on one or more datasets. The datasets will be present in the current directory.\n\n' + 'Environment has been set up for you to start working. You may assume all necessary tools and datasets are installed.\n\n' + '# Problem Statement\n' + f'{problem_statement}\n\n' + ) + instruction += ( + 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n' + 'You should NOT modify any existing test case files. If needed, you can add new test cases in a NEW file to reproduce the issue.\n' + 'You SHOULD INCLUDE PROPER INDENTATION in your edit commands.\n' + ) + # NOTE: You can actually set slightly different instruction for different agents + instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class] + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + runtime = create_runtime(config, sid=sid) + call_async_from_sync(runtime.connect) + initialize_runtime(runtime, instance.data_files) + + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( + metadata.agent_class + ), + ) + ) + + if state is None: + raise ValueError('State should not be None.') + + metrics = state.metrics.get() if state.metrics else None + test_result = complete_runtime(state) + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + # remove when it becomes unnecessary + histories = compatibility_for_eval_history_pairs(state.history) + + # DiscoveryBench Evaluation + eval_rec = run_eval_gold_vs_gen_NL_hypo_workflow( + query=instance.query, + gold_hypo=instance.gold_hypo, + gold_workflow='', + gen_hypo=test_result['gen_hypo'], + gen_workflow='', + dataset_meta=instance.dataset_metadata, + llm_used=EVALUATION_LLM, + dataset_type='real', + ) + + test_result['eval_rec'] = eval_rec + + output = EvalOutput( + instance_id=str(instance.instance_id), + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + + return output + + +def update_csv_name(name): + name = name.replace('-', '_') + + if 'meta_regression' in name: + name = name.replace('meta_regression', 'meta-regression') + if 'ML_enabled' in name: + name = name.replace('ML_enabled', 'ML-enabled') + + 
return name + + +def list_csv_files(list_of_datasets): + res = [] + for ele in list_of_datasets: + for key, value in ele.items(): + if key == 'name': + csv_file_name = update_csv_name(value) + res.append(DATA_FILES[csv_file_name]) + return res + + +def create_dataset(repo_location: str, split: str = 'test'): + """ + Create a dataset from the discoverybench repository + by walking through the repository and extracting metadata + from the metadata_{}.json files + + Args: + repo_location: Location of the repository + split: Split of the dataset to use + + Returns: + df: DataFrame containing the dataset instances + """ + + data_dict = {} + + data_location = os.path.join(repo_location, 'discoverybench', 'real', split) + answer_key_location = os.path.join(repo_location, 'eval', 'answer_key_real.csv') + + idx = 0 + + for root, dirs, files in os.walk(data_location): + for file in files: + if file.endswith('.json'): + if 'metadata' in file: + metadata = json.load(open(os.path.join(root, file))) + + dataset = root.split('/')[-1] + metadata_id = file.split('_')[-1].split('.')[0] + domain = metadata.get('domain', '') + domain_knowledge = metadata.get('domain_knowledge', '') + workflow_tags = metadata.get('workflow_tags', '') + datasets = metadata.get('datasets', []) + queries = metadata.get('queries', []) + gold_workflow = metadata.get('workflow') + + # loop through queries list to get queries + # and each query has qid; add that to dictionary + for query in queries[0]: + qid = query.get('qid', '') + + data = { + 'dataset': dataset, + 'metadata_id': metadata_id, + 'qid': qid, + 'domain': domain, + 'domain_knowledge': domain_knowledge, + 'workflow_tags': workflow_tags, + 'datasets': datasets, + 'question_type': query['question_type'], + 'query': query['question'], + 'gold_workflow': gold_workflow, + 'dataset_metadata': metadata, + } + + data_dict[idx] = data + idx += 1 + + if file.endswith('.csv'): + DATA_FILES[file] = os.path.join(root, file) + if file.endswith('.txt'): + DATA_FILES[file] = os.path.join(root, file) + + df = pd.DataFrame.from_dict(data_dict, orient='index') + + df['instance_id'] = df.index + + df['data_files'] = df['datasets'].apply(lambda x: list_csv_files(x)) + + answer_key = pd.read_csv(answer_key_location) + + answer_key = answer_key.rename( + columns={ + 'metadataid': 'metadata_id', + 'query_id': 'qid', + 'gold_hypothesis': 'gold_hypothesis', + } + ) + + df['qid'] = df['qid'].astype(int) + df['metadata_id'] = df['metadata_id'].astype(int) + + answer_key['qid'] = answer_key['qid'].astype(int) + answer_key['metadata_id'] = answer_key['metadata_id'].astype(int) + + df = pd.merge(df, answer_key, on=['dataset', 'metadata_id', 'qid'], how='left') + + return df + + +if __name__ == '__main__': + args = parse_arguments() + + # clone git repositor for csv files + repo_url = 'https://github.com/allenai/discoverybench.git' + repo_location = 'git-discoverybench-allenai' + + try: + git.Repo.clone_from(repo_url, repo_location) + except git.exc.GitCommandError: + print('Repository already exists') + + dataset = create_dataset(repo_location) + + # check if there is any empty csv_file + if dataset['data_files'].isnull().any(): + raise ValueError('Some csv files are missing.') + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + metadata = make_metadata( + llm_config, + 'discoverybench-python', + args.agent_cls, + args.max_iterations, + 
args.eval_note, + args.eval_output_dir, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + instances = prepare_dataset(dataset, output_file, args.eval_n_limit) + + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + ) diff --git a/evaluation/discoverybench/scripts/run_infer.sh b/evaluation/discoverybench/scripts/run_infer.sh new file mode 100755 index 000000000000..8b9fffd7c579 --- /dev/null +++ b/evaluation/discoverybench/scripts/run_infer.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -eo pipefail + +source "evaluation/utils/version_control.sh" + +MODEL_CONFIG=$1 +COMMIT_HASH=$2 +AGENT=$3 +EVAL_LIMIT=$4 +NUM_WORKERS=$5 + +if [ -z "$NUM_WORKERS" ]; then + NUM_WORKERS=1 + echo "Number of workers not specified, use default $NUM_WORKERS" +fi + +# ################################################################################ + +checkout_eval_branch + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi + +get_agent_version + +echo "AGENT: $AGENT" +echo "AGENT_VERSION: $AGENT_VERSION" +echo "MODEL_CONFIG: $MODEL_CONFIG" + +COMMAND="poetry run python evaluation/discoverybench/run_infer.py \ + --agent-cls $AGENT \ + --llm-config $MODEL_CONFIG \ + --max-iterations 10 \ + --max-chars 10000000 \ + --eval-num-workers $NUM_WORKERS \ + --eval-note $AGENT_VERSION" + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Run the command +eval $COMMAND diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py index c02cd0aee737..1fa0c00e6d6a 100644 --- a/evaluation/gaia/run_infer.py +++ b/evaluation/gaia/run_infer.py @@ -12,6 +12,7 @@ EvalMetadata, EvalOutput, codeact_user_response, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -166,7 +167,7 @@ def process_instance( model_answer_raw = '' # get the last message or thought from the agent - for event in state.history.get_events(reverse=True): + for event in reversed(state.history): if event.source == 'agent': if isinstance(event, AgentFinishAction): model_answer_raw = event.thought @@ -203,7 +204,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py index 873cb7f89694..e437f2b6075a 100644 --- a/evaluation/gorilla/run_infer.py +++ b/evaluation/gorilla/run_infer.py @@ -10,6 +10,7 @@ EvalMetadata, EvalOutput, codeact_user_response, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -101,7 +102,7 @@ def process_instance( raise ValueError('State should not be None.') # retrieve the last message from the agent - model_answer_raw = state.history.get_last_agent_message() + model_answer_raw = state.get_last_agent_message() # attempt to parse model_answer ast_eval_fn = instance['ast_eval'] @@ -114,7 +115,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here 
# remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) output = EvalOutput( instance_id=instance_id, diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py index 8fd4034c9d5e..58db2e404fc8 100644 --- a/evaluation/gpqa/run_infer.py +++ b/evaluation/gpqa/run_infer.py @@ -28,6 +28,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -244,7 +245,7 @@ def process_instance( 'C': False, 'D': False, } - for event in state.history.get_events(reverse=True): + for event in reversed(state.history): if ( isinstance(event, AgentFinishAction) and event.source != 'user' @@ -300,7 +301,7 @@ def process_instance( instance_id=str(instance.instance_id), instruction=instruction, metadata=metadata, - history=state.history.compatibility_for_eval_history_pairs(), + history=compatibility_for_eval_history_pairs(state.history), metrics=metrics, error=state.last_error if state and state.last_error else None, test_result={ diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py index 25fee65561fc..2aa184758b33 100644 --- a/evaluation/humanevalfix/run_infer.py +++ b/evaluation/humanevalfix/run_infer.py @@ -21,6 +21,7 @@ EvalMetadata, EvalOutput, codeact_user_response, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -255,7 +256,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index ddc044088bb1..5e3205fefe2e 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -13,6 +13,7 @@ prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, + update_llm_config_for_completions_logging, ) from openhands.controller.state.state import State from openhands.core.config import ( @@ -55,18 +56,14 @@ def get_config( workspace_base=None, workspace_mount_path=None, ) - if metadata.llm_config.log_completions: - metadata.llm_config.log_completions_folder = os.path.join( - metadata.eval_output_dir, 'llm_completions', instance_id + config.set_llm_config( + update_llm_config_for_completions_logging( + metadata.llm_config, metadata.eval_output_dir, instance_id ) - logger.info( - f'Logging LLM completions for instance {instance_id} to ' - f'{metadata.llm_config.log_completions_folder}' - ) - config.set_llm_config(metadata.llm_config) + ) agent_config = AgentConfig( codeact_enable_jupyter=True, - codeact_enable_browsing_delegate=True, + codeact_enable_browsing=True, codeact_enable_llm_editor=False, ) config.set_agent_config(agent_config) @@ -132,7 +129,7 @@ def process_instance( # # result evaluation # # ============================================= - histories = [event_to_dict(event) for event in state.history.get_events()] + histories = [event_to_dict(event) for event in state.history] test_result: TestResult = test_class.verify_result(runtime, histories) metrics = 
state.metrics.get() if state.metrics else None diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py new file mode 100644 index 000000000000..52ec927cd334 --- /dev/null +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -0,0 +1,44 @@ +from evaluation.integration_tests.tests.base import BaseIntegrationTest, TestResult +from openhands.events.action import AgentFinishAction, MessageAction +from openhands.events.event import Event +from openhands.events.observation import AgentDelegateObservation +from openhands.runtime.base import Runtime + + +class Test(BaseIntegrationTest): + INSTRUCTION = 'Look at https://github.com/All-Hands-AI/OpenHands/pull/8, and tell me what is happening there and what did @asadm suggest.' + + @classmethod + def initialize_runtime(cls, runtime: Runtime) -> None: + pass + + @classmethod + def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: + # check if the "The answer is OpenHands is all you need!" is in any message + message_actions = [ + event + for event in histories + if isinstance( + event, (MessageAction, AgentFinishAction, AgentDelegateObservation) + ) + ] + for event in message_actions: + if isinstance(event, AgentDelegateObservation): + content = event.content + elif isinstance(event, AgentFinishAction): + content = event.outputs.get('content', '') + elif isinstance(event, MessageAction): + content = event.content + else: + raise ValueError(f'Unknown event type: {type(event)}') + + if ( + 'non-commercial' in content + or 'MIT' in content + or 'Apache 2.0' in content + ): + return TestResult(success=True) + return TestResult( + success=False, + reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. 
Messages: {message_actions}', + ) diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py index 5b7d35f21130..116b438b3ee9 100644 --- a/evaluation/logic_reasoning/run_infer.py +++ b/evaluation/logic_reasoning/run_infer.py @@ -8,6 +8,7 @@ EvalMetadata, EvalOutput, codeact_user_response, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -225,7 +226,7 @@ def process_instance( raise ValueError('State should not be None.') final_message = '' - for event in state.history.get_events(reverse=True): + for event in reversed(state.history): if isinstance(event, AgentFinishAction): final_message = event.thought break @@ -247,7 +248,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py index 9c2aaf1e0963..715bdaa470ae 100644 --- a/evaluation/miniwob/run_infer.py +++ b/evaluation/miniwob/run_infer.py @@ -10,10 +10,13 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + codeact_user_response, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, + update_llm_config_for_completions_logging, ) from openhands.controller.state.state import State from openhands.core.config import ( @@ -29,7 +32,10 @@ CmdRunAction, MessageAction, ) -from openhands.events.observation import CmdOutputObservation +from openhands.events.observation import ( + BrowserOutputObservation, + CmdOutputObservation, +) from openhands.runtime.base import Runtime from openhands.runtime.browser.browser_env import ( BROWSER_EVAL_GET_GOAL_ACTION, @@ -37,7 +43,11 @@ ) from openhands.utils.async_utils import call_async_from_sync -SUPPORTED_AGENT_CLS = {'BrowsingAgent'} +SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'} + +AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { + 'CodeActAgent': codeact_user_response, +} def get_config( @@ -47,25 +57,32 @@ def get_config( config = AppConfig( default_agent=metadata.agent_class, run_as_openhands=False, - runtime='eventstream', + runtime=os.environ.get('RUNTIME', 'eventstream'), max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='xingyaoww/od-eval-miniwob:v1.0', enable_auto_lint=True, use_host_network=False, browsergym_eval_env=env_id, + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_remote_runtime_alive=False, ), # do not mount workspace workspace_base=None, workspace_mount_path=None, ) - config.set_llm_config(metadata.llm_config) + config.set_llm_config( + update_llm_config_for_completions_logging( + metadata.llm_config, metadata.eval_output_dir, env_id + ) + ) return config def initialize_runtime( runtime: Runtime, -) -> str: +) -> tuple[str, BrowserOutputObservation]: """Initialize the runtime for the agent. This function is called before the runtime is used to run the agent. 
@@ -85,8 +102,14 @@ def initialize_runtime( logger.info(obs, extra={'msg_type': 'OBSERVATION'}) goal = obs.content + # Run noop to get the initial browser observation (e.g., the page URL & content) + action = BrowseInteractiveAction(browser_actions='noop(1000)') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") - return goal + return goal, obs def complete_runtime( @@ -117,7 +140,7 @@ def process_instance( metadata: EvalMetadata, reset_logger: bool = True, ) -> EvalOutput: - env_id = instance.id + env_id = instance.instance_id config = get_config(metadata, env_id) # Setup the logger properly, so you can run multi-processing to parallelize the evaluation @@ -129,7 +152,12 @@ def process_instance( runtime = create_runtime(config) call_async_from_sync(runtime.connect) - task_str = initialize_runtime(runtime) + task_str, obs = initialize_runtime(runtime) + + task_str += ( + f'\nInitial browser state (output of `noop(1000)`):\n{obs.get_agent_obs_text()}' + ) + state: State | None = asyncio.run( run_controller( config=config, @@ -137,6 +165,9 @@ def process_instance( content=task_str ), # take output from initialize_runtime runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[ + metadata.agent_class + ], ) ) @@ -152,19 +183,19 @@ def process_instance( # Instruction is the first message from the USER instruction = '' - for event in state.history.get_events(): + for event in state.history: if isinstance(event, MessageAction): instruction = event.content break return_val = complete_runtime(runtime) logger.info(f'Return value from complete_runtime: {return_val}') - reward = max(return_val['rewards']) + reward = max(return_val['rewards'], default=0) # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py index 8017b194d8d8..2165c3c03fe4 100644 --- a/evaluation/mint/run_infer.py +++ b/evaluation/mint/run_infer.py @@ -13,6 +13,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -28,6 +29,7 @@ from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import ( + Action, CmdRunAction, MessageAction, ) @@ -45,7 +47,10 @@ def codeact_user_response_mint(state: State, task: Task, task_config: dict[str, task=task, task_config=task_config, ) - last_action = state.history.get_last_action() + last_action = next( + (event for event in reversed(state.history) if isinstance(event, Action)), + None, + ) result_state: TaskState = env.step(last_action.message or '') state.extra_data['task_state'] = result_state @@ -202,7 +207,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = 
state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py index deec068f3392..2bb667e3c947 100644 --- a/evaluation/ml_bench/run_infer.py +++ b/evaluation/ml_bench/run_infer.py @@ -24,6 +24,7 @@ EvalMetadata, EvalOutput, codeact_user_response, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -256,7 +257,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git a/evaluation/scienceagentbench/Dockerfile b/evaluation/scienceagentbench/Dockerfile new file mode 100644 index 000000000000..70ed92cc4dc8 --- /dev/null +++ b/evaluation/scienceagentbench/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.11-bookworm + + +# For OpenHands agents to explore the dataset directories, please download the full benchmark [here](https://buckeyemailosu-my.sharepoint.com/:u:/g/personal/chen_8336_buckeyemail_osu_edu/EQuA6uJ3CtRHvRfZ2GiN1tYBRVJE4DSUD10MW61fr7HuSQ?e=sCBegG) and unzip it with password `scienceagentbench`. +# **Please DO NOT redistribute the unzipped data files online.** +# It will download a benchmark.zip file to the current directory. +# unzip it and put the benchmark folder under evaluation/scienceagentbench/ + +RUN mkdir -p /benchmark +COPY benchmark /benchmark + +RUN mkdir -p /workspace +WORKDIR /workspace + +# pushd evaluation/scienceagentbench +# docker build -t xingyaoww/openhands-eval-scienceagentbench . +# popd diff --git a/evaluation/scienceagentbench/Dockerfile.evaluator b/evaluation/scienceagentbench/Dockerfile.evaluator new file mode 100644 index 000000000000..f8263e1bb0aa --- /dev/null +++ b/evaluation/scienceagentbench/Dockerfile.evaluator @@ -0,0 +1,25 @@ +FROM mambaorg/micromamba:debian12 + +USER root +# For https://github.com/OSU-NLP-Group/ScienceAgentBench/tree/main?tab=readme-ov-file#code-generation-with-agents + +RUN micromamba create -n sci-agent-eval python=3.10 pip setuptools wheel +RUN micromamba run -n sci-agent-eval pip install pip-tools + +RUN mkdir -p /workspace +WORKDIR /workspace + +RUN apt-get update && apt-get install -y git + +RUN git clone https://github.com/OSU-NLP-Group/ScienceAgentBench.git /workspace/ +RUN git checkout 4eddc7db6449a5ade3e37285747c8b208cd54ce7 + +RUN micromamba create -n sci-agent python=3.10 pip setuptools wheel +RUN micromamba run -n sci-agent pip install -r requirements.txt + +# Replace all occurence of conda with micromamba under the /workspace +RUN find ./ -type f -exec sed -i 's/conda/micromamba/g' {} \; + +# pushd evaluation/scienceagentbench +# docker build -t xingyaoww/openhands-eval-scienceagentbench-evaluator -f Dockerfile.evaluator . 
+# popd diff --git a/evaluation/scienceagentbench/README.md b/evaluation/scienceagentbench/README.md new file mode 100644 index 000000000000..3182c2e117be --- /dev/null +++ b/evaluation/scienceagentbench/README.md @@ -0,0 +1,54 @@ +# ScienceAgentBench Evaluation with OpenHands + +This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: https://arxiv.org/abs/2410.05080). + +## Setup Environment and LLM Configuration + +Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. + +## Setup ScienceAgentBench + +To prevent benchmark data contamination, we only provide the annotation sheet on [Huggingface](https://huggingface.co/datasets/osunlp/ScienceAgentBench), which includes all necessary *inputs* to run an agent. + +## Run Inference on ScienceAgentBench + +```bash +./evaluation/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] + +# Example +./evaluation/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3 +``` + +where `model_config` is mandatory, and the rest are optional. + +- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your +LLM settings, as defined in your `config.toml`. +- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would +like to evaluate. It could also be a release tag like `0.6.2`. +- `use_knowledge`, e.g. `true`, specifies whether allowing the agent to use expert-provided knowledge as additional input or not. By default, it is set to `false`. +- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting +to `CodeActAgent`. +- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By +default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note: +in order to use `eval_limit`, you must also set `agent`. +- `max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By +default, it is set to 30. +- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By +default, it is set to 1. + +## Evaluate Generated Programs + +### Extract Necessary Information from OpenHands Log + +After the inference is completed, you may use the following command to extract necessary information from the output log for evaluation: + +```bash +python post_proc.py [log_fname] +``` +- `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent. + +Output will be write to e.g. `evaluation/.../output.converted.jsonl` + +### Run evaluation + +Please follow the steps [here](https://github.com/OSU-NLP-Group/ScienceAgentBench/tree/main?tab=readme-ov-file#evaluation-of-generated-code) to evaluate the generated programs. 
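+
+For orientation, here is a minimal sketch of the full workflow, assuming the scripts added in this folder; the `evaluation/.../output.jsonl` path is a placeholder for the actual output directory of your run, and the final evaluation step itself runs in the upstream ScienceAgentBench repository:
+
+```bash
+# 1. Run inference (the model config is mandatory; the remaining arguments are optional).
+./evaluation/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o HEAD true CodeActAgent
+
+# 2. Convert the saved trajectory log into the simplified format used for evaluation.
+#    This writes output.converted.jsonl next to the input file.
+python evaluation/scienceagentbench/post_proc.py evaluation/.../output.jsonl
+
+# 3. Evaluate the resulting output.converted.jsonl with the official ScienceAgentBench
+#    evaluation harness, following the steps linked above.
+```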
diff --git a/evaluation/scienceagentbench/post_proc.py b/evaluation/scienceagentbench/post_proc.py new file mode 100644 index 000000000000..46cfbe2b2a7a --- /dev/null +++ b/evaluation/scienceagentbench/post_proc.py @@ -0,0 +1,30 @@ +import json +from argparse import ArgumentParser + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument( + 'log_fname', + type=str, + ) + args = parser.parse_args() + + fname = args.log_fname + out_fname = args.log_fname.replace('.jsonl', '.converted.jsonl') + + log = [json.loads(line) for line in open(fname)] + + simple_log = [ + json.dumps( + { + 'instance_id': ex['instance_id'], + 'instruction': ex['instruction'], + 'test_result': ex['test_result'], + 'cost': ex['metrics']['accumulated_cost'], + } + ) + for ex in log + ] + + with open(out_fname, 'w+', encoding='utf-8') as f: + f.write('\n'.join(simple_log)) diff --git a/evaluation/scienceagentbench/run_infer.py b/evaluation/scienceagentbench/run_infer.py new file mode 100644 index 000000000000..93a82855452e --- /dev/null +++ b/evaluation/scienceagentbench/run_infer.py @@ -0,0 +1,292 @@ +import asyncio +import os +from typing import Any + +import pandas as pd +from datasets import load_dataset +from tqdm import tqdm + +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + codeact_user_response, + compatibility_for_eval_history_pairs, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, + update_llm_config_for_completions_logging, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + SandboxConfig, + get_llm_config_arg, + get_parser, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { + 'CodeActAgent': codeact_user_response, +} + +LOCAL_DATASET_PATH = os.path.join(os.path.dirname(__file__), 'benchmark') + + +def format_task_dict(example, use_knowledge): + task = { + 'instance_id': example['instance_id'], + 'task_inst': example['task_inst'], + 'dataset_path': '/benchmark/datasets/' + + example['dataset_folder_tree'].split('\n')[0][4:], + 'dataset_folder_tree': example['dataset_folder_tree'], + 'dataset_preview': example['dataset_preview'], + 'pred_program_name': 'pred_' + example['gold_program_name'], + } + + if use_knowledge: + task['task_inst'] += '\n' + str(example['domain_knowledge']) + + return task + + +def get_config( + metadata: EvalMetadata, + instance_id: str, +) -> AppConfig: + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime=os.environ.get('RUNTIME', 'eventstream'), + max_budget_per_task=4, + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image='docker.io/xingyaoww/openhands-eval-scienceagentbench', + enable_auto_lint=True, + use_host_network=False, + timeout=300, + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_remote_runtime_alive=False, + ), + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + config.set_llm_config( + update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + instance_id, + ) 
+ ) + return config + + +def initialize_runtime( + runtime: Runtime, + instance: pd.Series, # this argument is not required +): + """Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. + """ + logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}") + obs: CmdOutputObservation + + # Set up workspace directories + action = CmdRunAction(command='mkdir -p /workspace/pred_programs') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + action = CmdRunAction(command='mkdir -p /workspace/pred_results') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + dataset_name = instance['dataset_folder_tree'].split('\n')[0][4:].rstrip('/') + + # Copy the dataset to the workspace + dataset_dir = os.path.join( + LOCAL_DATASET_PATH, + 'datasets', + dataset_name, + ) + runtime.copy_to(dataset_dir, '/workspace/benchmark/datasets', recursive=True) + + # Check the dataset exists + action = CmdRunAction( + command='cd /workspace/benchmark/datasets && ls', + keep_prompt=False, + ) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + assert dataset_name in obs.content + + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") + + +def complete_runtime( + runtime: Runtime, + instance: pd.Series, +) -> dict[str, Any]: + """Complete the runtime for the agent. + + This function is called before the runtime is used to run the agent. + If you need to do something in the sandbox to get the correctness metric after + the agent has run, modify this function. + """ + logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}") + obs: CmdOutputObservation + + test_result = {} + + action = CmdRunAction(command='cd /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + + assert obs.exit_code == 0 + + action = CmdRunAction( + command=f'cat pred_programs/{instance.pred_program_name}', + keep_prompt=False, + ) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + + if obs.exit_code == 0: + test_result = {'program': obs.content} + else: + test_result = {'program': 'ERROR'} + + logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}") + return test_result + + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + instance_id = instance.instance_id.replace('/', '__') + config = get_config(metadata, instance_id) + + # Set up the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, instance_id, log_dir) + else: + logger.info(f'Starting evaluation for instance {instance_id}.') + + instruction = f"""You are an expert Python programming assistant that helps scientist users to write high-quality code to solve their tasks. +Given a user request, you are expected to write a complete program that accomplishes the requested task and save any outputs to `/workspace/pred_results/` in the correct format. + +Here's the user request you need to work on: +{instance.task_inst} + +You can access the dataset at `{instance.dataset_path}`. 
Here is the directory structure of the dataset: +``` +{instance.dataset_folder_tree} +``` +Here are some helpful previews for the dataset file(s): +{instance.dataset_preview} + +Please save your program as `/workspace/pred_programs/{instance.pred_program_name}`. +Then, please run the program to check and fix any errors. +Please do NOT run the program in the background. +If the program uses some packages that are incompatible, please figure out alternative implementations and do NOT restart the environment. + +""" + + runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + initialize_runtime(runtime, instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( + metadata.agent_class + ), + ) + ) + + # ======= Attempt to evaluate the agent's edits ======= + test_result = complete_runtime(runtime, instance) + + # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) + # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation. + if state is None: + raise ValueError('State should not be None.') + metrics = state.metrics.get() if state.metrics else None + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + # remove when it becomes unnecessary + histories = compatibility_for_eval_history_pairs(state.history) + + # Save the output + output = EvalOutput( + instance_id=instance.instance_id, + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + + +if __name__ == '__main__': + parser = get_parser() + parser.add_argument( + '--use_knowledge', + type=str, + default='false', + choices=['true', 'false'], + help='use expert-provided knowledge or not', + ) + args, _ = parser.parse_known_args() + + sab_dataset = load_dataset('osunlp/ScienceAgentBench', split='validation') + + dataset_processed = [] + for example in tqdm(sab_dataset): + dataset_processed.append( + format_task_dict(example, args.use_knowledge == 'true') + ) + + dataset = pd.DataFrame(dataset_processed) + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + metadata = make_metadata( + llm_config, + 'ScienceAgentBench', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + dataset['instance_id'] = dataset['instance_id'].apply(str) + instances = prepare_dataset(dataset, output_file, args.eval_n_limit) + + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance + ) diff --git a/evaluation/scienceagentbench/scripts/run_infer.sh b/evaluation/scienceagentbench/scripts/run_infer.sh new file mode 100755 index 000000000000..7667e5723789 --- /dev/null +++ b/evaluation/scienceagentbench/scripts/run_infer.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -eo pipefail + +source "evaluation/utils/version_control.sh" + 
+MODEL_CONFIG=$1 +COMMIT_HASH=$2 +USE_KNOWLEDGE=$3 +AGENT=$4 +EVAL_LIMIT=$5 +NUM_WORKERS=$6 + +if [ -z "$NUM_WORKERS" ]; then + NUM_WORKERS=1 + echo "Number of workers not specified, use default $NUM_WORKERS" +fi +checkout_eval_branch + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi + +if [ -z "$USE_KNOWLEDGE" ]; then + echo "Use knowledge not specified, use default False" + USE_KNOWLEDGE=false +fi + +get_agent_version + +echo "AGENT: $AGENT" +echo "AGENT_VERSION: $AGENT_VERSION" +echo "MODEL_CONFIG: $MODEL_CONFIG" + +COMMAND="poetry run python evaluation/scienceagentbench/run_infer.py \ + --agent-cls $AGENT \ + --llm-config $MODEL_CONFIG \ + --use_knowledge $USE_KNOWLEDGE \ + --max-iterations 30 \ + --eval-num-workers $NUM_WORKERS \ + --eval-note $AGENT_VERSION" \ + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Run the command +eval $COMMAND diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 7578e7e9562b..2cc0dfd7d9a6 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -20,6 +20,7 @@ prepare_dataset, reset_logger_for_multiprocessing, run_evaluation, + update_llm_config_for_completions_logging, ) from openhands.controller.state.state import State from openhands.core.config import ( @@ -40,6 +41,7 @@ USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true' USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true' +RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true' AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, @@ -89,6 +91,13 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata): '5. Think about edgecases and make sure your fix handles them as well\n' "Your thinking should be thorough and so it's fine if it's very long.\n" ) + + if RUN_WITH_BROWSING: + instruction += ( + '\n' + 'You SHOULD NEVER attempt to browse the web. 
' + '\n' + ) return instruction @@ -143,18 +152,14 @@ def get_config( workspace_base=None, workspace_mount_path=None, ) - if metadata.llm_config.log_completions: - metadata.llm_config.log_completions_folder = os.path.join( - metadata.eval_output_dir, 'llm_completions', instance['instance_id'] + config.set_llm_config( + update_llm_config_for_completions_logging( + metadata.llm_config, metadata.eval_output_dir, instance['instance_id'] ) - logger.info( - f'Logging LLM completions for instance {instance["instance_id"]} to ' - f'{metadata.llm_config.log_completions_folder}' - ) - config.set_llm_config(metadata.llm_config) + ) agent_config = AgentConfig( codeact_enable_jupyter=False, - codeact_enable_browsing_delegate=False, + codeact_enable_browsing=RUN_WITH_BROWSING, codeact_enable_llm_editor=False, ) config.set_agent_config(agent_config) @@ -439,7 +444,8 @@ def process_instance( if state is None: raise ValueError('State should not be None.') - histories = [event_to_dict(event) for event in state.history.get_events()] + # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events + histories = [event_to_dict(event) for event in state.history] metrics = state.metrics.get() if state.metrics else None # Save the output diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/swe_bench/scripts/run_infer.sh index 54bcbbbc3391..520003635a4e 100755 --- a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/swe_bench/scripts/run_infer.sh @@ -34,6 +34,11 @@ if [ -z "$USE_INSTANCE_IMAGE" ]; then USE_INSTANCE_IMAGE=true fi +if [ -z "$RUN_WITH_BROWSING" ]; then + echo "RUN_WITH_BROWSING not specified, use default false" + RUN_WITH_BROWSING=false +fi + if [ -z "$DATASET" ]; then echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite" @@ -47,6 +52,8 @@ fi export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE" +export RUN_WITH_BROWSING=$RUN_WITH_BROWSING +echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING" get_agent_version @@ -67,6 +74,10 @@ if [ "$USE_HINT_TEXT" = false ]; then EVAL_NOTE="$EVAL_NOTE-no-hint" fi +if [ "$RUN_WITH_BROWSING" = true ]; then + EVAL_NOTE="$EVAL_NOTE-with-browsing" +fi + if [ -n "$EXP_NAME" ]; then EVAL_NOTE="$EVAL_NOTE-$EXP_NAME" fi diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py index 5c2c53422785..25633ce6ce23 100644 --- a/evaluation/toolqa/run_infer.py +++ b/evaluation/toolqa/run_infer.py @@ -9,6 +9,7 @@ EvalMetadata, EvalOutput, codeact_user_response, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -126,7 +127,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = raise ValueError('State should not be None.') # retrieve the last message from the agent - model_answer_raw = state.history.get_last_agent_message() + model_answer_raw = state.get_last_agent_message() # attempt to parse model_answer correct = eval_answer(str(model_answer_raw), str(answer)) @@ -137,7 +138,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git 
a/evaluation/utils/shared.py b/evaluation/utils/shared.py index b8d2ad281ad6..d5a6d6d89de8 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -18,6 +18,9 @@ from openhands.core.logger import openhands_logger as logger from openhands.events.action import Action from openhands.events.action.message import MessageAction +from openhands.events.event import Event +from openhands.events.serialization.event import event_to_dict +from openhands.events.utils import get_pairs_from_events class EvalMetadata(BaseModel): @@ -112,7 +115,14 @@ def codeact_user_response( if state.history: # check if the last action has an answer, if so, early exit if try_parse is not None: - last_action = state.history.get_last_action() + last_action = next( + ( + event + for event in reversed(state.history) + if isinstance(event, Action) + ), + None, + ) ans = try_parse(last_action) if ans is not None: return '/exit' @@ -120,7 +130,7 @@ def codeact_user_response( # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up user_msgs = [ event - for event in state.history.get_events() + for event in state.history if isinstance(event, MessageAction) and event.source == 'user' ] if len(user_msgs) >= 2: @@ -411,3 +421,35 @@ def reset_logger_for_multiprocessing( ) file_handler.setLevel(logging.INFO) logger.addHandler(file_handler) + + +def update_llm_config_for_completions_logging( + llm_config: LLMConfig, + eval_output_dir: str, + instance_id: str, +) -> LLMConfig: + """Update the LLM config for logging completions.""" + if llm_config.log_completions: + llm_config.log_completions_folder = os.path.join( + eval_output_dir, 'llm_completions', instance_id + ) + logger.info( + f'Logging LLM completions for instance {instance_id} to ' + f'{llm_config.log_completions_folder}' + ) + return llm_config + + +# history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation) +# we rebuild the pairs here +# for compatibility with the existing output format in evaluations +# remove this when it's no longer necessary +def compatibility_for_eval_history_pairs( + history: list[Event], +) -> list[tuple[dict, dict]]: + history_pairs = [] + + for action, observation in get_pairs_from_events(history): + history_pairs.append((event_to_dict(action), event_to_dict(observation))) + + return history_pairs diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py index cfc2bdae493a..531f134fd988 100644 --- a/evaluation/webarena/run_infer.py +++ b/evaluation/webarena/run_infer.py @@ -10,6 +10,7 @@ from evaluation.utils.shared import ( EvalMetadata, EvalOutput, + compatibility_for_eval_history_pairs, make_metadata, prepare_dataset, reset_logger_for_multiprocessing, @@ -166,7 +167,7 @@ def process_instance( # Instruction is the first message from the USER instruction = '' - for event in state.history.get_events(): + for event in state.history: if isinstance(event, MessageAction): instruction = event.content break @@ -178,7 +179,7 @@ def process_instance( # history is now available as a stream of events, rather than list of pairs of (Action, Observation) # for compatibility with the existing output format, we can remake the pairs here # remove when it becomes unnecessary - histories = state.history.compatibility_for_eval_history_pairs() + histories = compatibility_for_eval_history_pairs(state.history) # Save the output output = EvalOutput( diff --git a/frontend/.eslintrc b/frontend/.eslintrc index c0b7a8c9be2c..d5cb543bd728 
100644 --- a/frontend/.eslintrc +++ b/frontend/.eslintrc @@ -84,4 +84,4 @@ } } ] -} \ No newline at end of file +} diff --git a/frontend/.gitignore b/frontend/.gitignore index 44029f58cb4f..d62bd04c1a2f 100644 --- a/frontend/.gitignore +++ b/frontend/.gitignore @@ -1,4 +1,9 @@ # i18n translation files make by script using `make build` public/locales/**/* src/i18n/declaration.ts -.env \ No newline at end of file +.env +node_modules/ +/test-results/ +/playwright-report/ +/blob-report/ +/playwright/.cache/ diff --git a/frontend/__tests__/components/chat/chat-interface.test.tsx b/frontend/__tests__/components/chat/chat-interface.test.tsx index d27897c2d8e3..501389f9897a 100644 --- a/frontend/__tests__/components/chat/chat-interface.test.tsx +++ b/frontend/__tests__/components/chat/chat-interface.test.tsx @@ -128,14 +128,14 @@ describe.skip("ChatInterface", () => { timestamp: new Date().toISOString(), }, { - error: "Woops!", + error: true, + id: "", message: "Something went wrong", }, ]; renderChatInterface(messages); const error = screen.getByTestId("error-message"); - expect(within(error).getByText("Woops!")).toBeInTheDocument(); expect(within(error).getByText("Something went wrong")).toBeInTheDocument(); }); diff --git a/frontend/__tests__/components/file-explorer/FileExplorer.test.tsx b/frontend/__tests__/components/file-explorer/FileExplorer.test.tsx index b1faa3c18bf4..a1c0717783e9 100644 --- a/frontend/__tests__/components/file-explorer/FileExplorer.test.tsx +++ b/frontend/__tests__/components/file-explorer/FileExplorer.test.tsx @@ -16,13 +16,16 @@ vi.mock("../../services/fileService", async () => ({ })); const renderFileExplorerWithRunningAgentState = () => - renderWithProviders(, { - preloadedState: { - agent: { - curAgentState: AgentState.RUNNING, + renderWithProviders( + {}} />, + { + preloadedState: { + agent: { + curAgentState: AgentState.RUNNING, + }, }, }, - }); + ); describe.skip("FileExplorer", () => { afterEach(() => { diff --git a/frontend/__tests__/utils/extractModelAndProvider.test.ts b/frontend/__tests__/utils/extractModelAndProvider.test.ts index 6ea84db241db..c1ec4ee838ec 100644 --- a/frontend/__tests__/utils/extractModelAndProvider.test.ts +++ b/frontend/__tests__/utils/extractModelAndProvider.test.ts @@ -78,4 +78,3 @@ describe("extractModelAndProvider", () => { }); }); }); - diff --git a/frontend/__tests__/utils/organizeModelsAndProviders.test.ts b/frontend/__tests__/utils/organizeModelsAndProviders.test.ts index 1062309dbf68..aa3c84707432 100644 --- a/frontend/__tests__/utils/organizeModelsAndProviders.test.ts +++ b/frontend/__tests__/utils/organizeModelsAndProviders.test.ts @@ -63,4 +63,3 @@ test("organizeModelsAndProviders", () => { }, }); }); - diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 5a1f634dfb44..e7cce105b9a1 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1,12 +1,12 @@ { "name": "openhands-frontend", - "version": "0.12.0", + "version": "0.12.3", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "openhands-frontend", - "version": "0.12.0", + "version": "0.12.3", "dependencies": { "@monaco-editor/react": "^4.6.0", "@nextui-org/react": "^2.4.8", @@ -26,6 +26,7 @@ "isbot": "^5.1.17", "jose": "^5.9.4", "monaco-editor": "^0.52.0", + "posthog-js": "^1.176.0", "react": "^18.3.1", "react-dom": "^18.3.1", "react-highlight": "^0.15.0", @@ -45,6 +46,7 @@ "ws": "^8.18.0" }, "devDependencies": { + "@playwright/test": "^1.48.2", "@remix-run/dev": "^2.11.2", "@remix-run/testing": "^2.11.2", 
"@tailwindcss/typography": "^0.5.15", @@ -3378,6 +3380,21 @@ "url": "https://opencollective.com/unts" } }, + "node_modules/@playwright/test": { + "version": "1.48.2", + "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.48.2.tgz", + "integrity": "sha512-54w1xCWfXuax7dz4W2M9uw0gDyh+ti/0K/MxcCUxChFh37kkdxPdfZDw5QBbuPUJHr1CiHJ1hXgSs+GgeQc5Zw==", + "dev": true, + "dependencies": { + "playwright": "1.48.2" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/@polka/url": { "version": "1.0.0-next.28", "resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.28.tgz", @@ -7864,6 +7881,16 @@ "node": ">=6.6.0" } }, + "node_modules/core-js": { + "version": "3.38.1", + "resolved": "https://registry.npmjs.org/core-js/-/core-js-3.38.1.tgz", + "integrity": "sha512-OP35aUorbU3Zvlx7pjsFdu1rGNnD4pgw/CWoYzRY3t2EzoVT7shKHY1dlAy3f41cGIO7ZDPQimhGFTlEYkG/Hw==", + "hasInstallScript": true, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/core-js" + } + }, "node_modules/core-util-is": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", @@ -9666,6 +9693,11 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/fflate": { + "version": "0.4.8", + "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.4.8.tgz", + "integrity": "sha512-FJqqoDBR00Mdj9ppamLa/Y7vxm+PRmNWA67N846RvsoYVMKB4q3y/de5PA7gUmRMYK/8CMz2GDZQmCRN1wBcWA==" + }, "node_modules/file-entry-cache": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz", @@ -19406,6 +19438,50 @@ "pathe": "^1.1.2" } }, + "node_modules/playwright": { + "version": "1.48.2", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.48.2.tgz", + "integrity": "sha512-NjYvYgp4BPmiwfe31j4gHLa3J7bD2WiBz8Lk2RoSsmX38SVIARZ18VYjxLjAcDsAhA+F4iSEXTSGgjua0rrlgQ==", + "dev": true, + "dependencies": { + "playwright-core": "1.48.2" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.48.2", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.48.2.tgz", + "integrity": "sha512-sjjw+qrLFlriJo64du+EK0kJgZzoQPsabGF4lBvsid+3CNIZIYLgnMj9V6JY5VhM2Peh20DJWIVpVljLLnlawA==", + "dev": true, + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/playwright/node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/possible-typed-array-names": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.0.0.tgz", @@ -19653,6 +19729,31 @@ "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz", "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==" }, + "node_modules/posthog-js": { + "version": "1.176.0", + "resolved": "https://registry.npmjs.org/posthog-js/-/posthog-js-1.176.0.tgz", + "integrity": 
"sha512-T5XKNtRzp7q6CGb7Vc7wAI76rWap9fiuDUPxPsyPBPDkreKya91x9RIsSapAVFafwD1AEin1QMczCmt9Le9BWw==", + "dependencies": { + "core-js": "^3.38.1", + "fflate": "^0.4.8", + "preact": "^10.19.3", + "web-vitals": "^4.2.0" + } + }, + "node_modules/posthog-js/node_modules/web-vitals": { + "version": "4.2.4", + "resolved": "https://registry.npmjs.org/web-vitals/-/web-vitals-4.2.4.tgz", + "integrity": "sha512-r4DIlprAGwJ7YM11VZp4R884m0Vmgr6EAKe3P+kO0PPj3Unqyvv59rczf6UiGcb9Z8QxZVcqKNwv/g0WNdWwsw==" + }, + "node_modules/preact": { + "version": "10.24.3", + "resolved": "https://registry.npmjs.org/preact/-/preact-10.24.3.tgz", + "integrity": "sha512-Z2dPnBnMUfyQfSQ+GBdsGa16hz35YmLmtTLhM169uW944hYL6xzTYkJjC07j+Wosz733pMWx0fgON3JNw1jJQA==", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/preact" + } + }, "node_modules/prelude-ls": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", diff --git a/frontend/package.json b/frontend/package.json index 819c024f0429..3825ad4ae251 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "openhands-frontend", - "version": "0.12.0", + "version": "0.12.3", "private": true, "type": "module", "engines": { @@ -25,6 +25,7 @@ "isbot": "^5.1.17", "jose": "^5.9.4", "monaco-editor": "^0.52.0", + "posthog-js": "^1.176.0", "react": "^18.3.1", "react-dom": "^18.3.1", "react-highlight": "^0.15.0", @@ -49,6 +50,7 @@ "build": "npm run make-i18n && tsc && remix vite:build", "start": "npx sirv-cli build/ --single", "test": "vitest run", + "test:e2e": "playwright test", "test:coverage": "npm run make-i18n && vitest run --coverage", "dev_wsl": "VITE_WATCH_USE_POLLING=true vite", "preview": "vite preview", @@ -70,6 +72,7 @@ ] }, "devDependencies": { + "@playwright/test": "^1.48.2", "@remix-run/dev": "^2.11.2", "@remix-run/testing": "^2.11.2", "@tailwindcss/typography": "^0.5.15", diff --git a/frontend/playwright.config.ts b/frontend/playwright.config.ts new file mode 100644 index 000000000000..53a48004433d --- /dev/null +++ b/frontend/playwright.config.ts @@ -0,0 +1,79 @@ +import { defineConfig, devices } from "@playwright/test"; + +/** + * Read environment variables from file. + * https://github.com/motdotla/dotenv + */ +// import dotenv from 'dotenv'; +// import path from 'path'; +// dotenv.config({ path: path.resolve(__dirname, '.env') }); + +/** + * See https://playwright.dev/docs/test-configuration. + */ +export default defineConfig({ + testDir: "./tests", + /* Run tests in files in parallel */ + fullyParallel: true, + /* Fail the build on CI if you accidentally left test.only in the source code. */ + forbidOnly: !!process.env.CI, + /* Retry on CI only */ + retries: process.env.CI ? 2 : 0, + /* Opt out of parallel tests on CI. */ + workers: process.env.CI ? 1 : undefined, + /* Reporter to use. See https://playwright.dev/docs/test-reporters */ + reporter: "html", + /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */ + use: { + /* Base URL to use in actions like `await page.goto('/')`. */ + baseURL: "http://127.0.0.1:3000", + + /* Collect trace when retrying the failed test. 
See https://playwright.dev/docs/trace-viewer */ + trace: "on-first-retry", + }, + + /* Configure projects for major browsers */ + projects: [ + { + name: "chromium", + use: { ...devices["Desktop Chrome"] }, + }, + + { + name: "firefox", + use: { ...devices["Desktop Firefox"] }, + }, + + { + name: "webkit", + use: { ...devices["Desktop Safari"] }, + }, + + /* Test against mobile viewports. */ + // { + // name: 'Mobile Chrome', + // use: { ...devices['Pixel 5'] }, + // }, + // { + // name: 'Mobile Safari', + // use: { ...devices['iPhone 12'] }, + // }, + + /* Test against branded browsers. */ + // { + // name: 'Microsoft Edge', + // use: { ...devices['Desktop Edge'], channel: 'msedge' }, + // }, + // { + // name: 'Google Chrome', + // use: { ...devices['Desktop Chrome'], channel: 'chrome' }, + // }, + ], + + /* Run your local dev server before starting the tests */ + webServer: { + command: "npm run dev:mock -- --port 3000", + url: "http://127.0.0.1:3000", + reuseExistingServer: !process.env.CI, + }, +}); diff --git a/frontend/src/api/open-hands.ts b/frontend/src/api/open-hands.ts index 0ef84c0278c2..6981848c7b4d 100644 --- a/frontend/src/api/open-hands.ts +++ b/frontend/src/api/open-hands.ts @@ -1,4 +1,4 @@ -import { getValidFallbackHost } from "#/utils/get-valid-fallback-host"; +import { request } from "#/services/api"; import { SaveFileSuccessResponse, FileUploadSuccessResponse, @@ -9,36 +9,13 @@ import { GetConfigResponse, } from "./open-hands.types"; -/** - * Generate the base URL of the OpenHands API - * @returns Base URL of the OpenHands API - */ -const generateBaseURL = () => { - const fallback = getValidFallbackHost(); - const baseUrl = import.meta.env.VITE_BACKEND_BASE_URL || fallback; - - if (typeof window === "undefined") { - return `http://${baseUrl}`; - } - return `${window.location.protocol}//${baseUrl}`; -}; - -/** - * Class to interact with the OpenHands API - */ class OpenHands { - /** - * Base URL of the OpenHands API - */ - static BASE_URL = generateBaseURL(); - /** * Retrieve the list of models available * @returns List of models available */ static async getModels(): Promise { - const response = await fetch(`${OpenHands.BASE_URL}/api/options/models`); - return response.json(); + return request("/api/options/models"); } /** @@ -46,8 +23,7 @@ class OpenHands { * @returns List of agents available */ static async getAgents(): Promise { - const response = await fetch(`${OpenHands.BASE_URL}/api/options/agents`); - return response.json(); + return request(`/api/options/agents`); } /** @@ -55,178 +31,123 @@ class OpenHands { * @returns List of security analyzers available */ static async getSecurityAnalyzers(): Promise { - const response = await fetch( - `${OpenHands.BASE_URL}/api/options/security-analyzers`, - ); - return response.json(); + return request(`/api/options/security-analyzers`); } static async getConfig(): Promise { - const response = await fetch("config.json", { - headers: { - "Cache-Control": "no-cache", - }, - }); - return response.json(); + return request("/config.json"); } /** * Retrieve the list of files available in the workspace - * @param token User token provided by the server * @param path Path to list files from * @returns List of files available in the given path. 
If path is not provided, it lists all the files in the workspace */ - static async getFiles(token: string, path?: string): Promise { - const url = new URL(`${OpenHands.BASE_URL}/api/list-files`); - if (path) url.searchParams.append("path", path); - - const response = await fetch(url.toString(), { - headers: { - Authorization: `Bearer ${token}`, - }, - }); - - return response.json(); + static async getFiles(path?: string): Promise { + let url = "/api/list-files"; + if (path) url += `?path=${encodeURIComponent(path)}`; + return request(url); } /** * Retrieve the content of a file - * @param token User token provided by the server * @param path Full path of the file to retrieve * @returns Content of the file */ - static async getFile(token: string, path: string): Promise { - const url = new URL(`${OpenHands.BASE_URL}/api/select-file`); - url.searchParams.append("file", path); - const response = await fetch(url.toString(), { - headers: { - Authorization: `Bearer ${token}`, - }, - }); - - const data = await response.json(); + static async getFile(path: string): Promise { + const url = `/api/select-file?file=${encodeURIComponent(path)}`; + const data = await request(url); return data.code; } /** * Save the content of a file - * @param token User token provided by the server * @param path Full path of the file to save * @param content Content to save in the file * @returns Success message or error message */ static async saveFile( - token: string, path: string, content: string, ): Promise { - const response = await fetch(`${OpenHands.BASE_URL}/api/save-file`, { + return request(`/api/save-file`, { method: "POST", body: JSON.stringify({ filePath: path, content }), headers: { - Authorization: `Bearer ${token}`, "Content-Type": "application/json", }, }); - - return response.json(); } /** * Upload a file to the workspace - * @param token User token provided by the server * @param file File to upload * @returns Success message or error message */ static async uploadFiles( - token: string, file: File[], ): Promise { const formData = new FormData(); file.forEach((f) => formData.append("files", f)); - const response = await fetch(`${OpenHands.BASE_URL}/api/upload-files`, { + return request(`/api/upload-files`, { method: "POST", body: formData, - headers: { - Authorization: `Bearer ${token}`, - }, }); - - return response.json(); } /** * Get the blob of the workspace zip - * @param token User token provided by the server * @returns Blob of the workspace zip */ - static async getWorkspaceZip(token: string): Promise { - const response = await fetch(`${OpenHands.BASE_URL}/api/zip-directory`, { - headers: { - Authorization: `Bearer ${token}`, - }, - }); - + static async getWorkspaceZip(): Promise { + const response = await request(`/api/zip-directory`, {}, false, true); return response.blob(); } /** * Send feedback to the server - * @param token User token provided by the server * @param data Feedback data * @returns The stored feedback data */ - static async sendFeedback( - token: string, - data: Feedback, - ): Promise { - const response = await fetch(`${OpenHands.BASE_URL}/api/submit-feedback`, { + static async submitFeedback(data: Feedback): Promise { + return request(`/api/submit-feedback`, { method: "POST", body: JSON.stringify(data), headers: { - Authorization: `Bearer ${token}`, "Content-Type": "application/json", }, }); - - return response.json(); } /** - * Get the GitHub access token * @param code Code provided by GitHub * @returns GitHub access token */ static async getGitHubAccessToken( code: 
string, ): Promise { - const response = await fetch(`${OpenHands.BASE_URL}/api/github/callback`, { + return request(`/api/github/callback`, { method: "POST", body: JSON.stringify({ code }), headers: { "Content-Type": "application/json", }, }); - - return response.json(); } /** - * Check if the user is authenticated - * @param login The user's GitHub login handle - * @returns Whether the user is authenticated + * Authenticate with GitHub token + * @returns Response with authentication status and user info if successful */ - static async isAuthenticated(login: string): Promise { - const response = await fetch(`${OpenHands.BASE_URL}/api/authenticate`, { - method: "POST", - body: JSON.stringify({ login }), - headers: { - "Content-Type": "application/json", + static async authenticate(): Promise { + return request( + `/api/authenticate`, + { + method: "POST", }, - }); - - return response.status === 200; + true, + ); } } diff --git a/frontend/src/api/open-hands.types.ts b/frontend/src/api/open-hands.types.ts index 9da1a339b4d2..a562267363d4 100644 --- a/frontend/src/api/open-hands.types.ts +++ b/frontend/src/api/open-hands.types.ts @@ -27,6 +27,11 @@ export interface GitHubAccessTokenResponse { access_token: string; } +export interface AuthenticationResponse { + message: string; + login?: string; // Only present when allow list is enabled +} + export interface Feedback { version: string; email: string; diff --git a/frontend/src/assets/arrow-send.svg b/frontend/src/assets/arrow-send.svg index b353657406d0..a42795073c91 100644 --- a/frontend/src/assets/arrow-send.svg +++ b/frontend/src/assets/arrow-send.svg @@ -2,4 +2,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/branding/all-hands-logo-spark.svg b/frontend/src/assets/branding/all-hands-logo-spark.svg index 439dff9778cb..bb4070944af8 100644 --- a/frontend/src/assets/branding/all-hands-logo-spark.svg +++ b/frontend/src/assets/branding/all-hands-logo-spark.svg @@ -32,4 +32,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/branding/github-logo.svg b/frontend/src/assets/branding/github-logo.svg index 975e5fa3ca12..fcf918efacfe 100644 --- a/frontend/src/assets/branding/github-logo.svg +++ b/frontend/src/assets/branding/github-logo.svg @@ -2,4 +2,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/clip.svg b/frontend/src/assets/clip.svg index aaebcbc4dfbe..26a6acb485a9 100644 --- a/frontend/src/assets/clip.svg +++ b/frontend/src/assets/clip.svg @@ -2,4 +2,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/clipboard.svg b/frontend/src/assets/clipboard.svg index 6da359d3806c..abf4e5a3155a 100644 --- a/frontend/src/assets/clipboard.svg +++ b/frontend/src/assets/clipboard.svg @@ -32,4 +32,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/default-user.svg b/frontend/src/assets/default-user.svg index b67e64cf786a..620ab2f3f9c6 100644 --- a/frontend/src/assets/default-user.svg +++ b/frontend/src/assets/default-user.svg @@ -2,4 +2,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/docs.svg b/frontend/src/assets/docs.svg index 23475a3da1c5..33eb3a374c61 100644 --- a/frontend/src/assets/docs.svg +++ b/frontend/src/assets/docs.svg @@ -24,4 +24,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/external-link.svg b/frontend/src/assets/external-link.svg index 96294735b2bf..e55411671193 100644 --- a/frontend/src/assets/external-link.svg +++ b/frontend/src/assets/external-link.svg @@ -4,4 +4,4 @@ - \ No newline at end of file + diff 
--git a/frontend/src/assets/lightbulb.svg b/frontend/src/assets/lightbulb.svg index 9f61d275efc1..aa703e60abbc 100644 --- a/frontend/src/assets/lightbulb.svg +++ b/frontend/src/assets/lightbulb.svg @@ -2,4 +2,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/loading-outer.svg b/frontend/src/assets/loading-outer.svg index da669d7da4cb..aebe42c8e528 100644 --- a/frontend/src/assets/loading-outer.svg +++ b/frontend/src/assets/loading-outer.svg @@ -1,4 +1,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/message.svg b/frontend/src/assets/message.svg index cf1e6fc409de..3bf0a6e3e29b 100644 --- a/frontend/src/assets/message.svg +++ b/frontend/src/assets/message.svg @@ -2,4 +2,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/new-project.svg b/frontend/src/assets/new-project.svg index e7573f625a4f..550d656e353f 100644 --- a/frontend/src/assets/new-project.svg +++ b/frontend/src/assets/new-project.svg @@ -3,4 +3,4 @@ - \ No newline at end of file + diff --git a/frontend/src/assets/refresh.svg b/frontend/src/assets/refresh.svg index ae9e2bac3841..4bba1daa1adb 100644 --- a/frontend/src/assets/refresh.svg +++ b/frontend/src/assets/refresh.svg @@ -2,4 +2,4 @@ - \ No newline at end of file + diff --git a/frontend/src/components/AgentStatusBar.tsx b/frontend/src/components/AgentStatusBar.tsx index c337a838f32e..7de9ae0397e8 100644 --- a/frontend/src/components/AgentStatusBar.tsx +++ b/frontend/src/components/AgentStatusBar.tsx @@ -1,6 +1,7 @@ import React, { useEffect } from "react"; import { useTranslation } from "react-i18next"; import { useSelector } from "react-redux"; +import toast from "react-hot-toast"; import { I18nKey } from "#/i18n/declaration"; import { RootState } from "#/store"; import AgentState from "#/types/AgentState"; @@ -16,7 +17,7 @@ enum IndicatorColor { } function AgentStatusBar() { - const { t } = useTranslation(); + const { t, i18n } = useTranslation(); const { curAgentState } = useSelector((state: RootState) => state.agent); const { curStatusMessage } = useSelector((state: RootState) => state.status); @@ -94,15 +95,27 @@ function AgentStatusBar() { const [statusMessage, setStatusMessage] = React.useState(""); React.useEffect(() => { - if (curAgentState === AgentState.LOADING) { - const trimmedCustomMessage = curStatusMessage.status.trim(); - if (trimmedCustomMessage) { - setStatusMessage(t(trimmedCustomMessage)); - return; + let message = curStatusMessage.message || ""; + if (curStatusMessage?.id) { + const id = curStatusMessage.id.trim(); + if (i18n.exists(id)) { + message = t(curStatusMessage.id.trim()) || message; } } + if (curStatusMessage?.type === "error") { + toast.error(message); + return; + } + if (curAgentState === AgentState.LOADING && message.trim()) { + setStatusMessage(message); + } else { + setStatusMessage(AgentStatusMap[curAgentState].message); + } + }, [curStatusMessage.id]); + + React.useEffect(() => { setStatusMessage(AgentStatusMap[curAgentState].message); - }, [curAgentState, curStatusMessage.status]); + }, [curAgentState]); return (
diff --git a/frontend/src/components/analytics-consent-form-modal.tsx b/frontend/src/components/analytics-consent-form-modal.tsx new file mode 100644 index 000000000000..e122b9e8a9bf --- /dev/null +++ b/frontend/src/components/analytics-consent-form-modal.tsx @@ -0,0 +1,42 @@ +import { useFetcher } from "@remix-run/react"; +import { ModalBackdrop } from "./modals/modal-backdrop"; +import ModalBody from "./modals/ModalBody"; +import ModalButton from "./buttons/ModalButton"; +import { + BaseModalTitle, + BaseModalDescription, +} from "./modals/confirmation-modals/BaseModal"; + +export function AnalyticsConsentFormModal() { + const fetcher = useFetcher({ key: "set-consent" }); + + return ( + + + + + + We use tools to understand how our application is used to improve + your experience. You can enable or disable analytics. Your + preferences will be stored and can be updated anytime. + + + + + + + + + ); +} diff --git a/frontend/src/components/buttons/ModalButton.tsx b/frontend/src/components/buttons/ModalButton.tsx index 29ec2ae5c566..e011fef76075 100644 --- a/frontend/src/components/buttons/ModalButton.tsx +++ b/frontend/src/components/buttons/ModalButton.tsx @@ -2,6 +2,7 @@ import clsx from "clsx"; import React from "react"; interface ModalButtonProps { + testId?: string; variant?: "default" | "text-like"; onClick?: () => void; text: string; @@ -13,6 +14,7 @@ interface ModalButtonProps { } function ModalButton({ + testId, variant = "default", onClick, text, @@ -24,6 +26,7 @@ function ModalButton({ }: ModalButtonProps) { return ( + )} + {showDetails &&

{details}

}
); diff --git a/frontend/src/components/feedback-form.tsx b/frontend/src/components/feedback-form.tsx index 4e1ddde63548..078e4b0ccca6 100644 --- a/frontend/src/components/feedback-form.tsx +++ b/frontend/src/components/feedback-form.tsx @@ -1,8 +1,8 @@ import React from "react"; import hotToast from "react-hot-toast"; import ModalButton from "./buttons/ModalButton"; -import { request } from "#/services/api"; import { Feedback } from "#/api/open-hands.types"; +import OpenHands from "#/api/open-hands"; const FEEDBACK_VERSION = "1.0"; const VIEWER_PAGE = "https://www.all-hands.dev/share"; @@ -71,13 +71,7 @@ export function FeedbackForm({ onClose, polarity }: FeedbackFormProps) { token: "", }; - const response = await request("/api/submit-feedback", { - method: "POST", - body: JSON.stringify(feedback), - headers: { - "Content-Type": "application/json", - }, - }); + const response = await OpenHands.submitFeedback(feedback); const { message, feedback_id, password } = response.body; // eslint-disable-line const link = `${VIEWER_PAGE}?share_id=${feedback_id}`; shareFeedbackToast(message, link, password); diff --git a/frontend/src/components/file-explorer/FileExplorer.tsx b/frontend/src/components/file-explorer/FileExplorer.tsx index c6e2c249feff..8db4460b1ae6 100644 --- a/frontend/src/components/file-explorer/FileExplorer.tsx +++ b/frontend/src/components/file-explorer/FileExplorer.tsx @@ -91,14 +91,15 @@ function ExplorerActions({ } interface FileExplorerProps { + isOpen: boolean; + onToggle: () => void; error: string | null; } -function FileExplorer({ error }: FileExplorerProps) { +function FileExplorer({ error, isOpen, onToggle }: FileExplorerProps) { const { revalidate } = useRevalidator(); const { paths, setPaths } = useFiles(); - const [isHidden, setIsHidden] = React.useState(false); const [isDragging, setIsDragging] = React.useState(false); const { curAgentState } = useSelector((state: RootState) => state.agent); @@ -117,52 +118,47 @@ function FileExplorer({ error }: FileExplorerProps) { return; } dispatch(setRefreshID(Math.random())); - // TODO: Get token from data loader - const token = localStorage.getItem("token"); - if (token) OpenHands.getFiles(token).then(setPaths); + OpenHands.getFiles().then(setPaths); revalidate(); }; const uploadFileData = async (files: FileList) => { try { - const token = localStorage.getItem("token"); - if (token) { - const result = await OpenHands.uploadFiles(token, Array.from(files)); + const result = await OpenHands.uploadFiles(Array.from(files)); - if (isOpenHandsErrorResponse(result)) { - // Handle error response - toast.error( - `upload-error-${new Date().getTime()}`, - result.error || t(I18nKey.EXPLORER$UPLOAD_ERROR_MESSAGE), - ); - return; - } + if (isOpenHandsErrorResponse(result)) { + // Handle error response + toast.error( + `upload-error-${new Date().getTime()}`, + result.error || t(I18nKey.EXPLORER$UPLOAD_ERROR_MESSAGE), + ); + return; + } - const uploadedCount = result.uploaded_files.length; - const skippedCount = result.skipped_files.length; + const uploadedCount = result.uploaded_files.length; + const skippedCount = result.skipped_files.length; - if (uploadedCount > 0) { - toast.success( - `upload-success-${new Date().getTime()}`, - t(I18nKey.EXPLORER$UPLOAD_SUCCESS_MESSAGE, { - count: uploadedCount, - }), - ); - } - - if (skippedCount > 0) { - const message = t(I18nKey.EXPLORER$UPLOAD_PARTIAL_SUCCESS_MESSAGE, { - count: skippedCount, - }); - toast.info(message); - } + if (uploadedCount > 0) { + toast.success( + `upload-success-${new 
Date().getTime()}`, + t(I18nKey.EXPLORER$UPLOAD_SUCCESS_MESSAGE, { + count: uploadedCount, + }), + ); + } - if (uploadedCount === 0 && skippedCount === 0) { - toast.info(t(I18nKey.EXPLORER$NO_FILES_UPLOADED_MESSAGE)); - } + if (skippedCount > 0) { + const message = t(I18nKey.EXPLORER$UPLOAD_PARTIAL_SUCCESS_MESSAGE, { + count: skippedCount, + }); + toast.info(message); + } - refreshWorkspace(); + if (uploadedCount === 0 && skippedCount === 0) { + toast.info(t(I18nKey.EXPLORER$NO_FILES_UPLOADED_MESSAGE)); } + + refreshWorkspace(); } catch (e) { // Handle unexpected errors (network issues, etc.) toast.error( @@ -211,7 +207,7 @@ function FileExplorer({ error }: FileExplorerProps) {
@@ -219,17 +215,17 @@ function FileExplorer({ error }: FileExplorerProps) {
- {!isHidden && ( + {isOpen && (
{t(I18nKey.EXPLORER$LABEL_WORKSPACE)}
)} setIsHidden((prev) => !prev)} + isHidden={!isOpen} + toggleHidden={onToggle} onRefresh={refreshWorkspace} onUpload={selectFileInput} /> @@ -237,7 +233,7 @@ function FileExplorer({ error }: FileExplorerProps) {
{!error && (
-
+
diff --git a/frontend/src/components/file-explorer/TreeNode.tsx b/frontend/src/components/file-explorer/TreeNode.tsx index fd44cd88cd72..b3aa3c28335c 100644 --- a/frontend/src/components/file-explorer/TreeNode.tsx +++ b/frontend/src/components/file-explorer/TreeNode.tsx @@ -59,14 +59,11 @@ function TreeNode({ path, defaultOpen = false }: TreeNodeProps) { return; } - const token = localStorage.getItem("token"); - if (token) { - try { - const newChildren = await OpenHands.getFiles(token, path); - setChildren(newChildren); - } catch (error) { - toast.error("Failed to fetch files"); - } + try { + const newChildren = await OpenHands.getFiles(path); + setChildren(newChildren); + } catch (error) { + toast.error("Failed to fetch files"); } }; @@ -77,15 +74,13 @@ function TreeNode({ path, defaultOpen = false }: TreeNodeProps) { }, [refreshID, isOpen]); const handleClick = async () => { - const token = localStorage.getItem("token"); - if (isDirectory) { setIsOpen((prev) => !prev); - } else if (token) { + } else { const code = modifiedFiles[path] || files[path]; try { - const fetchedCode = await OpenHands.getFile(token, path); + const fetchedCode = await OpenHands.getFile(path); setSelectedPath(path); if (!code || fetchedCode !== files[path]) { setFileContent(path, fetchedCode); diff --git a/frontend/src/components/github-repositories-suggestion-box.tsx b/frontend/src/components/github-repositories-suggestion-box.tsx new file mode 100644 index 000000000000..c39abe11a9ea --- /dev/null +++ b/frontend/src/components/github-repositories-suggestion-box.tsx @@ -0,0 +1,94 @@ +import React from "react"; +import { + isGitHubErrorReponse, + retrieveAllGitHubUserRepositories, +} from "#/api/github"; +import { SuggestionBox } from "#/routes/_oh._index/suggestion-box"; +import { ConnectToGitHubModal } from "./modals/connect-to-github-modal"; +import { ModalBackdrop } from "./modals/modal-backdrop"; +import { GitHubRepositorySelector } from "#/routes/_oh._index/github-repo-selector"; +import ModalButton from "./buttons/ModalButton"; +import GitHubLogo from "#/assets/branding/github-logo.svg?react"; + +interface GitHubAuthProps { + onConnectToGitHub: () => void; + repositories: GitHubRepository[]; + isLoggedIn: boolean; +} + +function GitHubAuth({ + onConnectToGitHub, + repositories, + isLoggedIn, +}: GitHubAuthProps) { + if (isLoggedIn) { + return ; + } + + return ( + } + className="bg-[#791B80] w-full" + onClick={onConnectToGitHub} + /> + ); +} + +interface GitHubRepositoriesSuggestionBoxProps { + repositories: Awaited< + ReturnType + > | null; + gitHubAuthUrl: string | null; + user: GitHubErrorReponse | GitHubUser | null; +} + +export function GitHubRepositoriesSuggestionBox({ + repositories, + gitHubAuthUrl, + user, +}: GitHubRepositoriesSuggestionBoxProps) { + const [connectToGitHubModalOpen, setConnectToGitHubModalOpen] = + React.useState(false); + + const handleConnectToGitHub = () => { + if (gitHubAuthUrl) { + window.location.href = gitHubAuthUrl; + } else { + setConnectToGitHubModalOpen(true); + } + }; + + if (isGitHubErrorReponse(repositories)) { + return ( + {repositories.message}

+ } + /> + ); + } + + return ( + <> + + } + /> + {connectToGitHubModalOpen && ( + setConnectToGitHubModalOpen(false)}> + setConnectToGitHubModalOpen(false)} + /> + + )} + + ); +} diff --git a/frontend/src/components/modals/AccountSettingsModal.tsx b/frontend/src/components/modals/AccountSettingsModal.tsx index 1acdacc0319d..0ca6df56be32 100644 --- a/frontend/src/components/modals/AccountSettingsModal.tsx +++ b/frontend/src/components/modals/AccountSettingsModal.tsx @@ -14,12 +14,14 @@ interface AccountSettingsModalProps { onClose: () => void; selectedLanguage: string; gitHubError: boolean; + analyticsConsent: string | null; } function AccountSettingsModal({ onClose, selectedLanguage, gitHubError, + analyticsConsent, }: AccountSettingsModalProps) { const data = useRouteLoaderData("routes/_oh"); const settingsFetcher = useFetcher({ @@ -32,6 +34,7 @@ function AccountSettingsModal({ const formData = new FormData(event.currentTarget); const language = formData.get("language")?.toString(); const ghToken = formData.get("ghToken")?.toString(); + const analytics = formData.get("analytics")?.toString() === "on"; const accountForm = new FormData(); const loginForm = new FormData(); @@ -44,6 +47,7 @@ function AccountSettingsModal({ accountForm.append("language", languageKey ?? "en"); } if (ghToken) loginForm.append("ghToken", ghToken); + accountForm.append("analytics", analytics.toString()); settingsFetcher.submit(accountForm, { method: "POST", @@ -101,6 +105,15 @@ function AccountSettingsModal({ )}
+ +
{description}; + return ( + {children || description} + ); } interface BaseModalProps { diff --git a/frontend/src/components/modals/connect-to-github-modal.tsx b/frontend/src/components/modals/connect-to-github-modal.tsx index e0315d14c84f..165dab852105 100644 --- a/frontend/src/components/modals/connect-to-github-modal.tsx +++ b/frontend/src/components/modals/connect-to-github-modal.tsx @@ -53,6 +53,7 @@ export function ConnectToGitHubModal({ onClose }: ConnectToGitHubModalProps) {
{ setIsConnected(true); diff --git a/frontend/src/entry.client.tsx b/frontend/src/entry.client.tsx index 3e87b2736e23..8a6d4fac2dfc 100644 --- a/frontend/src/entry.client.tsx +++ b/frontend/src/entry.client.tsx @@ -6,13 +6,25 @@ */ import { RemixBrowser } from "@remix-run/react"; -import { startTransition, StrictMode } from "react"; +import React, { startTransition, StrictMode } from "react"; import { hydrateRoot } from "react-dom/client"; import { Provider } from "react-redux"; +import posthog from "posthog-js"; import { SocketProvider } from "./context/socket"; import "./i18n"; import store from "./store"; +function PosthogInit() { + React.useEffect(() => { + posthog.init("phc_3ESMmY9SgqEAGBB6sMGK5ayYHkeUuknH2vP6FmWH9RA", { + api_host: "https://us.i.posthog.com", + person_profiles: "identified_only", + }); + }, []); + + return null; +} + async function prepareApp() { if ( process.env.NODE_ENV === "development" && @@ -34,6 +46,7 @@ prepareApp().then(() => + , diff --git a/frontend/src/i18n/translation.json b/frontend/src/i18n/translation.json index 795c60e051f2..6db2520b6b8a 100644 --- a/frontend/src/i18n/translation.json +++ b/frontend/src/i18n/translation.json @@ -1441,6 +1441,12 @@ "fr": "Privé", "tr": "Özel" }, + "ERROR_MESSAGE$SHOW_DETAILS": { + "en": "Show details" + }, + "ERROR_MESSAGE$HIDE_DETAILS": { + "en": "Hide details" + }, "STATUS$STARTING_RUNTIME": { "en": "Starting Runtime...", "zh-CN": "启动运行时...", @@ -1510,5 +1516,17 @@ "ar": "في انتظار جاهزية العميل...", "fr": "En attente que le client soit prêt...", "tr": "İstemcinin hazır olması bekleniyor..." + }, + "STATUS$ERROR_LLM_AUTHENTICATION": { + "en": "Error authenticating with the LLM provider. Please check your API key" + }, + "STATUS$ERROR_RUNTIME_DISCONNECTED": { + "en": "There was an error while connecting to the runtime. Please refresh the page." + }, + "AGENT_ERROR$BAD_ACTION": { + "en": "Agent tried to execute a malformed action." + }, + "AGENT_ERROR$ACTION_TIMEOUT": { + "en": "Action timed out." 
} } diff --git a/frontend/src/mocks/handlers.ts b/frontend/src/mocks/handlers.ts index 91a1c5d7acbc..97a7f9cf9c84 100644 --- a/frontend/src/mocks/handlers.ts +++ b/frontend/src/mocks/handlers.ts @@ -1,7 +1,7 @@ import { delay, http, HttpResponse } from "msw"; const openHandsHandlers = [ - http.get("http://localhost:3000/api/options/models", async () => { + http.get("/api/options/models", async () => { await delay(); return HttpResponse.json([ "gpt-3.5-turbo", @@ -10,17 +10,17 @@ const openHandsHandlers = [ ]); }), - http.get("http://localhost:3000/api/options/agents", async () => { + http.get("/api/options/agents", async () => { await delay(); return HttpResponse.json(["CodeActAgent", "CoActAgent"]); }), - http.get("http://localhost:3000/api/options/security-analyzers", async () => { + http.get("/api/options/security-analyzers", async () => { await delay(); return HttpResponse.json(["mock-invariant"]); }), - http.get("http://localhost:3000/api/list-files", async ({ request }) => { + http.get("http://localhost:3001/api/list-files", async ({ request }) => { await delay(); const token = request.headers @@ -32,11 +32,11 @@ const openHandsHandlers = [ return HttpResponse.json(["file1.ts", "dir1/file2.ts", "file3.ts"]); }), - http.post("http://localhost:3000/api/save-file", () => + http.post("http://localhost:3001/api/save-file", () => HttpResponse.json(null, { status: 200 }), ), - http.get("http://localhost:3000/api/select-file", async ({ request }) => { + http.get("http://localhost:3001/api/select-file", async ({ request }) => { await delay(); const token = request.headers @@ -58,7 +58,7 @@ const openHandsHandlers = [ return HttpResponse.json(null, { status: 404 }); }), - http.post("http://localhost:3000/api/submit-feedback", async () => { + http.post("http://localhost:3001/api/submit-feedback", async () => { await delay(1200); return HttpResponse.json({ @@ -70,7 +70,9 @@ const openHandsHandlers = [ export const handlers = [ ...openHandsHandlers, - http.get("https://api.github.com/user/repos", ({ request }) => { + http.get("https://api.github.com/user/repos", async ({ request }) => { + if (import.meta.env.MODE !== "test") await delay(3500); + const token = request.headers .get("Authorization") ?.replace("Bearer", "") @@ -85,7 +87,20 @@ export const handlers = [ { id: 2, full_name: "octocat/earth" }, ]); }), - http.post("http://localhost:3000/api/submit-feedback", async () => + http.get("https://api.github.com/user", () => { + const user: GitHubUser = { + id: 1, + login: "octocat", + avatar_url: "https://avatars.githubusercontent.com/u/583231?v=4", + }; + + return HttpResponse.json(user); + }), + http.post("http://localhost:3001/api/submit-feedback", async () => HttpResponse.json({ statusCode: 200 }, { status: 200 }), ), + http.post("https://us.i.posthog.com/e", async () => + HttpResponse.json(null, { status: 200 }), + ), + http.get("/config.json", () => HttpResponse.json({ APP_MODE: "oss" })), ]; diff --git a/frontend/src/routes/_oh._index/github-repo-selector.tsx b/frontend/src/routes/_oh._index/github-repo-selector.tsx index 73dc03cf7d97..370bd3a613e4 100644 --- a/frontend/src/routes/_oh._index/github-repo-selector.tsx +++ b/frontend/src/routes/_oh._index/github-repo-selector.tsx @@ -1,5 +1,6 @@ import { Autocomplete, AutocompleteItem } from "@nextui-org/react"; import { useDispatch } from "react-redux"; +import { useNavigate } from "react-router-dom"; import { setSelectedRepository } from "#/state/initial-query-slice"; interface GitHubRepositorySelectorProps { @@ -9,6 +10,7 @@ interface 
GitHubRepositorySelectorProps { export function GitHubRepositorySelector({ repositories, }: GitHubRepositorySelectorProps) { + const navigate = useNavigate(); const dispatch = useDispatch(); const handleRepoSelection = (id: string | null) => { @@ -16,6 +18,7 @@ export function GitHubRepositorySelector({ if (repo) { // set query param dispatch(setSelectedRepository(repo.full_name)); + navigate("/app"); } }; @@ -26,6 +29,7 @@ export function GitHubRepositorySelector({ return ( {repositories.map((repo) => ( - + {repo.full_name} ))} diff --git a/frontend/src/routes/_oh._index/route.tsx b/frontend/src/routes/_oh._index/route.tsx index edc22c6dfca4..5f1df1b6c0a1 100644 --- a/frontend/src/routes/_oh._index/route.tsx +++ b/frontend/src/routes/_oh._index/route.tsx @@ -1,54 +1,24 @@ import { + Await, ClientActionFunctionArgs, ClientLoaderFunctionArgs, - json, + defer, redirect, useLoaderData, + useNavigate, useRouteLoaderData, } from "@remix-run/react"; -import React from "react"; +import React, { Suspense } from "react"; import { SuggestionBox } from "./suggestion-box"; import { TaskForm } from "./task-form"; import { HeroHeading } from "./hero-heading"; -import { GitHubRepositorySelector } from "./github-repo-selector"; -import { - isGitHubErrorReponse, - retrieveAllGitHubUserRepositories, -} from "#/api/github"; -import ModalButton from "#/components/buttons/ModalButton"; -import GitHubLogo from "#/assets/branding/github-logo.svg?react"; -import { ConnectToGitHubModal } from "#/components/modals/connect-to-github-modal"; -import { ModalBackdrop } from "#/components/modals/modal-backdrop"; +import { retrieveAllGitHubUserRepositories } from "#/api/github"; import store from "#/store"; import { setInitialQuery } from "#/state/initial-query-slice"; import { clientLoader as rootClientLoader } from "#/routes/_oh"; import OpenHands from "#/api/open-hands"; import { generateGitHubAuthUrl } from "#/utils/generate-github-auth-url"; - -interface GitHubAuthProps { - onConnectToGitHub: () => void; - repositories: GitHubRepository[]; - isLoggedIn: boolean; -} - -function GitHubAuth({ - onConnectToGitHub, - repositories, - isLoggedIn, -}: GitHubAuthProps) { - if (isLoggedIn) { - return ; - } - - return ( - } - className="bg-[#791B80] w-full" - onClick={onConnectToGitHub} - /> - ); -} +import { GitHubRepositoriesSuggestionBox } from "#/components/github-repositories-suggestion-box"; export const clientLoader = async ({ request }: ClientLoaderFunctionArgs) => { let isSaas = false; @@ -67,12 +37,12 @@ export const clientLoader = async ({ request }: ClientLoaderFunctionArgs) => { const token = localStorage.getItem("token"); if (token) return redirect("/app"); - let repositories: GitHubRepository[] = []; + let repositories: ReturnType< + typeof retrieveAllGitHubUserRepositories + > | null = null; if (ghToken) { - const data = await retrieveAllGitHubUserRepositories(ghToken); - if (!isGitHubErrorReponse(data)) { - repositories = data; - } + const data = retrieveAllGitHubUserRepositories(ghToken); + repositories = data; } let githubAuthUrl: string | null = null; @@ -81,7 +51,7 @@ export const clientLoader = async ({ request }: ClientLoaderFunctionArgs) => { githubAuthUrl = generateGitHubAuthUrl(githubClientId, requestUrl); } - return json({ repositories, githubAuthUrl }); + return defer({ repositories, githubAuthUrl }); }; export const clientAction = async ({ request }: ClientActionFunctionArgs) => { @@ -93,40 +63,40 @@ export const clientAction = async ({ request }: ClientActionFunctionArgs) => { }; function Home() { 
+ const navigate = useNavigate(); const rootData = useRouteLoaderData("routes/_oh"); const { repositories, githubAuthUrl } = useLoaderData(); - const [connectToGitHubModalOpen, setConnectToGitHubModalOpen] = - React.useState(false); const [importedFile, setImportedFile] = React.useState(null); - const handleConnectToGitHub = () => { - if (githubAuthUrl) { - window.location.href = githubAuthUrl; - } else { - setConnectToGitHubModalOpen(true); - } - }; - return ( -
+
- } - /> + > + + {(resolvedRepositories) => ( + + )} + +
- {connectToGitHubModalOpen && ( - setConnectToGitHubModalOpen(false)}> - setConnectToGitHubModalOpen(false)} - /> - - )}
); } diff --git a/frontend/src/routes/_oh.app._index/code-editor-component.tsx b/frontend/src/routes/_oh.app._index/code-editor-component.tsx index cf94ed863a41..8182805193a0 100644 --- a/frontend/src/routes/_oh.app._index/code-editor-component.tsx +++ b/frontend/src/routes/_oh.app._index/code-editor-component.tsx @@ -1,18 +1,21 @@ -import { Editor, Monaco } from "@monaco-editor/react"; +import { Editor, EditorProps } from "@monaco-editor/react"; import React from "react"; import { useTranslation } from "react-i18next"; import { VscCode } from "react-icons/vsc"; -import { type editor } from "monaco-editor"; import toast from "react-hot-toast"; import { I18nKey } from "#/i18n/declaration"; import { useFiles } from "#/context/files"; import OpenHands from "#/api/open-hands"; interface CodeEditorCompoonentProps { + onMount: EditorProps["onMount"]; isReadOnly: boolean; } -function CodeEditorCompoonent({ isReadOnly }: CodeEditorCompoonentProps) { +function CodeEditorCompoonent({ + onMount, + isReadOnly, +}: CodeEditorCompoonentProps) { const { t } = useTranslation(); const { files, @@ -22,22 +25,6 @@ function CodeEditorCompoonent({ isReadOnly }: CodeEditorCompoonentProps) { saveFileContent: saveNewFileContent, } = useFiles(); - const handleEditorDidMount = React.useCallback( - (editor: editor.IStandaloneCodeEditor, monaco: Monaco): void => { - monaco.editor.defineTheme("my-theme", { - base: "vs-dark", - inherit: true, - rules: [], - colors: { - "editor.background": "#171717", - }, - }); - - monaco.editor.setTheme("my-theme"); - }, - [], - ); - const handleEditorChange = (value: string | undefined) => { if (selectedPath && value) modifyFileContent(selectedPath, value); }; @@ -49,8 +36,7 @@ function CodeEditorCompoonent({ isReadOnly }: CodeEditorCompoonentProps) { if (content) { try { - const token = localStorage.getItem("token")?.toString(); - if (token) await OpenHands.saveFile(token, selectedPath, content); + await OpenHands.saveFile(selectedPath, content); } catch (error) { toast.error("Failed to save file"); } @@ -68,7 +54,7 @@ function CodeEditorCompoonent({ isReadOnly }: CodeEditorCompoonentProps) { return (
{t(I18nKey.CODE_EDITOR$EMPTY_MESSAGE)} @@ -79,7 +65,6 @@ function CodeEditorCompoonent({ isReadOnly }: CodeEditorCompoonentProps) { return ( diff --git a/frontend/src/routes/_oh.app._index/route.tsx b/frontend/src/routes/_oh.app._index/route.tsx index ba20e003f797..6ef5f5762ae8 100644 --- a/frontend/src/routes/_oh.app._index/route.tsx +++ b/frontend/src/routes/_oh.app._index/route.tsx @@ -1,12 +1,13 @@ import React from "react"; import { useSelector } from "react-redux"; -import { json, useLoaderData, useRouteError } from "@remix-run/react"; +import { json, useRouteError } from "@remix-run/react"; import toast from "react-hot-toast"; +import { editor } from "monaco-editor"; +import { EditorProps } from "@monaco-editor/react"; import { RootState } from "#/store"; import AgentState from "#/types/AgentState"; import FileExplorer from "#/components/file-explorer/FileExplorer"; import OpenHands from "#/api/open-hands"; -import { useSocket } from "#/context/socket"; import CodeEditorCompoonent from "./code-editor-component"; import { useFiles } from "#/context/files"; import { EditorActions } from "#/components/editor-actions"; @@ -28,8 +29,7 @@ export function ErrorBoundary() { } function CodeEditor() { - const { token } = useLoaderData(); - const { runtimeActive } = useSocket(); + const { curAgentState } = useSelector((state: RootState) => state.agent); const { setPaths, selectedPath, @@ -37,6 +37,27 @@ function CodeEditor() { saveFileContent: saveNewFileContent, discardChanges, } = useFiles(); + const [fileExplorerIsOpen, setFileExplorerIsOpen] = React.useState(true); + const editorRef = React.useRef(null); + + const toggleFileExplorer = () => { + setFileExplorerIsOpen((prev) => !prev); + editorRef.current?.layout({ width: 0, height: 0 }); + }; + + const handleEditorDidMount: EditorProps["onMount"] = (e, monaco) => { + editorRef.current = e; + + monaco.editor.defineTheme("oh-dark", { + base: "vs-dark", + inherit: true, + rules: [], + colors: { + "editor.background": "#171717", + }, + }); + monaco.editor.setTheme("oh-dark"); + }; const [errors, setErrors] = React.useState<{ getFiles: string | null }>({ getFiles: null, @@ -47,15 +68,14 @@ function CodeEditor() { ); React.useEffect(() => { - // only retrieve files if connected to WS to prevent requesting before runtime is ready - if (runtimeActive && token) { - OpenHands.getFiles(token) + if (curAgentState === AgentState.INIT) { + OpenHands.getFiles() .then(setPaths) .catch(() => { setErrors({ getFiles: "Failed to retrieve files" }); }); } - }, [runtimeActive, token]); + }, [curAgentState]); // Code editing is only allowed when the agent is paused, finished, or awaiting user input (server rules) const isEditingAllowed = React.useMemo( @@ -69,9 +89,9 @@ function CodeEditor() { const handleSave = async () => { if (selectedPath) { const content = modifiedFiles[selectedPath]; - if (content && token) { + if (content) { try { - await OpenHands.saveFile(token, selectedPath, content); + await OpenHands.saveFile(selectedPath, content); saveNewFileContent(selectedPath); } catch (error) { toast.error("Failed to save file"); @@ -85,9 +105,13 @@ function CodeEditor() { }; return ( -
- -
+
+ +
{selectedPath && (
{selectedPath} @@ -98,9 +122,10 @@ function CodeEditor() { />
)} -
- -
+
); diff --git a/frontend/src/routes/_oh.app.tsx b/frontend/src/routes/_oh.app.tsx index 9af5ad278bb3..50c933b64d9e 100644 --- a/frontend/src/routes/_oh.app.tsx +++ b/frontend/src/routes/_oh.app.tsx @@ -72,9 +72,8 @@ const isAgentStateChange = ( export const clientLoader = async () => { const ghToken = localStorage.getItem("ghToken"); - try { - const isAuthed = await userIsAuthenticated(ghToken); + const isAuthed = await userIsAuthenticated(); if (!isAuthed) { clearSession(); return redirect("/"); @@ -185,21 +184,6 @@ function App() { if (q) addIntialQueryToChat(q, files); }, [settings]); - const handleError = (message: string) => { - const [error, ...rest] = message.split(":"); - const details = rest.join(":"); - if (!details) { - dispatch( - addErrorMessage({ - error: "An error has occured", - message: error, - }), - ); - } else { - dispatch(addErrorMessage({ error, message: details })); - } - }; - const handleMessage = React.useCallback( (message: MessageEvent) => { // set token received from the server @@ -225,7 +209,12 @@ function App() { return; } if (isErrorObservation(parsed)) { - handleError(parsed.message); + dispatch( + addErrorMessage({ + id: parsed.extras?.error_id, + message: parsed.message, + }), + ); return; } @@ -290,21 +279,21 @@ function App() { React.useEffect(() => { (async () => { - if (runtimeActive && token && importedProjectZip) { + if (runtimeActive && importedProjectZip) { // upload files action try { const blob = base64ToBlob(importedProjectZip); const file = new File([blob], "imported-project.zip", { type: blob.type, }); - await OpenHands.uploadFiles(token, [file]); + await OpenHands.uploadFiles([file]); dispatch(setImportedProjectZip(null)); } catch (error) { toast.error("Failed to upload project files."); } } })(); - }, [runtimeActive, token, importedProjectZip]); + }, [runtimeActive, importedProjectZip]); const { isOpen: securityModalIsOpen, @@ -315,7 +304,7 @@ function App() { return (
- + diff --git a/frontend/src/routes/_oh.tsx b/frontend/src/routes/_oh.tsx index b4b1b35cb098..b6594f7e73bf 100644 --- a/frontend/src/routes/_oh.tsx +++ b/frontend/src/routes/_oh.tsx @@ -10,6 +10,8 @@ import { Outlet, ClientLoaderFunctionArgs, } from "@remix-run/react"; +import posthog from "posthog-js"; +import { useDispatch } from "react-redux"; import { retrieveGitHubUser, isGitHubErrorReponse } from "#/api/github"; import OpenHands from "#/api/open-hands"; import CogTooth from "#/assets/cog-tooth"; @@ -28,6 +30,9 @@ import DocsIcon from "#/assets/docs.svg?react"; import { userIsAuthenticated } from "#/utils/user-is-authenticated"; import { generateGitHubAuthUrl } from "#/utils/generate-github-auth-url"; import { WaitlistModal } from "#/components/waitlist-modal"; +import { AnalyticsConsentFormModal } from "#/components/analytics-consent-form-modal"; +import { setCurrentAgentState } from "#/state/agentSlice"; +import AgentState from "#/types/AgentState"; export const clientLoader = async ({ request }: ClientLoaderFunctionArgs) => { try { @@ -41,12 +46,20 @@ export const clientLoader = async ({ request }: ClientLoaderFunctionArgs) => { let token = localStorage.getItem("token"); const ghToken = localStorage.getItem("ghToken"); + const analyticsConsent = localStorage.getItem("analytics-consent"); + const userConsents = analyticsConsent === "true"; - let isAuthed: boolean = false; + if (!userConsents) { + posthog.opt_out_capturing(); + } else { + posthog.opt_in_capturing(); + } + + let isAuthed = false; let githubAuthUrl: string | null = null; try { - isAuthed = await userIsAuthenticated(ghToken); + isAuthed = await userIsAuthenticated(); if (!isAuthed && window.__GITHUB_CLIENT_ID__) { const requestUrl = new URL(request.url); githubAuthUrl = generateGitHubAuthUrl( @@ -79,6 +92,7 @@ export const clientLoader = async ({ request }: ClientLoaderFunctionArgs) => { user, settingsIsUpdated, settings, + analyticsConsent, }); }; @@ -132,9 +146,11 @@ export default function MainApp() { githubAuthUrl, settingsIsUpdated, settings, + analyticsConsent, } = useLoaderData(); const logoutFetcher = useFetcher({ key: "logout" }); const endSessionFetcher = useFetcher({ key: "end-session" }); + const dispatch = useDispatch(); const [accountSettingsModalOpen, setAccountSettingsModalOpen] = React.useState(false); @@ -204,6 +220,7 @@ export default function MainApp() { const handleEndSession = () => { setStartNewProjectModalIsOpen(false); + dispatch(setCurrentAgentState(AgentState.LOADING)); // call new session action and redirect to '/' endSessionFetcher.submit(new FormData(), { method: "POST", @@ -212,7 +229,10 @@ export default function MainApp() { }; return ( -
+