From 76cae8fee3078f189399023a1c3bca02712442fc Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 26 Nov 2024 12:13:13 -0500 Subject: [PATCH 1/9] feat(agent): add webpage read microagent --- .../codeact_agent/micro/webpage_read.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 openhands/agenthub/codeact_agent/micro/webpage_read.md diff --git a/openhands/agenthub/codeact_agent/micro/webpage_read.md b/openhands/agenthub/codeact_agent/micro/webpage_read.md new file mode 100644 index 000000000000..24d8919e1b5f --- /dev/null +++ b/openhands/agenthub/codeact_agent/micro/webpage_read.md @@ -0,0 +1,19 @@ +--- +name: webpage_read +agent: CodeActAgent +triggers: +- http:// +- https:// +--- + +To read content from a webpage, you can use the `percollate` CLI tool: + +1. Install `percollate` with `npm install -g percollate` +2. Once installed, use it to convert a webpage to markdown with the following command: + +```bash +percollate md https://example.com --output example.md +``` + +3. Then, you can read the markdown file `./example.md` using other tools you have access to. +4. If you need to interact further with the webpage, you should not use the `percollate` CLI tool. Instead, you should use the web browser directly provided to you. From 3660445d7ead15137a803941ae86858bbd0a8ce8 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 26 Nov 2024 13:17:28 -0500 Subject: [PATCH 2/9] use BrowseURLAction instead --- .../codeact_agent/function_calling.py | 30 +++++++- .../codeact_agent/micro/webpage_read.md | 19 ----- openhands/events/observation/browse.py | 69 ++++++++++++------- openhands/runtime/browser/utils.py | 2 + tests/unit/test_security.py | 1 + 5 files changed, 76 insertions(+), 45 deletions(-) delete mode 100644 openhands/agenthub/codeact_agent/micro/webpage_read.md diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index a4ee35ff7b59..399776e6c6f3 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -19,6 +19,7 @@ AgentDelegateAction, AgentFinishAction, BrowseInteractiveAction, + BrowseURLAction, CmdRunAction, FileEditAction, IPythonRunCellAction, @@ -266,6 +267,30 @@ def __init__(self): ), ) + +_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `webpage_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.). + +You may use the `webpage_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`). +""" + +WebReadTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='web_read', + description=_WEB_DESCRIPTION, + parameters={ + 'type': 'object', + 'properties': { + 'url': { + 'type': 'string', + 'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).', + } + }, + 'required': ['url'], + }, + ), +) + # from browsergym/core/action/highlevel.py _browser_action_space = HighLevelActionSet( subsets=['bid', 'nav'], @@ -274,7 +299,7 @@ def __init__(self): ) -_BROWSER_DESCRIPTION = """Interact with the browser using Python code. +_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage. See the description of "code" parameter for more details. @@ -484,6 +509,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]: action = IPythonRunCellAction(code=code, include_extra=False) elif tool_call.function.name == 'browser': action = BrowseInteractiveAction(browser_actions=arguments['code']) + elif tool_call.function.name == 'web_read': + action = BrowseURLAction(url=arguments['url']) else: raise FunctionCallNotExistsError( f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.' @@ -516,6 +543,7 @@ def get_tools( ) -> list[ChatCompletionToolParam]: tools = [CmdRunTool, FinishTool] if codeact_enable_browsing: + tools.append(WebReadTool) tools.append(BrowserTool) if codeact_enable_jupyter: tools.append(IPythonTool) diff --git a/openhands/agenthub/codeact_agent/micro/webpage_read.md b/openhands/agenthub/codeact_agent/micro/webpage_read.md deleted file mode 100644 index 24d8919e1b5f..000000000000 --- a/openhands/agenthub/codeact_agent/micro/webpage_read.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -name: webpage_read -agent: CodeActAgent -triggers: -- http:// -- https:// ---- - -To read content from a webpage, you can use the `percollate` CLI tool: - -1. Install `percollate` with `npm install -g percollate` -2. Once installed, use it to convert a webpage to markdown with the following command: - -```bash -percollate md https://example.com --output example.md -``` - -3. Then, you can read the markdown file `./example.md` using other tools you have access to. -4. If you need to interact further with the webpage, you should not use the `percollate` CLI tool. Instead, you should use the web browser directly provided to you. diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py index 9632fac57d54..fd3e12871f16 100644 --- a/openhands/events/observation/browse.py +++ b/openhands/events/observation/browse.py @@ -2,7 +2,7 @@ from browsergym.utils.obs import flatten_axtree_to_str -from openhands.core.schema import ObservationType +from openhands.core.schema import ActionType, ObservationType from openhands.events.observation.observation import Observation @@ -11,6 +11,7 @@ class BrowserOutputObservation(Observation): """This data class represents the output of a browser.""" url: str + trigger_by_action: str screenshot: str = field(repr=False) # don't show in repr error: bool = False observation: str = ObservationType.BROWSE @@ -48,31 +49,49 @@ def __str__(self) -> str: def get_agent_obs_text(self) -> str: """Get a concise text that will be shown to the agent.""" - text = f'[Current URL: {self.url}]\n' - text += f'[Focused element bid: {self.focused_element_bid}]\n\n' - if self.error: - text += ( - '================ BEGIN error message ===============\n' - 'The following error occurred when executing the last action:\n' - f'{self.last_browser_action_error}\n' - '================ END error message ===============\n' - ) - else: - text += '[Action executed successfully.]\n' + if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE: + text = f'[Current URL: {self.url}]\n' + text += f'[Focused element bid: {self.focused_element_bid}]\n\n' + if self.error: + text += ( + '================ BEGIN error message ===============\n' + 'The following error occurred when executing the last action:\n' + f'{self.last_browser_action_error}\n' + '================ END error message ===============\n' + ) + else: + text += '[Action executed successfully.]\n' + try: + # We do not filter visible only here because we want to show the full content + # of the web page to the agent for simplicity. + # FIXME: handle the case when the web page is too large + cur_axtree_txt = self.get_axtree_str(filter_visible_only=False) + text += ( + f'============== BEGIN accessibility tree ==============\n' + f'{cur_axtree_txt}\n' + f'============== END accessibility tree ==============\n' + ) + except Exception as e: + text += ( + f'\n[Error encountered when processing the accessibility tree: {e}]' + ) + return text - try: - # We do not filter visible only here because we want to show the full content - # of the web page to the agent for simplicity. - # FIXME: handle the case when the web page is too large - cur_axtree_txt = self.get_axtree_str(filter_visible_only=False) - text += ( - f'============== BEGIN accessibility tree ==============\n' - f'{cur_axtree_txt}\n' - f'============== END accessibility tree ==============\n' - ) - except Exception as e: - text += f'\n[Error encountered when processing the accessibility tree: {e}]' - return text + elif self.trigger_by_action == ActionType.BROWSE: + text = f'[Current URL: {self.url}]\n' + if self.error: + text += ( + '================ BEGIN error message ===============\n' + 'The following error occurred when trying to visit the URL:\n' + f'{self.last_browser_action_error}\n' + '================ END error message ===============\n' + ) + text += '============== BEGIN webpage content ==============\n' + text += self.content + text += '\n============== END webpage content ==============\n' + return text + else: + raise ValueError(f'Invalid trigger_by_action: {self.trigger_by_action}') def get_axtree_str(self, filter_visible_only: bool = False) -> str: cur_axtree_txt = flatten_axtree_to_str( diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index 336b3801e3e2..6f823e47d546 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -49,6 +49,7 @@ async def browse( ), # last browser env action performed last_browser_action_error=obs.get('last_action_error', ''), error=True if obs.get('last_action_error', '') else False, # error flag + trigger_by_action=action.action, ) except Exception as e: return BrowserOutputObservation( @@ -57,4 +58,5 @@ async def browse( error=True, last_browser_action_error=str(e), url=asked_url if action.action == ActionType.BROWSE else '', + trigger_by_action=action.action, ) diff --git a/tests/unit/test_security.py b/tests/unit/test_security.py index 771ccc206d3c..68ea9f9eea63 100644 --- a/tests/unit/test_security.py +++ b/tests/unit/test_security.py @@ -382,6 +382,7 @@ def test_parse_action(action, expected_trace): content='browser output content', url='http://localhost:3000', screenshot='screenshot', + trigger_by_action=ActionType.BROWSE, ), [ ToolOutput( From 1a33d2a7cdc87546ac79dbf17b93184ed480460a Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 26 Nov 2024 13:24:07 -0500 Subject: [PATCH 3/9] dispatch thought for browse url first --- frontend/src/services/actions.ts | 6 +++++- openhands/events/action/browse.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts index 13265776dcee..69844b8a22cd 100644 --- a/frontend/src/services/actions.ts +++ b/frontend/src/services/actions.ts @@ -21,7 +21,11 @@ import { handleObservationMessage } from "./observations"; const messageActions = { [ActionType.BROWSE]: (message: ActionMessage) => { - store.dispatch(addAssistantMessage(message.message)); + if (message.args.thought) { + store.dispatch(addAssistantMessage(message.args.thought)); + } else { + store.dispatch(addAssistantMessage(message.message)); + } }, [ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => { if (message.args.thought) { diff --git a/openhands/events/action/browse.py b/openhands/events/action/browse.py index 41816216d6d5..418dd0444366 100644 --- a/openhands/events/action/browse.py +++ b/openhands/events/action/browse.py @@ -15,7 +15,7 @@ class BrowseURLAction(Action): @property def message(self) -> str: - return f'Browsing URL: {self.url}' + return f'I am browsing the URL: {self.url}' def __str__(self) -> str: ret = '**BrowseURLAction**\n' From 8474a42056d039db87e49b4f939848386e095e48 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 26 Nov 2024 13:24:18 -0500 Subject: [PATCH 4/9] remove content from browser output str for debug clarity --- openhands/events/observation/browse.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py index fd3e12871f16..1052aaf17a91 100644 --- a/openhands/events/observation/browse.py +++ b/openhands/events/observation/browse.py @@ -41,7 +41,6 @@ def __str__(self) -> str: f'Last browser action: {self.last_browser_action}\n' f'Last browser action error: {self.last_browser_action_error}\n' f'Focused element bid: {self.focused_element_bid}\n' - f'Content: {self.content}\n' ) ret += '--- Agent Observation ---\n' ret += self.get_agent_obs_text() From b9b75638ab41c707f0faeae95676a875296c8fcb Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 26 Nov 2024 13:46:38 -0500 Subject: [PATCH 5/9] add BrowseURL action to codeact --- openhands/agenthub/codeact_agent/codeact_agent.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 6743de87ade6..1113fd0271d3 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -15,6 +15,7 @@ AgentDelegateAction, AgentFinishAction, BrowseInteractiveAction, + BrowseURLAction, CmdRunAction, FileEditAction, IPythonRunCellAction, @@ -151,6 +152,7 @@ def get_action_message( IPythonRunCellAction, FileEditAction, BrowseInteractiveAction, + BrowseURLAction, ), ) or ( isinstance(action, (AgentFinishAction, CmdRunAction)) From 4e9cb764806f03a5abb25a37351e71dfd6b45f10 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 26 Nov 2024 13:47:28 -0500 Subject: [PATCH 6/9] fix pr browsing test --- evaluation/integration_tests/tests/t06_github_pr_browsing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 52ec927cd334..69d37f69b6b6 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -27,6 +27,8 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: content = event.content elif isinstance(event, AgentFinishAction): content = event.outputs.get('content', '') + if not content: + content = event.thought elif isinstance(event, MessageAction): content = event.content else: From 87f5e9abf914778492a8dd60595822f322d4e13e Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 26 Nov 2024 16:02:03 -0500 Subject: [PATCH 7/9] fix test --- evaluation/integration_tests/run_infer.py | 2 +- evaluation/integration_tests/tests/t06_github_pr_browsing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 5e3205fefe2e..6eace74403d0 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -130,7 +130,7 @@ def process_instance( # # ============================================= histories = [event_to_dict(event) for event in state.history] - test_result: TestResult = test_class.verify_result(runtime, histories) + test_result: TestResult = test_class.verify_result(runtime, state.history) metrics = state.metrics.get() if state.metrics else None # Save the output diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 69d37f69b6b6..7536ccc828d8 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -27,8 +27,8 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: content = event.content elif isinstance(event, AgentFinishAction): content = event.outputs.get('content', '') - if not content: - content = event.thought + if event.thought: + content += f'\n\n{event.thought}' elif isinstance(event, MessageAction): content = event.content else: From 922849b5be812ff21d5211a55f7194c5bca107b1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 27 Nov 2024 09:42:40 -0500 Subject: [PATCH 8/9] cleanup miniwob runtime appropriately --- evaluation/integration_tests/run_infer.py | 45 ++++++++++++----------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 6eace74403d0..2c7f7d7a1c14 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -107,31 +107,32 @@ def process_instance( # ============================================= # create sandbox and run the agent # ============================================= - runtime: Runtime = create_runtime(config) call_async_from_sync(runtime.connect) - - test_class.initialize_runtime(runtime) - - # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = asyncio.run( - run_controller( - config=config, - initial_user_action=MessageAction(content=instruction), - runtime=runtime, - fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + try: + test_class.initialize_runtime(runtime) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + ) ) - ) - if state is None: - raise ValueError('State should not be None.') - - # # ============================================= - # # result evaluation - # # ============================================= - - histories = [event_to_dict(event) for event in state.history] - test_result: TestResult = test_class.verify_result(runtime, state.history) - metrics = state.metrics.get() if state.metrics else None + if state is None: + raise ValueError('State should not be None.') + + # # ============================================= + # # result evaluation + # # ============================================= + + histories = [event_to_dict(event) for event in state.history] + test_result: TestResult = test_class.verify_result(runtime, state.history) + metrics = state.metrics.get() if state.metrics else None + finally: + runtime.close() # Save the output output = EvalOutput( From 97df362af04aa934ef14537e3341adb60e129d86 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 27 Nov 2024 16:35:41 -0500 Subject: [PATCH 9/9] fix linter --- evaluation/integration_tests/run_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 36eb353d73f4..3b6f1c6ff2cc 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -135,7 +135,7 @@ def process_instance( # # ============================================= histories = state.history - + # some basic check logger.info(f'Total events in history: {len(histories)}') assert len(histories) > 0, 'History should not be empty'