diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index 8a21b12ae5b2..3b6f1c6ff2cc 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -113,36 +113,37 @@ def process_instance( # ============================================= # create sandbox and run the agent # ============================================= - runtime: Runtime = create_runtime(config) call_async_from_sync(runtime.connect) - - test_class.initialize_runtime(runtime) - - # Here's how you can run the agent (similar to the `main` function) and get the final task state - state: State | None = asyncio.run( - run_controller( - config=config, - initial_user_action=MessageAction(content=instruction), - runtime=runtime, - fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + try: + test_class.initialize_runtime(runtime) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class], + ) ) - ) - if state is None: - raise ValueError('State should not be None.') + if state is None: + raise ValueError('State should not be None.') - # # ============================================= - # # result evaluation - # # ============================================= + # # ============================================= + # # result evaluation + # # ============================================= - histories = state.history + histories = state.history - # some basic check - logger.info(f'Total events in history: {len(histories)}') - assert len(histories) > 0, 'History should not be empty' + # some basic check + logger.info(f'Total events in history: {len(histories)}') + assert len(histories) > 0, 'History should not be empty' - test_result: TestResult = 
test_class.verify_result(runtime, histories) - metrics = state.metrics.get() if state.metrics else None + test_result: TestResult = test_class.verify_result(runtime, histories) + metrics = state.metrics.get() if state.metrics else None + finally: + runtime.close() # Save the output output = EvalOutput( diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py index 2dc1a01ecd97..3c25e0300a35 100644 --- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py +++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py @@ -32,6 +32,8 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: content = event.content elif isinstance(event, AgentFinishAction): content = event.outputs.get('content', '') + if event.thought: + content += f'\n\n{event.thought}' elif isinstance(event, MessageAction): content = event.content else: diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts index 13265776dcee..69844b8a22cd 100644 --- a/frontend/src/services/actions.ts +++ b/frontend/src/services/actions.ts @@ -21,7 +21,11 @@ import { handleObservationMessage } from "./observations"; const messageActions = { [ActionType.BROWSE]: (message: ActionMessage) => { - store.dispatch(addAssistantMessage(message.message)); + if (message.args.thought) { + store.dispatch(addAssistantMessage(message.args.thought)); + } else { + store.dispatch(addAssistantMessage(message.message)); + } }, [ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => { if (message.args.thought) { diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 6743de87ade6..1113fd0271d3 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -15,6 +15,7 @@ AgentDelegateAction, AgentFinishAction, BrowseInteractiveAction, + BrowseURLAction, CmdRunAction, 
FileEditAction, IPythonRunCellAction, @@ -151,6 +152,7 @@ def get_action_message( IPythonRunCellAction, FileEditAction, BrowseInteractiveAction, + BrowseURLAction, ), ) or ( isinstance(action, (AgentFinishAction, CmdRunAction)) diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index a4ee35ff7b59..399776e6c6f3 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -19,6 +19,7 @@ AgentDelegateAction, AgentFinishAction, BrowseInteractiveAction, + BrowseURLAction, CmdRunAction, FileEditAction, IPythonRunCellAction, @@ -266,6 +267,30 @@ def __init__(self): ), ) + +_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.). + +You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`). +""" + +WebReadTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='web_read', + description=_WEB_DESCRIPTION, + parameters={ + 'type': 'object', + 'properties': { + 'url': { + 'type': 'string', + 'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).', + } + }, + 'required': ['url'], + }, + ), +) + # from browsergym/core/action/highlevel.py _browser_action_space = HighLevelActionSet( subsets=['bid', 'nav'], @@ -274,7 +299,7 @@ def __init__(self): ) -_BROWSER_DESCRIPTION = """Interact with the browser using Python code. +_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage. 
See the description of "code" parameter for more details. @@ -484,6 +509,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]: action = IPythonRunCellAction(code=code, include_extra=False) elif tool_call.function.name == 'browser': action = BrowseInteractiveAction(browser_actions=arguments['code']) + elif tool_call.function.name == 'web_read': + action = BrowseURLAction(url=arguments['url']) else: raise FunctionCallNotExistsError( f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.' @@ -516,6 +543,7 @@ def get_tools( ) -> list[ChatCompletionToolParam]: tools = [CmdRunTool, FinishTool] if codeact_enable_browsing: + tools.append(WebReadTool) tools.append(BrowserTool) if codeact_enable_jupyter: tools.append(IPythonTool) diff --git a/openhands/events/action/browse.py b/openhands/events/action/browse.py index 41816216d6d5..418dd0444366 100644 --- a/openhands/events/action/browse.py +++ b/openhands/events/action/browse.py @@ -15,7 +15,7 @@ class BrowseURLAction(Action): @property def message(self) -> str: - return f'Browsing URL: {self.url}' + return f'I am browsing the URL: {self.url}' def __str__(self) -> str: ret = '**BrowseURLAction**\n' diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py index 9632fac57d54..1052aaf17a91 100644 --- a/openhands/events/observation/browse.py +++ b/openhands/events/observation/browse.py @@ -2,7 +2,7 @@ from browsergym.utils.obs import flatten_axtree_to_str -from openhands.core.schema import ObservationType +from openhands.core.schema import ActionType, ObservationType from openhands.events.observation.observation import Observation @@ -11,6 +11,7 @@ class BrowserOutputObservation(Observation): """This data class represents the output of a browser.""" url: str + trigger_by_action: str screenshot: str = field(repr=False) # don't show in repr error: bool = False observation: str = 
ObservationType.BROWSE @@ -40,7 +41,6 @@ def __str__(self) -> str: f'Last browser action: {self.last_browser_action}\n' f'Last browser action error: {self.last_browser_action_error}\n' f'Focused element bid: {self.focused_element_bid}\n' - f'Content: {self.content}\n' ) ret += '--- Agent Observation ---\n' ret += self.get_agent_obs_text() @@ -48,31 +48,49 @@ def __str__(self) -> str: def get_agent_obs_text(self) -> str: """Get a concise text that will be shown to the agent.""" - text = f'[Current URL: {self.url}]\n' - text += f'[Focused element bid: {self.focused_element_bid}]\n\n' - if self.error: - text += ( - '================ BEGIN error message ===============\n' - 'The following error occurred when executing the last action:\n' - f'{self.last_browser_action_error}\n' - '================ END error message ===============\n' - ) - else: - text += '[Action executed successfully.]\n' + if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE: + text = f'[Current URL: {self.url}]\n' + text += f'[Focused element bid: {self.focused_element_bid}]\n\n' + if self.error: + text += ( + '================ BEGIN error message ===============\n' + 'The following error occurred when executing the last action:\n' + f'{self.last_browser_action_error}\n' + '================ END error message ===============\n' + ) + else: + text += '[Action executed successfully.]\n' + try: + # We do not filter visible only here because we want to show the full content + # of the web page to the agent for simplicity. 
+ # FIXME: handle the case when the web page is too large + cur_axtree_txt = self.get_axtree_str(filter_visible_only=False) + text += ( + f'============== BEGIN accessibility tree ==============\n' + f'{cur_axtree_txt}\n' + f'============== END accessibility tree ==============\n' + ) + except Exception as e: + text += ( + f'\n[Error encountered when processing the accessibility tree: {e}]' + ) + return text - try: - # We do not filter visible only here because we want to show the full content - # of the web page to the agent for simplicity. - # FIXME: handle the case when the web page is too large - cur_axtree_txt = self.get_axtree_str(filter_visible_only=False) - text += ( - f'============== BEGIN accessibility tree ==============\n' - f'{cur_axtree_txt}\n' - f'============== END accessibility tree ==============\n' - ) - except Exception as e: - text += f'\n[Error encountered when processing the accessibility tree: {e}]' - return text + elif self.trigger_by_action == ActionType.BROWSE: + text = f'[Current URL: {self.url}]\n' + if self.error: + text += ( + '================ BEGIN error message ===============\n' + 'The following error occurred when trying to visit the URL:\n' + f'{self.last_browser_action_error}\n' + '================ END error message ===============\n' + ) + text += '============== BEGIN webpage content ==============\n' + text += self.content + text += '\n============== END webpage content ==============\n' + return text + else: + raise ValueError(f'Invalid trigger_by_action: {self.trigger_by_action}') def get_axtree_str(self, filter_visible_only: bool = False) -> str: cur_axtree_txt = flatten_axtree_to_str( diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index 336b3801e3e2..6f823e47d546 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -49,6 +49,7 @@ async def browse( ), # last browser env action performed last_browser_action_error=obs.get('last_action_error', ''), 
error=True if obs.get('last_action_error', '') else False, # error flag + trigger_by_action=action.action, ) except Exception as e: return BrowserOutputObservation( @@ -57,4 +58,5 @@ async def browse( error=True, last_browser_action_error=str(e), url=asked_url if action.action == ActionType.BROWSE else '', + trigger_by_action=action.action, ) diff --git a/tests/unit/test_security.py b/tests/unit/test_security.py index 771ccc206d3c..68ea9f9eea63 100644 --- a/tests/unit/test_security.py +++ b/tests/unit/test_security.py @@ -382,6 +382,7 @@ def test_parse_action(action, expected_trace): content='browser output content', url='http://localhost:3000', screenshot='screenshot', + trigger_by_action=ActionType.BROWSE, ), [ ToolOutput(