Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(agent): add BrowseURLAction to CodeAct (produce markdown from URL) #5285

Merged
merged 10 commits into from
Nov 27, 2024
2 changes: 1 addition & 1 deletion evaluation/integration_tests/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def process_instance(
# # =============================================

histories = [event_to_dict(event) for event in state.history]
test_result: TestResult = test_class.verify_result(runtime, histories)
test_result: TestResult = test_class.verify_result(runtime, state.history)
metrics = state.metrics.get() if state.metrics else None

# Save the output
Expand Down
2 changes: 2 additions & 0 deletions evaluation/integration_tests/tests/t06_github_pr_browsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
content = event.content
elif isinstance(event, AgentFinishAction):
content = event.outputs.get('content', '')
if event.thought:
content += f'\n\n{event.thought}'
elif isinstance(event, MessageAction):
content = event.content
else:
Expand Down
6 changes: 5 additions & 1 deletion frontend/src/services/actions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@

const messageActions = {
[ActionType.BROWSE]: (message: ActionMessage) => {
store.dispatch(addAssistantMessage(message.message));
if (message.args.thought) {
store.dispatch(addAssistantMessage(message.args.thought));
} else {
store.dispatch(addAssistantMessage(message.message));
}
},
[ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => {
if (message.args.thought) {
Expand Down Expand Up @@ -149,6 +153,6 @@
} else if (message.status_update) {
handleStatusMessage(message as unknown as StatusMessage);
} else {
console.error("Unknown message type", message);

Check warning on line 156 in frontend/src/services/actions.ts

View workflow job for this annotation

GitHub Actions / Lint frontend

Unexpected console statement
}
}
2 changes: 2 additions & 0 deletions openhands/agenthub/codeact_agent/codeact_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
AgentDelegateAction,
AgentFinishAction,
BrowseInteractiveAction,
BrowseURLAction,
CmdRunAction,
FileEditAction,
IPythonRunCellAction,
Expand Down Expand Up @@ -151,6 +152,7 @@ def get_action_message(
IPythonRunCellAction,
FileEditAction,
BrowseInteractiveAction,
BrowseURLAction,
),
) or (
isinstance(action, (AgentFinishAction, CmdRunAction))
Expand Down
30 changes: 29 additions & 1 deletion openhands/agenthub/codeact_agent/function_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
AgentDelegateAction,
AgentFinishAction,
BrowseInteractiveAction,
BrowseURLAction,
CmdRunAction,
FileEditAction,
IPythonRunCellAction,
Expand Down Expand Up @@ -266,6 +267,30 @@ def __init__(self):
),
)


# Description shown to the LLM for the webpage-reading tool.
# The tool name quoted in this text must match the registered function name
# (`web_read`); otherwise the model is steered toward calling a tool that is
# not registered and the call is rejected.
_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).

You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
"""

# Tool schema advertised to the LLM for reading a webpage by URL.
# The function name declared here ('web_read') is the name that
# response_to_actions matches on to build a BrowseURLAction from the
# 'url' argument.
WebReadTool = ChatCompletionToolParam(
type='function',
function=ChatCompletionToolParamFunctionChunk(
name='web_read',
description=_WEB_DESCRIPTION,
parameters={
'type': 'object',
'properties': {
# Single string argument; listed under 'required' below.
'url': {
'type': 'string',
'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).',
}
},
'required': ['url'],
},
),
)

# from browsergym/core/action/highlevel.py
_browser_action_space = HighLevelActionSet(
subsets=['bid', 'nav'],
Expand All @@ -274,7 +299,7 @@ def __init__(self):
)


_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage.

See the description of "code" parameter for more details.

Expand Down Expand Up @@ -484,6 +509,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
action = IPythonRunCellAction(code=code, include_extra=False)
elif tool_call.function.name == 'browser':
action = BrowseInteractiveAction(browser_actions=arguments['code'])
elif tool_call.function.name == 'web_read':
action = BrowseURLAction(url=arguments['url'])
else:
raise FunctionCallNotExistsError(
f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.'
Expand Down Expand Up @@ -516,6 +543,7 @@ def get_tools(
) -> list[ChatCompletionToolParam]:
tools = [CmdRunTool, FinishTool]
if codeact_enable_browsing:
tools.append(WebReadTool)
tools.append(BrowserTool)
if codeact_enable_jupyter:
tools.append(IPythonTool)
Expand Down
2 changes: 1 addition & 1 deletion openhands/events/action/browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class BrowseURLAction(Action):

@property
def message(self) -> str:
return f'Browsing URL: {self.url}'
return f'I am browsing the URL: {self.url}'

def __str__(self) -> str:
ret = '**BrowseURLAction**\n'
Expand Down
70 changes: 44 additions & 26 deletions openhands/events/observation/browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from browsergym.utils.obs import flatten_axtree_to_str

from openhands.core.schema import ObservationType
from openhands.core.schema import ActionType, ObservationType
from openhands.events.observation.observation import Observation


Expand All @@ -11,6 +11,7 @@ class BrowserOutputObservation(Observation):
"""This data class represents the output of a browser."""

url: str
trigger_by_action: str
screenshot: str = field(repr=False) # don't show in repr
error: bool = False
observation: str = ObservationType.BROWSE
Expand Down Expand Up @@ -40,39 +41,56 @@ def __str__(self) -> str:
f'Last browser action: {self.last_browser_action}\n'
f'Last browser action error: {self.last_browser_action_error}\n'
f'Focused element bid: {self.focused_element_bid}\n'
f'Content: {self.content}\n'
)
ret += '--- Agent Observation ---\n'
ret += self.get_agent_obs_text()
return ret

def get_agent_obs_text(self) -> str:
"""Get a concise text that will be shown to the agent."""
text = f'[Current URL: {self.url}]\n'
text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
if self.error:
text += (
'================ BEGIN error message ===============\n'
'The following error occurred when executing the last action:\n'
f'{self.last_browser_action_error}\n'
'================ END error message ===============\n'
)
else:
text += '[Action executed successfully.]\n'
if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
text = f'[Current URL: {self.url}]\n'
text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
if self.error:
text += (
'================ BEGIN error message ===============\n'
'The following error occurred when executing the last action:\n'
f'{self.last_browser_action_error}\n'
'================ END error message ===============\n'
)
else:
text += '[Action executed successfully.]\n'
try:
# We do not filter visible only here because we want to show the full content
# of the web page to the agent for simplicity.
# FIXME: handle the case when the web page is too large
cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
text += (
f'============== BEGIN accessibility tree ==============\n'
f'{cur_axtree_txt}\n'
f'============== END accessibility tree ==============\n'
)
except Exception as e:
text += (
f'\n[Error encountered when processing the accessibility tree: {e}]'
)
return text

try:
# We do not filter visible only here because we want to show the full content
# of the web page to the agent for simplicity.
# FIXME: handle the case when the web page is too large
cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
text += (
f'============== BEGIN accessibility tree ==============\n'
f'{cur_axtree_txt}\n'
f'============== END accessibility tree ==============\n'
)
except Exception as e:
text += f'\n[Error encountered when processing the accessibility tree: {e}]'
return text
elif self.trigger_by_action == ActionType.BROWSE:
text = f'[Current URL: {self.url}]\n'
if self.error:
text += (
'================ BEGIN error message ===============\n'
'The following error occurred when trying to visit the URL:\n'
f'{self.last_browser_action_error}\n'
'================ END error message ===============\n'
)
text += '============== BEGIN webpage content ==============\n'
text += self.content
text += '\n============== END webpage content ==============\n'
return text
else:
raise ValueError(f'Invalid trigger_by_action: {self.trigger_by_action}')

def get_axtree_str(self, filter_visible_only: bool = False) -> str:
cur_axtree_txt = flatten_axtree_to_str(
Expand Down
2 changes: 2 additions & 0 deletions openhands/runtime/browser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ async def browse(
), # last browser env action performed
last_browser_action_error=obs.get('last_action_error', ''),
error=True if obs.get('last_action_error', '') else False, # error flag
trigger_by_action=action.action,
)
except Exception as e:
return BrowserOutputObservation(
Expand All @@ -57,4 +58,5 @@ async def browse(
error=True,
last_browser_action_error=str(e),
url=asked_url if action.action == ActionType.BROWSE else '',
trigger_by_action=action.action,
)
1 change: 1 addition & 0 deletions tests/unit/test_security.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ def test_parse_action(action, expected_trace):
content='browser output content',
url='http://localhost:3000',
screenshot='screenshot',
trigger_by_action=ActionType.BROWSE,
),
[
ToolOutput(
Expand Down