Skip to content

Commit

Permalink
feat(agent): add BrowseURLAction to CodeAct (produce markdown from UR…
Browse files Browse the repository at this point in the history
…L) (#5285)
  • Loading branch information
xingyaoww authored Nov 27, 2024
1 parent f0ca223 commit 4d3b035
Show file tree
Hide file tree
Showing 9 changed files with 110 additions and 52 deletions.
47 changes: 24 additions & 23 deletions evaluation/integration_tests/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,36 +113,37 @@ def process_instance(
# =============================================
# create sandbox and run the agent
# =============================================

runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

test_class.initialize_runtime(runtime)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
runtime=runtime,
fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
try:
test_class.initialize_runtime(runtime)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
runtime=runtime,
fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
)
)
)
if state is None:
raise ValueError('State should not be None.')
if state is None:
raise ValueError('State should not be None.')

# # =============================================
# # result evaluation
# # =============================================
# # =============================================
# # result evaluation
# # =============================================

histories = state.history
histories = state.history

# some basic check
logger.info(f'Total events in history: {len(histories)}')
assert len(histories) > 0, 'History should not be empty'
# some basic check
logger.info(f'Total events in history: {len(histories)}')
assert len(histories) > 0, 'History should not be empty'

test_result: TestResult = test_class.verify_result(runtime, histories)
metrics = state.metrics.get() if state.metrics else None
test_result: TestResult = test_class.verify_result(runtime, histories)
metrics = state.metrics.get() if state.metrics else None
finally:
runtime.close()

# Save the output
output = EvalOutput(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/integration_tests/tests/t06_github_pr_browsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
content = event.content
elif isinstance(event, AgentFinishAction):
content = event.outputs.get('content', '')
if event.thought:
content += f'\n\n{event.thought}'
elif isinstance(event, MessageAction):
content = event.content
else:
Expand Down
6 changes: 5 additions & 1 deletion frontend/src/services/actions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ import { handleObservationMessage } from "./observations";

const messageActions = {
[ActionType.BROWSE]: (message: ActionMessage) => {
store.dispatch(addAssistantMessage(message.message));
if (message.args.thought) {
store.dispatch(addAssistantMessage(message.args.thought));
} else {
store.dispatch(addAssistantMessage(message.message));
}
},
[ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => {
if (message.args.thought) {
Expand Down
2 changes: 2 additions & 0 deletions openhands/agenthub/codeact_agent/codeact_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
AgentDelegateAction,
AgentFinishAction,
BrowseInteractiveAction,
BrowseURLAction,
CmdRunAction,
FileEditAction,
IPythonRunCellAction,
Expand Down Expand Up @@ -151,6 +152,7 @@ def get_action_message(
IPythonRunCellAction,
FileEditAction,
BrowseInteractiveAction,
BrowseURLAction,
),
) or (
isinstance(action, (AgentFinishAction, CmdRunAction))
Expand Down
30 changes: 29 additions & 1 deletion openhands/agenthub/codeact_agent/function_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
AgentDelegateAction,
AgentFinishAction,
BrowseInteractiveAction,
BrowseURLAction,
CmdRunAction,
FileEditAction,
IPythonRunCellAction,
Expand Down Expand Up @@ -266,6 +267,30 @@ def __init__(self):
),
)


# Description shown to the LLM for the webpage-reading tool.
# The tool name mentioned in this text must match the registered function name
# below (`web_read`): the model calls tools by the name it reads here, and a
# call to an unregistered name is rejected as a non-existent tool.
_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).
You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
"""

# Function-calling schema for the `web_read` tool.
# It takes a single required string argument, `url`.
WebReadTool = ChatCompletionToolParam(
    type='function',
    function=ChatCompletionToolParamFunctionChunk(
        name='web_read',
        description=_WEB_DESCRIPTION,
        parameters={
            'type': 'object',
            'properties': {
                'url': {
                    'type': 'string',
                    'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).',
                }
            },
            'required': ['url'],
        },
    ),
)

# from browsergym/core/action/highlevel.py
_browser_action_space = HighLevelActionSet(
subsets=['bid', 'nav'],
Expand All @@ -274,7 +299,7 @@ def __init__(self):
)


_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage.
See the description of "code" parameter for more details.
Expand Down Expand Up @@ -484,6 +509,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
action = IPythonRunCellAction(code=code, include_extra=False)
elif tool_call.function.name == 'browser':
action = BrowseInteractiveAction(browser_actions=arguments['code'])
elif tool_call.function.name == 'web_read':
action = BrowseURLAction(url=arguments['url'])
else:
raise FunctionCallNotExistsError(
f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.'
Expand Down Expand Up @@ -516,6 +543,7 @@ def get_tools(
) -> list[ChatCompletionToolParam]:
tools = [CmdRunTool, FinishTool]
if codeact_enable_browsing:
tools.append(WebReadTool)
tools.append(BrowserTool)
if codeact_enable_jupyter:
tools.append(IPythonTool)
Expand Down
2 changes: 1 addition & 1 deletion openhands/events/action/browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class BrowseURLAction(Action):

@property
def message(self) -> str:
return f'Browsing URL: {self.url}'
return f'I am browsing the URL: {self.url}'

def __str__(self) -> str:
ret = '**BrowseURLAction**\n'
Expand Down
70 changes: 44 additions & 26 deletions openhands/events/observation/browse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from browsergym.utils.obs import flatten_axtree_to_str

from openhands.core.schema import ObservationType
from openhands.core.schema import ActionType, ObservationType
from openhands.events.observation.observation import Observation


Expand All @@ -11,6 +11,7 @@ class BrowserOutputObservation(Observation):
"""This data class represents the output of a browser."""

url: str
trigger_by_action: str
screenshot: str = field(repr=False) # don't show in repr
error: bool = False
observation: str = ObservationType.BROWSE
Expand Down Expand Up @@ -40,39 +41,56 @@ def __str__(self) -> str:
f'Last browser action: {self.last_browser_action}\n'
f'Last browser action error: {self.last_browser_action_error}\n'
f'Focused element bid: {self.focused_element_bid}\n'
f'Content: {self.content}\n'
)
ret += '--- Agent Observation ---\n'
ret += self.get_agent_obs_text()
return ret

def get_agent_obs_text(self) -> str:
"""Get a concise text that will be shown to the agent."""
text = f'[Current URL: {self.url}]\n'
text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
if self.error:
text += (
'================ BEGIN error message ===============\n'
'The following error occurred when executing the last action:\n'
f'{self.last_browser_action_error}\n'
'================ END error message ===============\n'
)
else:
text += '[Action executed successfully.]\n'
if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
text = f'[Current URL: {self.url}]\n'
text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
if self.error:
text += (
'================ BEGIN error message ===============\n'
'The following error occurred when executing the last action:\n'
f'{self.last_browser_action_error}\n'
'================ END error message ===============\n'
)
else:
text += '[Action executed successfully.]\n'
try:
# We do not filter visible only here because we want to show the full content
# of the web page to the agent for simplicity.
# FIXME: handle the case when the web page is too large
cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
text += (
f'============== BEGIN accessibility tree ==============\n'
f'{cur_axtree_txt}\n'
f'============== END accessibility tree ==============\n'
)
except Exception as e:
text += (
f'\n[Error encountered when processing the accessibility tree: {e}]'
)
return text

try:
# We do not filter visible only here because we want to show the full content
# of the web page to the agent for simplicity.
# FIXME: handle the case when the web page is too large
cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
text += (
f'============== BEGIN accessibility tree ==============\n'
f'{cur_axtree_txt}\n'
f'============== END accessibility tree ==============\n'
)
except Exception as e:
text += f'\n[Error encountered when processing the accessibility tree: {e}]'
return text
elif self.trigger_by_action == ActionType.BROWSE:
text = f'[Current URL: {self.url}]\n'
if self.error:
text += (
'================ BEGIN error message ===============\n'
'The following error occurred when trying to visit the URL:\n'
f'{self.last_browser_action_error}\n'
'================ END error message ===============\n'
)
text += '============== BEGIN webpage content ==============\n'
text += self.content
text += '\n============== END webpage content ==============\n'
return text
else:
raise ValueError(f'Invalid trigger_by_action: {self.trigger_by_action}')

def get_axtree_str(self, filter_visible_only: bool = False) -> str:
cur_axtree_txt = flatten_axtree_to_str(
Expand Down
2 changes: 2 additions & 0 deletions openhands/runtime/browser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ async def browse(
), # last browser env action performed
last_browser_action_error=obs.get('last_action_error', ''),
error=True if obs.get('last_action_error', '') else False, # error flag
trigger_by_action=action.action,
)
except Exception as e:
return BrowserOutputObservation(
Expand All @@ -57,4 +58,5 @@ async def browse(
error=True,
last_browser_action_error=str(e),
url=asked_url if action.action == ActionType.BROWSE else '',
trigger_by_action=action.action,
)
1 change: 1 addition & 0 deletions tests/unit/test_security.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ def test_parse_action(action, expected_trace):
content='browser output content',
url='http://localhost:3000',
screenshot='screenshot',
trigger_by_action=ActionType.BROWSE,
),
[
ToolOutput(
Expand Down

0 comments on commit 4d3b035

Please sign in to comment.