From 0c69d6484c60d419c579558b5c21609a58d8634e Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Tue, 26 Nov 2024 13:21:31 +0000 Subject: [PATCH 01/14] setup and add utils --- .../codeact_agent/function_calling.py | 62 ++++++++++ openhands/runtime/browser/transformer.py | 115 ++++++++++++++++++ openhands/runtime/browser/typing.py | 26 ++++ openhands/runtime/browser/utils.py | 15 ++- poetry.lock | 16 ++- pyproject.toml | 3 + 6 files changed, 233 insertions(+), 4 deletions(-) create mode 100644 openhands/runtime/browser/transformer.py create mode 100644 openhands/runtime/browser/typing.py diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index a4ee35ff7b59..33432fd2dffa 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -419,6 +419,68 @@ def __init__(self): ), ) +_GUI_USE_TOOL_DESCRIPTION = """The following 5 functions are available. Nothing else is supported. + +keyboard_type(text: str) + Types a string of text through the keyboard. Sends a keydown, keypress/input, + and keyup event for each character in the text. Modifier keys DO NOT affect + keyboard_type. Holding down Shift will not type the text in upper case. + + Examples: + keyboard_type('Hello world!') + +mouse_move(x: float, y: float) + Description: Move the mouse to a location. Uses absolute client coordinates in pixels. + Dispatches a mousemove event. + + Examples: + mouse_move(65.2, 158.5) + +mouse_click(x: float, y: float, button: Literal["left", "middle", "right"] = "left") + Description: Move the mouse to a location and click a mouse button. Dispatches mousemove, + mousedown and mouseup events. + + Examples: + mouse_click(887.2, 68) + mouse_click(56, 712.56, 'right') + +mouse_dblclick(x: float, y: float, button: Literal["left", "middle", "right"] = "left") + Description: Move the mouse to a location and double click a mouse button. Dispatches + mousemove, mousedown and mouseup events. + + Examples: + mouse_dblclick(5, 236) + mouse_dblclick(87.5, 354, 'right') + +mouse_drag_and_drop(from_x: float, from_y: float, to_x: float, to_y: float) + Description: Drag and drop from a location to a location. Uses absolute client + coordinates in pixels. Dispatches mousemove, mousedown and mouseup + events. + + Examples: + mouse_drag_and_drop(10.7, 325, 235.6, 24.54) +""" + +GUIUseTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='gui_use', + description=_GUI_USE_TOOL_DESCRIPTION, + parameters={ + 'type': 'object', + 'properties': { + 'code': { + 'type': 'string', + 'description': ( + 'The Python code that interacts with the GUI.\n' + + _GUI_USE_TOOL_DESCRIPTION + ), + } + }, + }, + ), +) + _FINISH_DESCRIPTION = """Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.""" FinishTool = ChatCompletionToolParam( diff --git a/openhands/runtime/browser/transformer.py b/openhands/runtime/browser/transformer.py new file mode 100644 index 000000000000..697de327cf9b --- /dev/null +++ b/openhands/runtime/browser/transformer.py @@ -0,0 +1,115 @@ +import ast + +import astor + + +class ActionTransformer(ast.NodeTransformer): + def __init__(self, mapping): + self.mapping = mapping + + def visit_Call(self, node): + # Check if the function name matches one in the mapping + if isinstance(node.func, ast.Name) and node.func.id in self.mapping: + transform_info = self.mapping[node.func.id] + target_func = transform_info['target_func'] + arg_transform = transform_info.get('arg_transform') + extra_args = transform_info.get('extra_args', []) + + # Update the function name + node.func.id = target_func + + # Apply argument transformations if defined + if arg_transform: + new_keywords = [] + for kw in node.keywords: + if kw.arg in arg_transform: + new_keywords.extend(arg_transform[kw.arg](kw)) + else: + new_keywords.append(kw) + node.keywords = new_keywords + + # Add extra arguments + for extra_arg in extra_args: + node.keywords.append( + ast.keyword( + arg=extra_arg['name'], + value=ast.Constant(value=extra_arg['value']), + ) + ) + + return self.generic_visit(node) + + +def coordinate_split(arg_node): + if isinstance(arg_node.value, ast.Tuple) and len(arg_node.value.elts) == 2: + x_arg = ast.keyword(arg='to_x', value=arg_node.value.elts[0]) + y_arg = ast.keyword(arg='to_y', value=arg_node.value.elts[1]) + return [x_arg, y_arg] + return [] + + +def translate_computer_use_action_to_browsergym_action(code: str) -> str: + mapping = { + 'type': { + 'target_func': 'keyboard_type', + }, + 'key': { + 'target_func': 'keyboard_type', + }, + 'mouse_move': { + 'target_func': 'mouse_move', + 'arg_transform': {'coordinate': coordinate_split}, + }, + 'left_click_drag': { + 'target_func': 'mouse_drag_and_drop', + 'arg_transform': {'coordinate': coordinate_split}, + }, + 'left_click': { + 'target_func': 'mouse_click', + 'extra_args': [{'name': 'button', 'value': 'left'}], + }, + 'right_click': { + 'target_func': 'mouse_click', + 'extra_args': [{'name': 'button', 'value': 'right'}], + }, + 'middle_click': { + 'target_func': 'mouse_click', + 'extra_args': [{'name': 'button', 'value': 'middle'}], + }, + 'double_click': { + 'target_func': 'mouse_double_click', + 'extra_args': [{'name': 'button', 'value': 'left'}], + }, + 'screenshot': { + 'target_func': 'noop', + }, + 'cursor_position': 'noop', + } + + # Parse code to AST, transform, and generate new code + tree = ast.parse(code) + transformer = ActionTransformer(mapping) + transformed_tree = transformer.visit(tree) + transformed_code = astor.to_source(transformed_tree) + + return transformed_code + + +if __name__ == '__main__': + code = """result = type("Hello, World!")""" + assert ( + translate_computer_use_action_to_browsergym_action(code) + == "result = keyboard_type('Hello, World!')\n" + ) + + code = """result = mouse_move(coordinate=(100, 200))""" + assert ( + translate_computer_use_action_to_browsergym_action(code) + == 'result = mouse_move(to_x=100, to_y=200)\n' + ) + + code = """result = left_click()""" + assert ( + translate_computer_use_action_to_browsergym_action(code) + == "result = mouse_click(button='left')\n" + ) diff --git a/openhands/runtime/browser/typing.py b/openhands/runtime/browser/typing.py new file mode 100644 index 000000000000..419fa8a16cc4 --- /dev/null +++ b/openhands/runtime/browser/typing.py @@ -0,0 +1,26 @@ +from enum import StrEnum +from typing import Literal, TypedDict + + +class Resolution(TypedDict): + width: int + height: int + + +class ScalingSource(StrEnum): + COMPUTER = 'computer' + API = 'api' + + +ComputerUseAction = Literal[ + 'type', # type sequence in chunks --> keyboard_type + 'key', # key sequence pressed --> keyboard_type + 'mouse_move', # move mouse to a position --> mouse_move + 'left_click', # left click --> mouse_click + 'left_click_drag', # left click and drag --> mouse_drag_and_drop + 'right_click', # right click --> mouse_click + 'middle_click', # middle click --> mouse_click + 'double_click', # double left click --> mouse_dblclick + 'screenshot', # take a screenshot --> noop + 'cursor_position', # get cursor position --> +] diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index 336b3801e3e2..434cc6f761cb 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -1,10 +1,15 @@ import os +from typing import get_args from openhands.core.exceptions import BrowserUnavailableException from openhands.core.schema import ActionType from openhands.events.action import BrowseInteractiveAction, BrowseURLAction from openhands.events.observation import BrowserOutputObservation from openhands.runtime.browser.browser_env import BrowserEnv +from openhands.runtime.browser.transformer import ( + translate_computer_use_action_to_browsergym_action, +) +from openhands.runtime.browser.typing import ComputerUseAction async def browse( @@ -21,9 +26,15 @@ async def browse( action_str = f'goto("{asked_url}")' elif isinstance(action, BrowseInteractiveAction): - # new BrowseInteractiveAction, supports full featured BrowserGym actions + # received action_str defined by Anthropic's Computer Use feature: see https://docs.anthropic.com/en/docs/build-with-claude/computer-use#computer-tool + _action_str = action.browser_actions + + if _action_str not in get_args(ComputerUseAction): + raise ValueError(f'Invalid action: {_action_str}') + + # translate to BrowserGym actions # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py - action_str = action.browser_actions + action_str = translate_computer_use_action_to_browsergym_action(_action_str) else: raise ValueError(f'Invalid action type: {action.action}') diff --git a/poetry.lock b/poetry.lock index 70d888cafed6..9ceee78e7cde 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aenum" @@ -353,6 +353,17 @@ files = [ [package.extras] tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"] +[[package]] +name = "astor" +version = "0.8.1" +description = "Read/rewrite/write Python ASTs" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +files = [ + {file = "astor-0.8.1-py2.py3-none-any.whl", hash = "sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5"}, + {file = "astor-0.8.1.tar.gz", hash = "sha256:6a6effda93f4e1ce9f618779b2dd1d9d84f1e32812c23a29b3fff6fd7f63fa5e"}, +] + [[package]] name = "asttokens" version = "2.4.1" @@ -5629,6 +5640,7 @@ optional = false python-versions = ">=3.6" files = [ {file = "opencv-python-4.10.0.84.tar.gz", hash = "sha256:72d234e4582e9658ffea8e9cae5b63d488ad06994ef12d81dc303b17472f3526"}, + {file = "opencv_python-4.10.0.84-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:fc182f8f4cda51b45f01c64e4cbedfc2f00aff799debebc305d8d0210c43f251"}, {file = "opencv_python-4.10.0.84-cp37-abi3-macosx_12_0_x86_64.whl", hash = "sha256:71e575744f1d23f79741450254660442785f45a0797212852ee5199ef12eed98"}, {file = "opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09a332b50488e2dda866a6c5573ee192fe3583239fb26ff2f7f9ceb0bc119ea6"}, {file = "opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ace140fc6d647fbe1c692bcb2abce768973491222c067c131d80957c595b71f"}, @@ -10212,4 +10224,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "b710448cff0788b563f4d7614fca438ab0b9fe19903a061750012c56da95ff37" +content-hash = "08b911701c5a4543bf58a722845cf099e19bedf154ccde26cd9fc6f854acda8d" diff --git a/pyproject.toml b/pyproject.toml index fc1807f72fc3..036a25e8f166 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ modal = "^0.64.145" runloop-api-client = "0.7.0" pygithub = "^2.5.0" openhands-aci = "^0.1.1" +astor = "*" [tool.poetry.group.llama-index.dependencies] llama-index = "*" @@ -95,6 +96,7 @@ reportlab = "*" [tool.coverage.run] concurrency = ["gevent"] + [tool.poetry.group.runtime.dependencies] jupyterlab = "*" notebook = "*" @@ -125,6 +127,7 @@ ignore = ["D1"] [tool.ruff.lint.pydocstyle] convention = "google" + [tool.poetry.group.evaluation.dependencies] streamlit = "*" whatthepatch = "*" From bf248329b458aa18c54f63facbad3aac76e532eb Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Thu, 28 Nov 2024 13:32:02 +0000 Subject: [PATCH 02/14] update tool description --- .../codeact_agent/function_calling.py | 90 +++++++++---------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index 33432fd2dffa..562b81f96291 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -419,46 +419,14 @@ def __init__(self): ), ) -_GUI_USE_TOOL_DESCRIPTION = """The following 5 functions are available. Nothing else is supported. - -keyboard_type(text: str) - Types a string of text through the keyboard. Sends a keydown, keypress/input, - and keyup event for each character in the text. Modifier keys DO NOT affect - keyboard_type. Holding down Shift will not type the text in upper case. - - Examples: - keyboard_type('Hello world!') - -mouse_move(x: float, y: float) - Description: Move the mouse to a location. Uses absolute client coordinates in pixels. - Dispatches a mousemove event. - - Examples: - mouse_move(65.2, 158.5) - -mouse_click(x: float, y: float, button: Literal["left", "middle", "right"] = "left") - Description: Move the mouse to a location and click a mouse button. Dispatches mousemove, - mousedown and mouseup events. - - Examples: - mouse_click(887.2, 68) - mouse_click(56, 712.56, 'right') - -mouse_dblclick(x: float, y: float, button: Literal["left", "middle", "right"] = "left") - Description: Move the mouse to a location and double click a mouse button. Dispatches - mousemove, mousedown and mouseup events. - - Examples: - mouse_dblclick(5, 236) - mouse_dblclick(87.5, 354, 'right') - -mouse_drag_and_drop(from_x: float, from_y: float, to_x: float, to_y: float) - Description: Drag and drop from a location to a location. Uses absolute client - coordinates in pixels. Dispatches mousemove, mousedown and mouseup - events. - - Examples: - mouse_drag_and_drop(10.7, 325, 235.6, 24.54) +_GUI_USE_TOOL_DESCRIPTION = """Use a mouse and keyboard to interact with a computer, and take screenshots. +* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications. +* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try taking another screenshot. +* The screen's resolution is {{ display_width_px }}x{{ display_height_px }}. +* The display number is {{ display_number }} +* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor. +* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click. +* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked. """ GUIUseTool = ChatCompletionToolParam( @@ -469,14 +437,44 @@ def __init__(self): parameters={ 'type': 'object', 'properties': { - 'code': { + 'action': { + 'description': """The action to perform. The available actions are: +* `key`: Press a key or key-combination on the keyboard. + - This supports xdotool's `key` syntax. + - Examples: "a", "Return", "alt+Tab", "ctrl+s", "Up", "KP_0" (for the numpad 0 key). +* `type`: Type a string of text on the keyboard. +* `cursor_position`: Get the current (x, y) pixel coordinate of the cursor on the screen. +* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen. +* `left_click`: Click the left mouse button. +* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen. +* `right_click`: Click the right mouse button. +* `middle_click`: Click the middle mouse button. +* `double_click`: Double-click the left mouse button. +* `screenshot`: Take a screenshot of the screen.""", + 'enum': [ + 'key', + 'type', + 'mouse_move', + 'left_click', + 'left_click_drag', + 'right_click', + 'middle_click', + 'double_click', + 'screenshot', + 'cursor_position', + ], 'type': 'string', - 'description': ( - 'The Python code that interacts with the GUI.\n' - + _GUI_USE_TOOL_DESCRIPTION - ), - } + }, + 'coordinate': { + 'description': '(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=mouse_move` and `action=left_click_drag`.', + 'type': 'array', + }, + 'text': { + 'description': 'Required only by `action=type` and `action=key`.', + 'type': 'string', + }, }, + 'required': ['action'], }, ), ) From 3fe80974cfa46788db95f453c238d30c78e4e241 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Fri, 29 Nov 2024 06:06:13 +0000 Subject: [PATCH 03/14] add missing action translation --- openhands/events/observation/browse.py | 1 + openhands/runtime/action_execution_server.py | 14 ++- openhands/runtime/browser/browser_env.py | 2 + openhands/runtime/browser/transformer.py | 92 ++++++++++----- .../runtime/browser/{typing.py => types.py} | 4 +- openhands/runtime/browser/utils.py | 11 +- tests/unit/test_action_transformer.py | 108 ++++++++++++++++++ 7 files changed, 197 insertions(+), 35 deletions(-) rename openhands/runtime/browser/{typing.py => types.py} (84%) create mode 100644 tests/unit/test_action_transformer.py diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py index 9632fac57d54..028823d5da18 100644 --- a/openhands/events/observation/browse.py +++ b/openhands/events/observation/browse.py @@ -25,6 +25,7 @@ class BrowserOutputObservation(Observation): last_browser_action: str = '' last_browser_action_error: str = '' focused_element_bid: str = '' + mouse_position: list = field(default_factory=list) @property def message(self) -> str: diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py index 1251aa346838..2a046b0daad0 100644 --- a/openhands/runtime/action_execution_server.py +++ b/openhands/runtime/action_execution_server.py @@ -37,6 +37,7 @@ IPythonRunCellAction, ) from openhands.events.observation import ( + BrowserOutputObservation, CmdOutputObservation, ErrorObservation, FileReadObservation, @@ -107,6 +108,7 @@ def __init__( self.browser = BrowserEnv(browsergym_eval_env) self.start_time = time.time() self.last_execution_time = self.start_time + self.last_browser_output_observation: BrowserOutputObservation | None = None @property def initial_pwd(self): @@ -319,10 +321,18 @@ async def write(self, action: FileWriteAction) -> Observation: return FileWriteObservation(content='', path=filepath) async def browse(self, action: BrowseURLAction) -> Observation: - return await browse(action, self.browser) + browser_obs = await browse( + action, self.browser, self.last_browser_output_observation + ) + self.last_browser_output_observation = browser_obs + return browser_obs async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation: - return await browse(action, self.browser) + browser_obs = await browse( + action, self.browser, self.last_browser_output_observation + ) + self.last_browser_output_observation = browser_obs + return browser_obs def close(self): self.bash_session.close() diff --git a/openhands/runtime/browser/browser_env.py b/openhands/runtime/browser/browser_env.py index d9a7fd752956..258ca9dcc64c 100644 --- a/openhands/runtime/browser/browser_env.py +++ b/openhands/runtime/browser/browser_env.py @@ -40,6 +40,8 @@ def __init__(self, browsergym_eval_env: str | None = None): self.init_browser() atexit.register(self.close) + self.last_obs = None + def get_html_text_converter(self): html_text_converter = html2text.HTML2Text() # ignore links and images diff --git a/openhands/runtime/browser/transformer.py b/openhands/runtime/browser/transformer.py index 697de327cf9b..b6a57330e712 100644 --- a/openhands/runtime/browser/transformer.py +++ b/openhands/runtime/browser/transformer.py @@ -2,6 +2,8 @@ import astor +from openhands.events.observation import BrowserOutputObservation + class ActionTransformer(ast.NodeTransformer): def __init__(self, mapping): @@ -48,42 +50,96 @@ def coordinate_split(arg_node): return [] -def translate_computer_use_action_to_browsergym_action(code: str) -> str: +def rename_argument(new_name): + def transformer(arg_node): + # Change the name of the argument + return [ast.keyword(arg=new_name, value=arg_node.value)] + + return transformer + + +def translate_computer_use_action_to_browsergym_action( + code: str, last_obs: BrowserOutputObservation | None +) -> str: + last_mouse_position = last_obs.mouse_position if last_obs else None + if last_mouse_position is None or len(last_mouse_position) != 2: + last_mouse_position = [0, 0] + mapping = { 'type': { 'target_func': 'keyboard_type', + 'arg_transform': {'text': rename_argument('key')}, }, 'key': { - 'target_func': 'keyboard_type', + 'target_func': 'keyboard_press', + 'arg_transform': {'text': rename_argument('key')}, }, 'mouse_move': { 'target_func': 'mouse_move', 'arg_transform': {'coordinate': coordinate_split}, + 'extra_args': [ + { + 'name': 'from_x', + 'value': last_mouse_position[0], + }, + { + 'name': 'from_y', + 'value': last_mouse_position[1], + }, + ], }, 'left_click_drag': { 'target_func': 'mouse_drag_and_drop', 'arg_transform': {'coordinate': coordinate_split}, + 'extra_args': [ + { + 'name': 'from_x', + 'value': last_mouse_position[0], + }, + { + 'name': 'from_y', + 'value': last_mouse_position[1], + }, + ], }, 'left_click': { 'target_func': 'mouse_click', - 'extra_args': [{'name': 'button', 'value': 'left'}], + 'extra_args': [ + {'name': 'button', 'value': 'left'}, + {'name': 'x', 'value': last_mouse_position[0]}, + {'name': 'y', 'value': last_mouse_position[1]}, + ], }, 'right_click': { 'target_func': 'mouse_click', - 'extra_args': [{'name': 'button', 'value': 'right'}], + 'extra_args': [ + {'name': 'button', 'value': 'right'}, + {'name': 'x', 'value': last_mouse_position[0]}, + {'name': 'y', 'value': last_mouse_position[1]}, + ], }, 'middle_click': { 'target_func': 'mouse_click', - 'extra_args': [{'name': 'button', 'value': 'middle'}], + 'extra_args': [ + {'name': 'button', 'value': 'middle'}, + {'name': 'x', 'value': last_mouse_position[0]}, + {'name': 'y', 'value': last_mouse_position[1]}, + ], }, 'double_click': { - 'target_func': 'mouse_double_click', - 'extra_args': [{'name': 'button', 'value': 'left'}], + 'target_func': 'mouse_dblclick', + 'extra_args': [ + {'name': 'button', 'value': 'left'}, + {'name': 'x', 'value': last_mouse_position[0]}, + {'name': 'y', 'value': last_mouse_position[1]}, + ], }, 'screenshot': { 'target_func': 'noop', }, - 'cursor_position': 'noop', + 'cursor_position': { + 'target_func': 'noop', + }, } # Parse code to AST, transform, and generate new code @@ -93,23 +149,3 @@ def translate_computer_use_action_to_browsergym_action(code: str) -> str: transformed_code = astor.to_source(transformed_tree) return transformed_code - - -if __name__ == '__main__': - code = """result = type("Hello, World!")""" - assert ( - translate_computer_use_action_to_browsergym_action(code) - == "result = keyboard_type('Hello, World!')\n" - ) - - code = """result = mouse_move(coordinate=(100, 200))""" - assert ( - translate_computer_use_action_to_browsergym_action(code) - == 'result = mouse_move(to_x=100, to_y=200)\n' - ) - - code = """result = left_click()""" - assert ( - translate_computer_use_action_to_browsergym_action(code) - == "result = mouse_click(button='left')\n" - ) diff --git a/openhands/runtime/browser/typing.py b/openhands/runtime/browser/types.py similarity index 84% rename from openhands/runtime/browser/typing.py rename to openhands/runtime/browser/types.py index 419fa8a16cc4..03925c770b14 100644 --- a/openhands/runtime/browser/typing.py +++ b/openhands/runtime/browser/types.py @@ -13,8 +13,8 @@ class ScalingSource(StrEnum): ComputerUseAction = Literal[ - 'type', # type sequence in chunks --> keyboard_type - 'key', # key sequence pressed --> keyboard_type + 'type', # type sequence --> keyboard_type + 'key', # press a key or key comb --> keyboard_press 'mouse_move', # move mouse to a position --> mouse_move 'left_click', # left click --> mouse_click 'left_click_drag', # left click and drag --> mouse_drag_and_drop diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index 434cc6f761cb..f98cfcd12ccd 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -9,11 +9,13 @@ from openhands.runtime.browser.transformer import ( translate_computer_use_action_to_browsergym_action, ) -from openhands.runtime.browser.typing import ComputerUseAction +from openhands.runtime.browser.types import ComputerUseAction async def browse( - action: BrowseURLAction | BrowseInteractiveAction, browser: BrowserEnv | None + action: BrowseURLAction | BrowseInteractiveAction, + browser: BrowserEnv | None, + last_obs: BrowserOutputObservation | None, ) -> BrowserOutputObservation: if browser is None: raise BrowserUnavailableException() @@ -34,7 +36,9 @@ async def browse( # translate to BrowserGym actions # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py - action_str = translate_computer_use_action_to_browsergym_action(_action_str) + action_str = translate_computer_use_action_to_browsergym_action( + _action_str, last_obs + ) else: raise ValueError(f'Invalid action type: {action.action}') @@ -60,6 +64,7 @@ async def browse( ), # last browser env action performed last_browser_action_error=obs.get('last_action_error', ''), error=True if obs.get('last_action_error', '') else False, # error flag + mouse_position=obs.get('mouse_position', []), # mouse position ) except Exception as e: return BrowserOutputObservation( diff --git a/tests/unit/test_action_transformer.py b/tests/unit/test_action_transformer.py new file mode 100644 index 000000000000..9b9b7da4917e --- /dev/null +++ b/tests/unit/test_action_transformer.py @@ -0,0 +1,108 @@ +import pytest + +from openhands.events.observation import BrowserOutputObservation +from openhands.runtime.browser.transformer import ( + translate_computer_use_action_to_browsergym_action, +) + + +@pytest.fixture +def last_obs(): + return BrowserOutputObservation( + content='Hello, World!', + url='https://example.com', + screenshot='screenshot', + mouse_position=[50, 100], + ) + + +def test_keyboard_type(last_obs): + code = """result = type(text="Hello, World!")""" + expected = "result = keyboard_type(key='Hello, World!')\n" + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) + + +def test_mouse_move(last_obs): + code = """result = mouse_move(coordinate=(100, 200))""" + expected = 'result = mouse_move(to_x=100, to_y=200, from_x=50, from_y=100)\n' + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) + + +def test_left_click(last_obs): + code = """result = left_click()""" + expected = "result = mouse_click(button='left', x=50, y=100)\n" + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) + + +def test_right_click(last_obs): + code = """result = right_click()""" + expected = "result = mouse_click(button='right', x=50, y=100)\n" + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) + + +def test_middle_click(last_obs): + code = """result = middle_click()""" + expected = "result = mouse_click(button='middle', x=50, y=100)\n" + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) + + +def test_double_click(last_obs): + code = """result = double_click()""" + expected = "result = mouse_dblclick(button='left', x=50, y=100)\n" + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) + + +def test_screenshot(last_obs): + code = """result = screenshot()""" + expected = 'result = noop()\n' + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) + + +def test_cursor_position(last_obs): + code = """result = cursor_position()""" + expected = 'result = noop()\n' + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) + + +def test_missing_mouse_position(): + last_obs = BrowserOutputObservation( + content='Hello, World!', + url='https://example.com', + screenshot='screenshot', + mouse_position=None, + ) + code = """result = mouse_move(coordinate=(100, 200))""" + expected = 'result = mouse_move(to_x=100, to_y=200, from_x=0, from_y=0)\n' + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) + + +def test_empty_mouse_position(): + last_obs = BrowserOutputObservation( + content='Hello, World!', + url='https://example.com', + screenshot='screenshot', + mouse_position=[], + ) + code = """result = mouse_move(coordinate=(100, 200))""" + expected = 'result = mouse_move(to_x=100, to_y=200, from_x=0, from_y=0)\n' + assert ( + translate_computer_use_action_to_browsergym_action(code, last_obs) == expected + ) From 48fea0cb3a721a1584822ea25a924d8e0909d3f9 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Fri, 29 Nov 2024 06:11:10 +0000 Subject: [PATCH 04/14] switch obs to screenshot only --- openhands/agenthub/codeact_agent/codeact_agent.py | 11 ++++++++--- openhands/core/message.py | 6 +++++- openhands/runtime/browser/browser_env.py | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 39b9e69247be..490ae5431bf5 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -187,7 +187,9 @@ def get_action_message( ) ] elif isinstance(action, CmdRunAction) and action.source == 'user': - content = [TextContent(text=f'User executed the command:\n{action.command}')] + content = [ + TextContent(text=f'User executed the command:\n{action.command}') + ] return [ Message( role='user', @@ -257,10 +259,13 @@ def get_observation_message( text = truncate_content(str(obs), max_message_chars) message = Message(role='user', content=[TextContent(text=text)]) elif isinstance(obs, BrowserOutputObservation): - text = obs.get_agent_obs_text() + # text = obs.get_agent_obs_text() message = Message( role='user', - content=[TextContent(text=text)], + content=[ + # TextContent(text=text), + ImageContent(image_urls=[obs.screenshot]), + ], ) elif isinstance(obs, AgentDelegateObservation): text = truncate_content( diff --git a/openhands/core/message.py b/openhands/core/message.py index a5b67917eaee..ac3a533c5681 100644 --- a/openhands/core/message.py +++ b/openhands/core/message.py @@ -97,7 +97,11 @@ def _list_serializer(self) -> dict: # See discussion here for details: https://github.com/BerriAI/litellm/issues/6422#issuecomment-2438765472 if self.role == 'tool' and item.cache_prompt: role_tool_with_prompt_caching = True - d.pop('cache_control') + if isinstance(d, list): # image content + for i in range(len(d)): + d[i].pop('cache_control') + else: + d.pop('cache_control') if isinstance(item, TextContent): content.append(d) elif isinstance(item, ImageContent) and self.vision_enabled: diff --git a/openhands/runtime/browser/browser_env.py b/openhands/runtime/browser/browser_env.py index 258ca9dcc64c..efdabfa6a045 100644 --- a/openhands/runtime/browser/browser_env.py +++ b/openhands/runtime/browser/browser_env.py @@ -147,7 +147,9 @@ def browser_process(self): html_str = flatten_dom_to_str(obs['dom_object']) obs['text_content'] = self.html_text_converter.handle(html_str) # make observation serializable - obs['screenshot'] = self.image_to_png_base64_url(obs['screenshot']) + obs['screenshot'] = self.image_to_png_base64_url( + obs['screenshot'], add_data_prefix=True + ) obs['active_page_index'] = obs['active_page_index'].item() obs['elapsed_time'] = obs['elapsed_time'].item() self.browser_side.send((unique_request_id, obs)) From 002a2d82333458a273f558d1968c0afd38034f38 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Fri, 29 Nov 2024 07:57:26 +0000 Subject: [PATCH 05/14] wire up with fn calling --- .../codeact_agent/function_calling.py | 17 +++++++- openhands/runtime/browser/utils.py | 21 +++++----- tests/unit/test_action_transformer.py | 40 +++++++++---------- 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index 562b81f96291..ff62aeca3b85 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -544,6 +544,20 @@ def response_to_actions(response: ModelResponse) -> list[Action]: action = IPythonRunCellAction(code=code, include_extra=False) elif tool_call.function.name == 'browser': action = BrowseInteractiveAction(browser_actions=arguments['code']) + elif tool_call.function.name == 'gui_use': + arg_action = arguments['action'] + # arguments is a python object, so need to consider when the property is not present + arg_coordinate = arguments.get('coordinate') + arg_text = arguments.get('text') + + browser_action = f'{arg_action}(' + if arg_coordinate: + browser_action += f'coordinate={arg_coordinate}, ' + if arg_text: + browser_action += f'text="{arg_text}", ' + browser_action += ')' + browser_action = f'gui_use({browser_action})' + action = BrowseInteractiveAction(browser_actions=browser_action) else: raise FunctionCallNotExistsError( f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.' @@ -576,7 +590,8 @@ def get_tools( ) -> list[ChatCompletionToolParam]: tools = [CmdRunTool, FinishTool] if codeact_enable_browsing: - tools.append(BrowserTool) + # tools.append(BrowserTool) + tools.append(GUIUseTool) if codeact_enable_jupyter: tools.append(IPythonTool) if codeact_enable_llm_editor: diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index f98cfcd12ccd..0adcd58ee582 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -1,5 +1,4 @@ import os -from typing import get_args from openhands.core.exceptions import BrowserUnavailableException from openhands.core.schema import ActionType @@ -9,7 +8,6 @@ from openhands.runtime.browser.transformer import ( translate_computer_use_action_to_browsergym_action, ) -from openhands.runtime.browser.types import ComputerUseAction async def browse( @@ -28,17 +26,20 @@ async def browse( action_str = f'goto("{asked_url}")' elif isinstance(action, BrowseInteractiveAction): - # received action_str defined by Anthropic's Computer Use feature: see https://docs.anthropic.com/en/docs/build-with-claude/computer-use#computer-tool _action_str = action.browser_actions - if _action_str not in get_args(ComputerUseAction): - raise ValueError(f'Invalid action: {_action_str}') + if _action_str.startswith('gui_use'): + # received action_str defined by Anthropic's Computer Use feature: see https://docs.anthropic.com/en/docs/build-with-claude/computer-use#computer-tool + _action_str = _action_str[8:-1] - # translate to BrowserGym actions - # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py - action_str = translate_computer_use_action_to_browsergym_action( - _action_str, last_obs - ) + # translate to BrowserGym actions + # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py + action_str = translate_computer_use_action_to_browsergym_action( + _action_str, last_obs + ) + else: + # received normal BrowserGym action + action_str = _action_str else: raise ValueError(f'Invalid action type: {action.action}') diff --git a/tests/unit/test_action_transformer.py b/tests/unit/test_action_transformer.py index 9b9b7da4917e..400d6ea12a63 100644 --- a/tests/unit/test_action_transformer.py +++ b/tests/unit/test_action_transformer.py @@ -17,64 +17,64 @@ def last_obs(): def test_keyboard_type(last_obs): - code = """result = type(text="Hello, World!")""" - expected = "result = keyboard_type(key='Hello, World!')\n" + code = """type(text="Hello, World!")""" + expected = "keyboard_type(key='Hello, World!')\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) def test_mouse_move(last_obs): - code = """result = mouse_move(coordinate=(100, 200))""" - expected = 'result = mouse_move(to_x=100, to_y=200, from_x=50, from_y=100)\n' + code = """mouse_move(coordinate=(100, 200))""" + expected = 'mouse_move(to_x=100, to_y=200, from_x=50, from_y=100)\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) def test_left_click(last_obs): - code = """result = left_click()""" - expected = "result = mouse_click(button='left', x=50, y=100)\n" + code = """left_click()""" + expected = "mouse_click(button='left', x=50, y=100)\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) def test_right_click(last_obs): - code = """result = right_click()""" - expected = "result = mouse_click(button='right', x=50, y=100)\n" + code = """right_click()""" + expected = "mouse_click(button='right', x=50, y=100)\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) def test_middle_click(last_obs): - code = """result = middle_click()""" - expected = "result = mouse_click(button='middle', x=50, y=100)\n" + code = """middle_click()""" + expected = "mouse_click(button='middle', x=50, y=100)\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) def test_double_click(last_obs): - code = """result = double_click()""" - expected = "result = mouse_dblclick(button='left', x=50, y=100)\n" + code = """double_click()""" + expected = "mouse_dblclick(button='left', x=50, y=100)\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) def test_screenshot(last_obs): - code = """result = screenshot()""" - expected = 'result = noop()\n' + code = """screenshot()""" + expected = 'noop()\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) def test_cursor_position(last_obs): - code = """result = cursor_position()""" - expected = 'result = noop()\n' + code = """cursor_position()""" + expected = 'noop()\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -87,8 +87,8 @@ def test_missing_mouse_position(): screenshot='screenshot', mouse_position=None, ) - code = """result = mouse_move(coordinate=(100, 200))""" - expected = 'result = mouse_move(to_x=100, to_y=200, from_x=0, from_y=0)\n' + code = """mouse_move(coordinate=(100, 200))""" + expected = 'mouse_move(to_x=100, to_y=200, from_x=0, from_y=0)\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -101,8 +101,8 @@ def test_empty_mouse_position(): screenshot='screenshot', mouse_position=[], ) - code = """result = mouse_move(coordinate=(100, 200))""" - expected = 'result = mouse_move(to_x=100, to_y=200, from_x=0, from_y=0)\n' + code = """mouse_move(coordinate=(100, 200))""" + expected = 'mouse_move(to_x=100, to_y=200, from_x=0, from_y=0)\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) From bbdde561320f34abca37897de8f3550256db4df4 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Fri, 29 Nov 2024 08:13:17 +0000 Subject: [PATCH 06/14] simplify wiring up --- .../agenthub/codeact_agent/function_calling.py | 17 ++++------------- openhands/events/action/browse.py | 1 + openhands/runtime/browser/gui_use.py | 0 openhands/runtime/browser/utils.py | 10 ++++++++-- 4 files changed, 13 insertions(+), 15 deletions(-) create mode 100644 openhands/runtime/browser/gui_use.py diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index ff62aeca3b85..b794742449dc 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -545,19 +545,10 @@ def response_to_actions(response: ModelResponse) -> list[Action]: elif tool_call.function.name == 'browser': action = BrowseInteractiveAction(browser_actions=arguments['code']) elif tool_call.function.name == 'gui_use': - arg_action = arguments['action'] - # arguments is a python object, so need to consider when the property is not present - arg_coordinate = arguments.get('coordinate') - arg_text = arguments.get('text') - - browser_action = f'{arg_action}(' - if arg_coordinate: - browser_action += f'coordinate={arg_coordinate}, ' - if arg_text: - browser_action += f'text="{arg_text}", ' - browser_action += ')' - browser_action = f'gui_use({browser_action})' - action = BrowseInteractiveAction(browser_actions=browser_action) + browser_action = 'gui_use' + action = BrowseInteractiveAction( + browser_actions=browser_action, extra_args=arguments + ) else: raise FunctionCallNotExistsError( f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.' diff --git a/openhands/events/action/browse.py b/openhands/events/action/browse.py index 41816216d6d5..6fea072a0226 100644 --- a/openhands/events/action/browse.py +++ b/openhands/events/action/browse.py @@ -33,6 +33,7 @@ class BrowseInteractiveAction(Action): action: str = ActionType.BROWSE_INTERACTIVE runnable: ClassVar[bool] = True security_risk: ActionSecurityRisk | None = None + extra_args: dict | None = None @property def message(self) -> str: diff --git a/openhands/runtime/browser/gui_use.py b/openhands/runtime/browser/gui_use.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index 0adcd58ee582..a20e99f4406d 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -28,9 +28,15 @@ async def browse( elif isinstance(action, BrowseInteractiveAction): _action_str = action.browser_actions - if _action_str.startswith('gui_use'): + if _action_str == 'gui_use': # received action_str defined by Anthropic's Computer Use feature: see https://docs.anthropic.com/en/docs/build-with-claude/computer-use#computer-tool - _action_str = _action_str[8:-1] + extra_args = action.extra_args + + # TODO: perform argument validation on extra_args + assert extra_args is not None + + # construct a computer use action + _action_str = f'{extra_args["action"]}({", ".join([f"{k}={v}" for k, v in extra_args.items() if k != "action"])})' # translate to BrowserGym actions # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py From 135550cf79f720f18cfeb7f131ef5391f6257cbc Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Fri, 29 Nov 2024 08:16:02 +0000 Subject: [PATCH 07/14] restructure --- openhands/runtime/browser/{ => gui_use}/gui_use.py | 0 openhands/runtime/browser/{ => gui_use}/types.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename openhands/runtime/browser/{ => gui_use}/gui_use.py (100%) rename openhands/runtime/browser/{ => gui_use}/types.py (93%) diff --git a/openhands/runtime/browser/gui_use.py b/openhands/runtime/browser/gui_use/gui_use.py similarity index 100% rename from openhands/runtime/browser/gui_use.py rename to openhands/runtime/browser/gui_use/gui_use.py diff --git a/openhands/runtime/browser/types.py b/openhands/runtime/browser/gui_use/types.py similarity index 93% rename from openhands/runtime/browser/types.py rename to openhands/runtime/browser/gui_use/types.py index 03925c770b14..dacdb83c7447 100644 --- a/openhands/runtime/browser/types.py +++ b/openhands/runtime/browser/gui_use/types.py @@ -22,5 +22,5 @@ class ScalingSource(StrEnum): 'middle_click', # middle click --> mouse_click 'double_click', # double left click --> mouse_dblclick 'screenshot', # take a screenshot --> noop - 'cursor_position', # get cursor position --> + 'cursor_position', # get cursor position --> noop ] From 36670fe364232cbd8e035940d31f30d4df766a97 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Mon, 2 Dec 2024 04:46:36 +0000 Subject: [PATCH 08/14] fix litellm --- poetry.lock | 26 +++++++++++++++----------- pyproject.toml | 2 +- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/poetry.lock b/poetry.lock index cf67faff0388..d01a1c7a87f5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3970,25 +3970,23 @@ types-tqdm = "*" [[package]] name = "litellm" -version = "1.52.15" +version = "1.53.1" description = "Library to easily interface with LLM API providers" optional = false -python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8" -files = [ - {file = "litellm-1.52.15-py3-none-any.whl", hash = "sha256:8a2d8e2526c5e7afb3006b0214d3c348778462fefafd582fd76bb7f5c35d28d0"}, - {file = "litellm-1.52.15.tar.gz", hash = "sha256:11a61b1b033ddff9d480da66c00acc9d3e4fbfeed166d1b0de8eda16c684116e"}, -] +python-versions = ">=3.8.1,<4.0, !=3.9.7" +files = [] +develop = false [package.dependencies] aiohttp = "*" click = "*" importlib-metadata = ">=6.8.0" -jinja2 = ">=3.1.2,<4.0.0" -jsonschema = ">=4.22.0,<5.0.0" +jinja2 = "^3.1.2" +jsonschema = "^4.22.0" openai = ">=1.54.0" -pydantic = ">=2.0.0,<3.0.0" +pydantic = "^2.0.0" python-dotenv = ">=0.2.0" -requests = ">=2.31.0,<3.0.0" +requests = "^2.31.0" tiktoken = ">=0.7.0" tokenizers = "*" @@ -3996,6 +3994,12 @@ tokenizers = "*" extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"] proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"] +[package.source] +type = "git" +url = "https://github.com/BerriAI/litellm.git" +reference = "litellm_dev_11_29_2024" +resolved_reference = "8f9cc0d9a4164899c9f60bb586c64d769b121211" + [[package]] name = "llama-cloud" version = "0.1.5" @@ -10361,4 +10365,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "8e57949c60fdd06204215174c801246c9d4c724c8aa58d68583a4d8e6999a3f6" +content-hash = "2f235c42ffac541765ed7380c05bedcfb8feb6c3d580fb8e229eb9544f7936de" diff --git a/pyproject.toml b/pyproject.toml index 89e951b20bd6..6b03ad92eb40 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ packages = [ python = "^3.12" datasets = "*" pandas = "*" -litellm = "^1.52.3" +litellm = {git = "https://github.com/BerriAI/litellm.git", rev = "litellm_dev_11_29_2024"} google-generativeai = "*" # To use litellm with Gemini Pro API google-api-python-client = "*" # For Google Sheets API google-auth-httplib2 = "*" # For Google Sheets authentication From 38eefce28fe32b2a36f9ba1e0863167587053689 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Mon, 2 Dec 2024 09:14:54 +0000 Subject: [PATCH 09/14] add args validation, coordinate scaling, image resizing --- containers/app/Dockerfile | 2 +- .../agenthub/codeact_agent/codeact_agent.py | 5 +- openhands/runtime/action_execution_server.py | 6 +- openhands/runtime/browser/gui_use/gui_use.py | 145 ++++++++++++++++++ openhands/runtime/browser/gui_use/types.py | 9 ++ openhands/runtime/browser/utils.py | 36 ++++- 6 files changed, 191 insertions(+), 12 deletions(-) diff --git a/containers/app/Dockerfile b/containers/app/Dockerfile index 266a9d6b3e40..afb1a2bcc9a5 100644 --- a/containers/app/Dockerfile +++ b/containers/app/Dockerfile @@ -21,7 +21,7 @@ ENV POETRY_NO_INTERACTION=1 \ POETRY_CACHE_DIR=/tmp/poetry_cache RUN apt-get update -y \ - && apt-get install -y curl make git build-essential \ + && apt-get install -y curl make git build-essential imagemagick \ && python3 -m pip install poetry==1.8.2 --break-system-packages COPY ./pyproject.toml ./poetry.lock ./ diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 6e6bfa54e2bf..e6b129b4c463 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -265,8 +265,9 @@ def get_observation_message( message = Message( role='user', content=[ - # TextContent(text=text), - ImageContent(image_urls=[obs.screenshot]), + TextContent(text=obs.last_browser_action_error) + if obs.error + else ImageContent(image_urls=[obs.screenshot]), ], ) elif isinstance(obs, AgentDelegateObservation): diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py index 4a6c7e069665..6d985ef9166a 100644 --- a/openhands/runtime/action_execution_server.py +++ b/openhands/runtime/action_execution_server.py @@ -48,6 +48,7 @@ from openhands.events.serialization import event_from_dict, event_to_dict from openhands.runtime.browser import browse from openhands.runtime.browser.browser_env import BrowserEnv +from openhands.runtime.browser.gui_use.gui_use import GUIUseTool from openhands.runtime.plugins import ALL_PLUGINS, JupyterPlugin, Plugin, VSCodePlugin from openhands.runtime.utils.bash import BashSession from openhands.runtime.utils.files import insert_lines, read_lines @@ -110,6 +111,7 @@ def __init__( self.start_time = time.time() self.last_execution_time = self.start_time self.last_browser_output_observation: BrowserOutputObservation | None = None + self.gui_use = GUIUseTool() @property def initial_pwd(self): @@ -323,14 +325,14 @@ async def write(self, action: FileWriteAction) -> Observation: async def browse(self, action: BrowseURLAction) -> Observation: browser_obs = await browse( - action, self.browser, self.last_browser_output_observation + action, self.browser, self.gui_use, self.last_browser_output_observation ) self.last_browser_output_observation = browser_obs return browser_obs async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation: browser_obs = await browse( - action, self.browser, self.last_browser_output_observation + action, self.browser, self.gui_use, self.last_browser_output_observation ) self.last_browser_output_observation = browser_obs return browser_obs diff --git a/openhands/runtime/browser/gui_use/gui_use.py b/openhands/runtime/browser/gui_use/gui_use.py index e69de29bb2d1..62e6f09cacc4 100644 --- a/openhands/runtime/browser/gui_use/gui_use.py +++ b/openhands/runtime/browser/gui_use/gui_use.py @@ -0,0 +1,145 @@ +import base64 +import os +from pathlib import Path +from uuid import uuid4 + +from openhands_aci.editor.exceptions import ToolError +from openhands_aci.utils.shell import run_shell_cmd + +from openhands.runtime.browser.gui_use.types import ( + MAX_SCALING_TARGETS, + OUTPUT_DIR, + ScalingSource, +) +from openhands.runtime.browser.gui_use.types import ComputerUseAction as Action + + +class GUIUseTool: + """ + A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer. + The tool parameters are defined by Anthropic and are not editable. + + Original implementation: https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/tools/computer.py + """ + + TOOL_NAME = 'gui_use' + _scaling_enabled = True + + width: int # Screen width + height: int # Screen height + + def __init__(self): + self.width = int(os.getenv('WIDTH') or 0) + self.height = int(os.getenv('HEIGHT') or 0) + assert self.width and self.height, 'WIDTH, HEIGHT must be set' + + def validate_and_transform_args( + self, + *, + action: Action, + text: str | None = None, + coordinate: tuple[int, int] | None = None, + **kwargs, + ) -> dict[str, str | tuple[int, int] | None]: + if action in ('mouse_move', 'left_click_drag'): + if coordinate is None: + raise ToolError(f'coordinate is required for {action}') + if text is not None: + raise ToolError(f'text is not accepted for {action}') + if not isinstance(coordinate, tuple) or len(coordinate) != 2: + raise ToolError(f'{coordinate} must be a tuple of length 2') + if not all(isinstance(i, int) and i >= 0 for i in coordinate): + raise ToolError(f'{coordinate} must be a tuple of non-negative ints') + + x, y = self.scale_coordinates( + ScalingSource.API, coordinate[0], coordinate[1] + ) + + return { + 'action': action, + 'coordinate': (x, y), + 'text': text, + } + + if action in ('key', 'type'): + if text is None: + raise ToolError(f'text is required for {action}') + if coordinate is not None: + raise ToolError(f'coordinate is not accepted for {action}') + if not isinstance(text, str): + raise ToolError(output=f'{text} must be a string') + + return { + 'action': action, + 'coordinate': coordinate, + 'text': text, + } + + if action in ( + 'left_click', + 'right_click', + 'double_click', + 'middle_click', + 'screenshot', + 'cursor_position', + ): + if text is not None: + raise ToolError(f'text is not accepted for {action}') + if coordinate is not None: + raise ToolError(f'coordinate is not accepted for {action}') + + return { + 'action': action, + 'coordinate': coordinate, + 'text': text, + } + + raise ToolError(f'Invalid action: {action}') + + def scale_coordinates( + self, source: ScalingSource, x: int, y: int + ) -> tuple[int, int]: + """Scale coordinates to a target maximum resolution.""" + if not self._scaling_enabled: + return x, y + ratio = self.width / self.height + target_dimension = None + for dimension in MAX_SCALING_TARGETS.values(): + # allow some error in the aspect ratio - not ratios are exactly 16:9 + if abs(dimension['width'] / dimension['height'] - ratio) < 0.02: + if dimension['width'] < self.width: + target_dimension = dimension + break + if target_dimension is None: + return x, y + # should be less than 1 + x_scaling_factor = target_dimension['width'] / self.width + y_scaling_factor = target_dimension['height'] / self.height + if source == ScalingSource.API: + if x > self.width or y > self.height: + raise ToolError(f'Coordinates {x}, {y} are out of bounds') + # scale up + return round(x / x_scaling_factor), round(y / y_scaling_factor) + # scale down + return round(x * x_scaling_factor), round(y * y_scaling_factor) + + def resize_image(self, base64_image: str) -> str: + output_dir = Path(OUTPUT_DIR) + output_dir.mkdir(parents=True, exist_ok=True) + path = output_dir / f'screenshot_{uuid4().hex}.png' + + # Write the base64 image to a file + with open(path, 'wb') as f: + f.write(base64.b64decode(base64_image)) + + if self._scaling_enabled: + x, y = self.scale_coordinates( + ScalingSource.COMPUTER, self.width, self.height + ) + # Resize the image + run_shell_cmd(f'convert {path} -resize {x}x{y}! {path}') + + if path.exists(): + return base64.b64encode(path.read_bytes()).decode() + + raise ToolError(f'Failed to resize image: {path}') diff --git a/openhands/runtime/browser/gui_use/types.py b/openhands/runtime/browser/gui_use/types.py index dacdb83c7447..6e3a87391ddf 100644 --- a/openhands/runtime/browser/gui_use/types.py +++ b/openhands/runtime/browser/gui_use/types.py @@ -1,12 +1,21 @@ from enum import StrEnum from typing import Literal, TypedDict +OUTPUT_DIR = '/tmp/outputs' + class Resolution(TypedDict): width: int height: int +MAX_SCALING_TARGETS: dict[str, Resolution] = { + 'XGA': Resolution(width=1024, height=768), # 4:3 + 'WXGA': Resolution(width=1280, height=800), # 16:10 + 'FWXGA': Resolution(width=1366, height=768), # ~16:9 +} + + class ScalingSource(StrEnum): COMPUTER = 'computer' API = 'api' diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index abbb161833a4..47ab6692937e 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -1,10 +1,14 @@ import os +from openhands_aci.editor.exceptions import ToolError + from openhands.core.exceptions import BrowserUnavailableException from openhands.core.schema import ActionType from openhands.events.action import BrowseInteractiveAction, BrowseURLAction from openhands.events.observation import BrowserOutputObservation from openhands.runtime.browser.browser_env import BrowserEnv +from openhands.runtime.browser.gui_use.gui_use import GUIUseTool +from openhands.runtime.browser.gui_use.types import ScalingSource from openhands.runtime.browser.transformer import ( translate_computer_use_action_to_browsergym_action, ) @@ -13,6 +17,7 @@ async def browse( action: BrowseURLAction | BrowseInteractiveAction, browser: BrowserEnv | None, + gui_use: GUIUseTool, last_obs: BrowserOutputObservation | None, ) -> BrowserOutputObservation: if browser is None: @@ -32,11 +37,22 @@ async def browse( # received action_str defined by Anthropic's Computer Use feature: see https://docs.anthropic.com/en/docs/build-with-claude/computer-use#computer-tool extra_args = action.extra_args - # TODO: perform argument validation on extra_args - assert extra_args is not None + try: + validated_args = gui_use.validate_and_transform_args( + **(extra_args or {}) + ) + except ToolError as e: + return BrowserOutputObservation( + content=f'ERROR:\n{e.message}', + screenshot='', + error=True, + last_browser_action_error=f'ERROR:\n{e.message}', + url=asked_url if action.action == ActionType.BROWSE else '', + trigger_by_action=action.action, + ) # construct a computer use action - _action_str = f'{extra_args["action"]}({", ".join([f"{k}={v}" for k, v in extra_args.items() if k != "action"])})' + _action_str = f'{validated_args["action"]}({", ".join([f"{k}={v}" for k, v in validated_args.items() if k != "action"])})' # translate to BrowserGym actions # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py @@ -52,10 +68,16 @@ async def browse( try: # obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396 obs = browser.step(action_str) + mouse_position = obs.get('mouse_position', [0, 0]) + scaled_mouse_position = gui_use.scale_coordinates( + ScalingSource.COMPUTER, int(mouse_position[0]), int(mouse_position[1]) + ) return BrowserOutputObservation( content=obs['text_content'], # text content of the page url=obs.get('url', ''), # URL of the page - screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png + screenshot=gui_use.resize_image( + obs.get('screenshot', None) + ), # base64-encoded screenshot, png open_pages_urls=obs.get('open_pages_urls', []), # list of open pages active_page_index=obs.get( 'active_page_index', -1 @@ -72,14 +94,14 @@ async def browse( last_browser_action_error=obs.get('last_action_error', ''), error=True if obs.get('last_action_error', '') else False, # error flag trigger_by_action=action.action, - mouse_position=obs.get('mouse_position', []), # mouse position + mouse_position=[scaled_mouse_position[0], scaled_mouse_position[1]], ) except Exception as e: return BrowserOutputObservation( - content=str(e), + content=f'ERROR:\n{str(e)}', screenshot='', error=True, - last_browser_action_error=str(e), + last_browser_action_error=f'ERROR:\n{str(e)}', url=asked_url if action.action == ActionType.BROWSE else '', trigger_by_action=action.action, ) From e1778059e621c25c0b838e5a5d0a037a4034f49c Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Tue, 3 Dec 2024 07:25:25 +0000 Subject: [PATCH 10/14] fix bugs --- .../codeact_agent/function_calling.py | 24 ++++++++++-------- openhands/runtime/browser/gui_use/gui_use.py | 19 ++++++++------ openhands/runtime/browser/gui_use/types.py | 1 + openhands/runtime/browser/transformer.py | 25 ++++++++----------- openhands/runtime/browser/utils.py | 7 +++++- tests/unit/test_action_transformer.py | 9 ++++--- 6 files changed, 49 insertions(+), 36 deletions(-) diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index ebcc12104f17..8ba7d26a528b 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -444,13 +444,13 @@ def __init__(self): ), ) -_GUI_USE_TOOL_DESCRIPTION = """Use a mouse and keyboard to interact with a computer, and take screenshots. -* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications. -* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try taking another screenshot. -* The screen's resolution is {{ display_width_px }}x{{ display_height_px }}. -* The display number is {{ display_number }} -* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor. -* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click. +_GUI_USE_TOOL_DESCRIPTION = """Use a mouse and keyboard to navigate websites, interact with it, and take screenshots. +* This is an interface to a web browser GUI environment. +* Always use `goto` to navigate to a URL before interacting with the page. +* Some web pages may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on a button and nothing happens, try taking another screenshot. +* The screen's resolution is 1280x720. +* Whenever you intend to move the cursor to click on an element like a button or a form field, you should consult a screenshot to determine the coordinates of the element before moving the cursor. +* If you tried clicking on a button or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click. * Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked. """ @@ -464,9 +464,11 @@ def __init__(self): 'properties': { 'action': { 'description': """The action to perform. The available actions are: +* `goto`: Navigate to a URL. * `key`: Press a key or key-combination on the keyboard. - - This supports xdotool's `key` syntax. - - Examples: "a", "Return", "alt+Tab", "ctrl+s", "Up", "KP_0" (for the numpad 0 key). + - Press a combination of keys. Accepts the logical key names. + - Examples: "Backquote", "Minus", "Equal", "Backslash", "Backspace", "Tab", "Delete", "Escape", "ArrowDown", "End", "Enter", "Home", "Insert", "PageDown", "PageUp", "ArrowRight", "ArrowUp", "F1" - F12, "Digit0" - Digit9, "KeyA" - KeyZ, etc. + - Can alternatively specify a single character to produce such as "a" or "#". Following modification shortcuts are also supported: "Shift", "Control", "Alt", "Meta", "ShiftLeft", "ControlOrMeta". "ControlOrMeta" resolves to Control on Windows and Linux and to Meta on macOS. * `type`: Type a string of text on the keyboard. * `cursor_position`: Get the current (x, y) pixel coordinate of the cursor on the screen. * `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen. @@ -495,7 +497,7 @@ def __init__(self): 'type': 'array', }, 'text': { - 'description': 'Required only by `action=type` and `action=key`.', + 'description': 'Required only by `action=type`, `action=key` and `action=url`.', 'type': 'string', }, }, @@ -608,7 +610,7 @@ def get_tools( ) -> list[ChatCompletionToolParam]: tools = [CmdRunTool, FinishTool] if codeact_enable_browsing: - tools.append(WebReadTool) + # tools.append(WebReadTool) # tools.append(BrowserTool) tools.append(GUIUseTool) if codeact_enable_jupyter: diff --git a/openhands/runtime/browser/gui_use/gui_use.py b/openhands/runtime/browser/gui_use/gui_use.py index 62e6f09cacc4..cff370795b8f 100644 --- a/openhands/runtime/browser/gui_use/gui_use.py +++ b/openhands/runtime/browser/gui_use/gui_use.py @@ -1,5 +1,4 @@ import base64 -import os from pathlib import Path from uuid import uuid4 @@ -29,8 +28,10 @@ class GUIUseTool: height: int # Screen height def __init__(self): - self.width = int(os.getenv('WIDTH') or 0) - self.height = int(os.getenv('HEIGHT') or 0) + # self.width = int(os.getenv('WIDTH') or 0) + # self.height = int(os.getenv('HEIGHT') or 0) + self.width = 1280 + self.height = 720 assert self.width and self.height, 'WIDTH, HEIGHT must be set' def validate_and_transform_args( @@ -38,7 +39,7 @@ def validate_and_transform_args( *, action: Action, text: str | None = None, - coordinate: tuple[int, int] | None = None, + coordinate: list[int] | None = None, **kwargs, ) -> dict[str, str | tuple[int, int] | None]: if action in ('mouse_move', 'left_click_drag'): @@ -46,7 +47,7 @@ def validate_and_transform_args( raise ToolError(f'coordinate is required for {action}') if text is not None: raise ToolError(f'text is not accepted for {action}') - if not isinstance(coordinate, tuple) or len(coordinate) != 2: + if not isinstance(coordinate, list) or len(coordinate) != 2: raise ToolError(f'{coordinate} must be a tuple of length 2') if not all(isinstance(i, int) and i >= 0 for i in coordinate): raise ToolError(f'{coordinate} must be a tuple of non-negative ints') @@ -61,7 +62,7 @@ def validate_and_transform_args( 'text': text, } - if action in ('key', 'type'): + if action in ('key', 'type', 'goto'): if text is None: raise ToolError(f'text is required for {action}') if coordinate is not None: @@ -124,6 +125,10 @@ def scale_coordinates( return round(x * x_scaling_factor), round(y * y_scaling_factor) def resize_image(self, base64_image: str) -> str: + data_prefix = 'data:image/png;base64,' + if base64_image.startswith('data:image/png;base64,'): + base64_image = base64_image[len(data_prefix) :] + output_dir = Path(OUTPUT_DIR) output_dir.mkdir(parents=True, exist_ok=True) path = output_dir / f'screenshot_{uuid4().hex}.png' @@ -140,6 +145,6 @@ def resize_image(self, base64_image: str) -> str: run_shell_cmd(f'convert {path} -resize {x}x{y}! {path}') if path.exists(): - return base64.b64encode(path.read_bytes()).decode() + return data_prefix + base64.b64encode(path.read_bytes()).decode() raise ToolError(f'Failed to resize image: {path}') diff --git a/openhands/runtime/browser/gui_use/types.py b/openhands/runtime/browser/gui_use/types.py index 6e3a87391ddf..a2947d9d6bd3 100644 --- a/openhands/runtime/browser/gui_use/types.py +++ b/openhands/runtime/browser/gui_use/types.py @@ -22,6 +22,7 @@ class ScalingSource(StrEnum): ComputerUseAction = Literal[ + 'goto', # go to a URL --> goto 'type', # type sequence --> keyboard_type 'key', # press a key or key comb --> keyboard_press 'mouse_move', # move mouse to a position --> mouse_move diff --git a/openhands/runtime/browser/transformer.py b/openhands/runtime/browser/transformer.py index b6a57330e712..07aeacebdbe1 100644 --- a/openhands/runtime/browser/transformer.py +++ b/openhands/runtime/browser/transformer.py @@ -1,4 +1,5 @@ import ast +from functools import partial import astor @@ -42,10 +43,10 @@ def visit_Call(self, node): return self.generic_visit(node) -def coordinate_split(arg_node): +def coordinate_split(arg_node, x_name='to_x', y_name='to_y'): if isinstance(arg_node.value, ast.Tuple) and len(arg_node.value.elts) == 2: - x_arg = ast.keyword(arg='to_x', value=arg_node.value.elts[0]) - y_arg = ast.keyword(arg='to_y', value=arg_node.value.elts[1]) + x_arg = ast.keyword(arg=x_name, value=arg_node.value.elts[0]) + y_arg = ast.keyword(arg=y_name, value=arg_node.value.elts[1]) return [x_arg, y_arg] return [] @@ -66,6 +67,10 @@ def translate_computer_use_action_to_browsergym_action( last_mouse_position = [0, 0] mapping = { + 'goto': { + 'target_func': 'goto', + 'arg_transform': {'text': rename_argument('url')}, + }, 'type': { 'target_func': 'keyboard_type', 'arg_transform': {'text': rename_argument('key')}, @@ -76,17 +81,9 @@ def translate_computer_use_action_to_browsergym_action( }, 'mouse_move': { 'target_func': 'mouse_move', - 'arg_transform': {'coordinate': coordinate_split}, - 'extra_args': [ - { - 'name': 'from_x', - 'value': last_mouse_position[0], - }, - { - 'name': 'from_y', - 'value': last_mouse_position[1], - }, - ], + 'arg_transform': { + 'coordinate': partial(coordinate_split, x_name='x', y_name='y') + }, }, 'left_click_drag': { 'target_func': 'mouse_drag_and_drop', diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index 47ab6692937e..a5aa1fd80b63 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -52,7 +52,12 @@ async def browse( ) # construct a computer use action - _action_str = f'{validated_args["action"]}({", ".join([f"{k}={v}" for k, v in validated_args.items() if k != "action"])})' + _action_str = f'{validated_args["action"]}(' + if validated_args.get('coordinate'): + _action_str += f'coordinate={validated_args["coordinate"]}' + if validated_args.get('text'): + _action_str += f'text="{validated_args["text"]}"' + _action_str += ')' # translate to BrowserGym actions # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py diff --git a/tests/unit/test_action_transformer.py b/tests/unit/test_action_transformer.py index 400d6ea12a63..8184d55d2ab8 100644 --- a/tests/unit/test_action_transformer.py +++ b/tests/unit/test_action_transformer.py @@ -13,6 +13,7 @@ def last_obs(): url='https://example.com', screenshot='screenshot', mouse_position=[50, 100], + trigger_by_action='BROWSE', ) @@ -26,7 +27,7 @@ def test_keyboard_type(last_obs): def test_mouse_move(last_obs): code = """mouse_move(coordinate=(100, 200))""" - expected = 'mouse_move(to_x=100, to_y=200, from_x=50, from_y=100)\n' + expected = 'mouse_move(x=100, y=200)\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -86,9 +87,10 @@ def test_missing_mouse_position(): url='https://example.com', screenshot='screenshot', mouse_position=None, + trigger_by_action='BROWSE', ) code = """mouse_move(coordinate=(100, 200))""" - expected = 'mouse_move(to_x=100, to_y=200, from_x=0, from_y=0)\n' + expected = 'mouse_move(x=100, y=200)\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -100,9 +102,10 @@ def test_empty_mouse_position(): url='https://example.com', screenshot='screenshot', mouse_position=[], + trigger_by_action='BROWSE', ) code = """mouse_move(coordinate=(100, 200))""" - expected = 'mouse_move(to_x=100, to_y=200, from_x=0, from_y=0)\n' + expected = 'mouse_move(x=100, y=200)\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) From ca3547ceb1f13bf1172aacb2dd4cda429a617616 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Mon, 9 Dec 2024 13:51:01 +0000 Subject: [PATCH 11/14] use positional args & fix action not recognized --- .../codeact_agent/function_calling.py | 16 +++---- openhands/runtime/browser/browser_env.py | 4 ++ openhands/runtime/browser/transformer.py | 47 ++++++++----------- tests/unit/test_action_transformer.py | 16 +++---- 4 files changed, 39 insertions(+), 44 deletions(-) diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index 8ba7d26a528b..0c6cc60b01ee 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -293,7 +293,7 @@ def __init__(self): # from browsergym/core/action/highlevel.py _browser_action_space = HighLevelActionSet( - subsets=['bid', 'nav'], + subsets=['bid', 'nav', 'coord'], strict=False, # less strict on the parsing of the actions multiaction=True, # enable to agent to take multiple actions at once ) @@ -415,13 +415,13 @@ def __init__(self): """ -for _, action in _browser_action_space.action_set.items(): - assert ( - action.signature in _BROWSER_TOOL_DESCRIPTION - ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}' - assert ( - action.description in _BROWSER_TOOL_DESCRIPTION - ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}' +# for _, action in _browser_action_space.action_set.items(): +# assert ( +# action.signature in _BROWSER_TOOL_DESCRIPTION +# ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}' +# assert ( +# action.description in _BROWSER_TOOL_DESCRIPTION +# ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}' BrowserTool = ChatCompletionToolParam( type='function', diff --git a/openhands/runtime/browser/browser_env.py b/openhands/runtime/browser/browser_env.py index efdabfa6a045..67a060ab37dd 100644 --- a/openhands/runtime/browser/browser_env.py +++ b/openhands/runtime/browser/browser_env.py @@ -11,6 +11,7 @@ import html2text import numpy as np import tenacity +from browsergym.core.action.highlevel import HighLevelActionSet from browsergym.utils.obs import flatten_dom_to_str from PIL import Image @@ -95,6 +96,9 @@ def browser_process(self): headless=True, disable_env_checker=True, tags_to_mark='all', + action_mapping=HighLevelActionSet( + subsets=['chat', 'infeas', 'bid', 'nav', 'tab', 'coord'] + ).to_python_code, ) obs, info = env.reset() diff --git a/openhands/runtime/browser/transformer.py b/openhands/runtime/browser/transformer.py index 07aeacebdbe1..a2c0dc624f64 100644 --- a/openhands/runtime/browser/transformer.py +++ b/openhands/runtime/browser/transformer.py @@ -23,38 +23,35 @@ def visit_Call(self, node): # Apply argument transformations if defined if arg_transform: - new_keywords = [] + new_args = [] for kw in node.keywords: if kw.arg in arg_transform: - new_keywords.extend(arg_transform[kw.arg](kw)) + new_args.extend(arg_transform[kw.arg](kw)) else: - new_keywords.append(kw) - node.keywords = new_keywords + # Append unnamed arguments from remaining keywords + new_args.append(kw.value) + node.args.extend(new_args) + node.keywords = [] # Clear keywords, as we're using unnamed args - # Add extra arguments + # Add extra arguments as unnamed arguments for extra_arg in extra_args: - node.keywords.append( - ast.keyword( - arg=extra_arg['name'], - value=ast.Constant(value=extra_arg['value']), - ) - ) + node.args.append(ast.Constant(value=extra_arg['value'])) return self.generic_visit(node) def coordinate_split(arg_node, x_name='to_x', y_name='to_y'): if isinstance(arg_node.value, ast.Tuple) and len(arg_node.value.elts) == 2: - x_arg = ast.keyword(arg=x_name, value=arg_node.value.elts[0]) - y_arg = ast.keyword(arg=y_name, value=arg_node.value.elts[1]) + x_arg = arg_node.value.elts[0] + y_arg = arg_node.value.elts[1] return [x_arg, y_arg] return [] -def rename_argument(new_name): +def rename_argument(_): def transformer(arg_node): - # Change the name of the argument - return [ast.keyword(arg=new_name, value=arg_node.value)] + # Change the argument into an unnamed argument + return [arg_node.value] return transformer @@ -89,46 +86,40 @@ def translate_computer_use_action_to_browsergym_action( 'target_func': 'mouse_drag_and_drop', 'arg_transform': {'coordinate': coordinate_split}, 'extra_args': [ - { - 'name': 'from_x', - 'value': last_mouse_position[0], - }, - { - 'name': 'from_y', - 'value': last_mouse_position[1], - }, + {'name': 'from_x', 'value': last_mouse_position[0]}, + {'name': 'from_y', 'value': last_mouse_position[1]}, ], }, 'left_click': { 'target_func': 'mouse_click', 'extra_args': [ - {'name': 'button', 'value': 'left'}, {'name': 'x', 'value': last_mouse_position[0]}, {'name': 'y', 'value': last_mouse_position[1]}, + {'name': 'button', 'value': 'left'}, ], }, 'right_click': { 'target_func': 'mouse_click', 'extra_args': [ - {'name': 'button', 'value': 'right'}, {'name': 'x', 'value': last_mouse_position[0]}, {'name': 'y', 'value': last_mouse_position[1]}, + {'name': 'button', 'value': 'right'}, ], }, 'middle_click': { 'target_func': 'mouse_click', 'extra_args': [ - {'name': 'button', 'value': 'middle'}, {'name': 'x', 'value': last_mouse_position[0]}, {'name': 'y', 'value': last_mouse_position[1]}, + {'name': 'button', 'value': 'middle'}, ], }, 'double_click': { 'target_func': 'mouse_dblclick', 'extra_args': [ - {'name': 'button', 'value': 'left'}, {'name': 'x', 'value': last_mouse_position[0]}, {'name': 'y', 'value': last_mouse_position[1]}, + {'name': 'button', 'value': 'left'}, ], }, 'screenshot': { diff --git a/tests/unit/test_action_transformer.py b/tests/unit/test_action_transformer.py index 8184d55d2ab8..2536a0b376bd 100644 --- a/tests/unit/test_action_transformer.py +++ b/tests/unit/test_action_transformer.py @@ -19,7 +19,7 @@ def last_obs(): def test_keyboard_type(last_obs): code = """type(text="Hello, World!")""" - expected = "keyboard_type(key='Hello, World!')\n" + expected = "keyboard_type('Hello, World!')\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -27,7 +27,7 @@ def test_keyboard_type(last_obs): def test_mouse_move(last_obs): code = """mouse_move(coordinate=(100, 200))""" - expected = 'mouse_move(x=100, y=200)\n' + expected = 'mouse_move(100, 200)\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -35,7 +35,7 @@ def test_mouse_move(last_obs): def test_left_click(last_obs): code = """left_click()""" - expected = "mouse_click(button='left', x=50, y=100)\n" + expected = "mouse_click(50, 100, 'left')\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -43,7 +43,7 @@ def test_left_click(last_obs): def test_right_click(last_obs): code = """right_click()""" - expected = "mouse_click(button='right', x=50, y=100)\n" + expected = "mouse_click(50, 100, 'right')\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -51,7 +51,7 @@ def test_right_click(last_obs): def test_middle_click(last_obs): code = """middle_click()""" - expected = "mouse_click(button='middle', x=50, y=100)\n" + expected = "mouse_click(50, 100, 'middle')\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -59,7 +59,7 @@ def test_middle_click(last_obs): def test_double_click(last_obs): code = """double_click()""" - expected = "mouse_dblclick(button='left', x=50, y=100)\n" + expected = "mouse_dblclick(50, 100, 'left')\n" assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -90,7 +90,7 @@ def test_missing_mouse_position(): trigger_by_action='BROWSE', ) code = """mouse_move(coordinate=(100, 200))""" - expected = 'mouse_move(x=100, y=200)\n' + expected = 'mouse_move(100, 200)\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) @@ -105,7 +105,7 @@ def test_empty_mouse_position(): trigger_by_action='BROWSE', ) code = """mouse_move(coordinate=(100, 200))""" - expected = 'mouse_move(x=100, y=200)\n' + expected = 'mouse_move(100, 200)\n' assert ( translate_computer_use_action_to_browsergym_action(code, last_obs) == expected ) From 32796a1c085436dffdb4bcaa64b61cf7f02ae756 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Mon, 9 Dec 2024 15:30:19 +0000 Subject: [PATCH 12/14] revert redundant code --- .../agenthub/codeact_agent/function_calling.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index 0c6cc60b01ee..7e5a7c60275a 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -293,7 +293,7 @@ def __init__(self): # from browsergym/core/action/highlevel.py _browser_action_space = HighLevelActionSet( - subsets=['bid', 'nav', 'coord'], + subsets=['bid', 'nav'], strict=False, # less strict on the parsing of the actions multiaction=True, # enable to agent to take multiple actions at once ) @@ -415,13 +415,13 @@ def __init__(self): """ -# for _, action in _browser_action_space.action_set.items(): -# assert ( -# action.signature in _BROWSER_TOOL_DESCRIPTION -# ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}' -# assert ( -# action.description in _BROWSER_TOOL_DESCRIPTION -# ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}' +for _, action in _browser_action_space.action_set.items(): + assert ( + action.signature in _BROWSER_TOOL_DESCRIPTION + ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}' + assert ( + action.description in _BROWSER_TOOL_DESCRIPTION + ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}' BrowserTool = ChatCompletionToolParam( type='function', @@ -479,6 +479,7 @@ def __init__(self): * `double_click`: Double-click the left mouse button. * `screenshot`: Take a screenshot of the screen.""", 'enum': [ + 'goto', 'key', 'type', 'mouse_move', From 32b0b057d6bf99d96e36576f10d26c73cd70dbeb Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Fri, 3 Jan 2025 16:39:20 +0700 Subject: [PATCH 13/14] use fork browsergym-core --- poetry.lock | 188 ++++--------------------------------------------- pyproject.toml | 8 +-- 2 files changed, 17 insertions(+), 179 deletions(-) diff --git a/poetry.lock b/poetry.lock index 3a217b12410f..14ded293e2c6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -484,24 +484,6 @@ files = [ tests = ["pytest (>=3.2.1,!=3.3.0)"] typecheck = ["mypy"] -[[package]] -name = "beartype" -version = "0.12.0" -description = "Unbearably fast runtime type checking in pure Python." -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "beartype-0.12.0-py3-none-any.whl", hash = "sha256:3d9d5bec198bcf965c000d7b5120bebdd19a444ef6e39e97d0e93eb8832e10c8"}, - {file = "beartype-0.12.0.tar.gz", hash = "sha256:3b7545b3f333a6b07042b68b102141554c9add2e979dab7b0f8ed6378f7af7d7"}, -] - -[package.extras] -all = ["typing-extensions (>=3.10.0.0)"] -dev = ["autoapi (>=0.9.0)", "coverage (>=5.5)", "mypy (>=0.800)", "numpy", "pytest (>=4.0.0)", "sphinx", "sphinx (>=4.1.0)", "tox (>=3.20.1)", "typing-extensions"] -doc-rtd = ["furo (==2022.6.21)", "sphinx (==4.1.0)"] -test-tox = ["mypy (>=0.800)", "numpy", "pytest (>=4.0.0)", "sphinx", "typing-extensions"] -test-tox-coverage = ["coverage (>=5.5)"] - [[package]] name = "beautifulsoup4" version = "4.12.3" @@ -602,14 +584,12 @@ crt = ["awscrt (==0.22.0)"] [[package]] name = "browsergym-core" -version = "0.10.2" +version = "0.13.3" description = "BrowserGym: a gym environment for web task automation in the Chromium browser" optional = false python-versions = ">3.9" -files = [ - {file = "browsergym_core-0.10.2-py3-none-any.whl", hash = "sha256:0686a8e2ee7244e33c97326193f54df0ad08d99aad9a4ed9ac28baba5ca26d18"}, - {file = "browsergym_core-0.10.2.tar.gz", hash = "sha256:7e93bad5cc3990badee77e9481413d625d2fce2ec8f7f9e195dbc194b6cfb4e9"}, -] +files = [] +develop = false [package.dependencies] beautifulsoup4 = ">=4.12" @@ -620,34 +600,12 @@ pillow = ">=10.1" playwright = ">=1.39,<2.0" pyparsing = ">=3" -[[package]] -name = "browsergym-miniwob" -version = "0.10.2" -description = "MiniWoB++ benchmark for BrowserGym" -optional = false -python-versions = ">3.7" -files = [ - {file = "browsergym_miniwob-0.10.2-py3-none-any.whl", hash = "sha256:b11b04378868a8f5dee34f721134baed4780fd55ccaebf9db4de6fcac48f3190"}, - {file = "browsergym_miniwob-0.10.2.tar.gz", hash = "sha256:9109b8122a61b27e227d923861055f220c6ddd60f34f877c3a30444c6f8a7b05"}, -] - -[package.dependencies] -browsergym-core = "0.10.2" - -[[package]] -name = "browsergym-webarena" -version = "0.10.2" -description = "WebArena benchmark for BrowserGym" -optional = false -python-versions = ">3.7" -files = [ - {file = "browsergym_webarena-0.10.2-py3-none-any.whl", hash = "sha256:e9ca6d0ad263412ebb229fe1b66e1ab7f5841a3f838abedf3bf01b800a7c6597"}, - {file = "browsergym_webarena-0.10.2.tar.gz", hash = "sha256:b4b9a38f144b6aaa56bbbbce9dd2c5565a39a1b55e3647d61e02458ca3f5fd24"}, -] - -[package.dependencies] -browsergym-core = "0.10.2" -libwebarena = "0.0.3" +[package.source] +type = "git" +url = "https://github.com/ryanhoangt/BrowserGym" +reference = "add-mouse-position" +resolved_reference = "4202aab6a2b5a0d42f9c126452a6b6fb3231bb55" +subdirectory = "browsergym/core" [[package]] name = "build" @@ -1735,28 +1693,6 @@ mccabe = ">=0.7.0,<0.8.0" pycodestyle = ">=2.12.0,<2.13.0" pyflakes = ">=3.2.0,<3.3.0" -[[package]] -name = "flask" -version = "3.1.0" -description = "A simple framework for building complex web applications." -optional = false -python-versions = ">=3.9" -files = [ - {file = "flask-3.1.0-py3-none-any.whl", hash = "sha256:d667207822eb83f1c4b50949b1623c8fc8d51f2341d65f72e1a1815397551136"}, - {file = "flask-3.1.0.tar.gz", hash = "sha256:5f873c5184c897c8d9d1b05df1e3d01b14910ce69607a117bd3277098a5836ac"}, -] - -[package.dependencies] -blinker = ">=1.9" -click = ">=8.1.3" -itsdangerous = ">=2.2" -Jinja2 = ">=3.1.2" -Werkzeug = ">=3.1" - -[package.extras] -async = ["asgiref (>=3.2)"] -dotenv = ["python-dotenv"] - [[package]] name = "flatbuffers" version = "24.3.25" @@ -3114,17 +3050,6 @@ files = [ [package.dependencies] arrow = ">=0.15.0" -[[package]] -name = "itsdangerous" -version = "2.2.0" -description = "Safely pass data to untrusted environments and back." -optional = false -python-versions = ">=3.8" -files = [ - {file = "itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef"}, - {file = "itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173"}, -] - [[package]] name = "jedi" version = "0.19.2" @@ -3722,32 +3647,6 @@ websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" [package.extras] adal = ["adal (>=1.0.2)"] -[[package]] -name = "libwebarena" -version = "0.0.3" -description = "This is an unofficial, use-at-your-own risks port of the webarena benchmark, for use as a standalone library package." -optional = false -python-versions = "<4,>=3.7" -files = [ - {file = "libwebarena-0.0.3-py3-none-any.whl", hash = "sha256:aa0a0879486e5c90b2b2ec1c3bf309b0c7f13ee2bf7c8945447ac15f7027d248"}, - {file = "libwebarena-0.0.3.tar.gz", hash = "sha256:3d05fae6749931aaf26e6c80fd665725dfeab41ac4848f168c407dbe3de89baf"}, -] - -[package.dependencies] -aiolimiter = "*" -beartype = "0.12.0" -evaluate = "*" -flask = "*" -gymnasium = "*" -nltk = "*" -openai = ">=1" -Pillow = "*" -playwright = ">=1.32,<1.40" -text-generation = "*" -tiktoken = "*" -transformers = "*" -types-tqdm = "*" - [[package]] name = "litellm" version = "1.56.6" @@ -5442,13 +5341,13 @@ numpy = {version = ">=1.26.0", markers = "python_version >= \"3.12\""} [[package]] name = "openhands-aci" -version = "0.1.1" +version = "0.1.5" description = "An Agent-Computer Interface (ACI) designed for software development agents OpenHands." optional = false python-versions = "<4.0,>=3.12" files = [ - {file = "openhands_aci-0.1.1-py3-none-any.whl", hash = "sha256:8831f97b887571005dca0d70a9f6f0a4f9feb35d3d41f499e70d72b5fb68a599"}, - {file = "openhands_aci-0.1.1.tar.gz", hash = "sha256:705b74a12a8f428e64295b5de125f553500f62ef5ab3a5a6284d8fcf638025e6"}, + {file = "openhands_aci-0.1.5-py3-none-any.whl", hash = "sha256:7b4238161ede81ba870efd5f30af654d432d03a74632746ba19147761ed21533"}, + {file = "openhands_aci-0.1.5.tar.gz", hash = "sha256:3ed6d051c4944ba4dee1febcf2b41c3d95da1706102f53f61809a3aca0821afe"}, ] [package.dependencies] @@ -8534,22 +8433,6 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"] typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"] -[[package]] -name = "text-generation" -version = "0.7.0" -description = "Hugging Face Text Generation Python Client" -optional = false -python-versions = "<4.0,>=3.7" -files = [ - {file = "text_generation-0.7.0-py3-none-any.whl", hash = "sha256:02ab337a0ee0e7c70e04a607b311c261caae74bde46a7d837c6fdd150108f4d8"}, - {file = "text_generation-0.7.0.tar.gz", hash = "sha256:689200cd1f0d4141562af2515393c2c21cdbd9fac21c8398bf3043cdcc14184e"}, -] - -[package.dependencies] -aiohttp = ">=3.8,<4.0" -huggingface-hub = ">=0.12,<1.0" -pydantic = ">2,<3" - [[package]] name = "threadpoolctl" version = "3.5.0" @@ -9120,20 +9003,6 @@ files = [ {file = "types_python_dateutil-2.9.0.20241003-py3-none-any.whl", hash = "sha256:250e1d8e80e7bbc3a6c99b907762711d1a1cdd00e978ad39cb5940f6f0a87f3d"}, ] -[[package]] -name = "types-requests" -version = "2.32.0.20241016" -description = "Typing stubs for requests" -optional = false -python-versions = ">=3.8" -files = [ - {file = "types-requests-2.32.0.20241016.tar.gz", hash = "sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95"}, - {file = "types_requests-2.32.0.20241016-py3-none-any.whl", hash = "sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747"}, -] - -[package.dependencies] -urllib3 = ">=2" - [[package]] name = "types-toml" version = "0.10.8.20240310" @@ -9145,20 +9014,6 @@ files = [ {file = "types_toml-0.10.8.20240310-py3-none-any.whl", hash = "sha256:627b47775d25fa29977d9c70dc0cbab3f314f32c8d8d0c012f2ef5de7aaec05d"}, ] -[[package]] -name = "types-tqdm" -version = "4.67.0.20241119" -description = "Typing stubs for tqdm" -optional = false -python-versions = ">=3.8" -files = [ - {file = "types-tqdm-4.67.0.20241119.tar.gz", hash = "sha256:1769e0e94d5e6d8fa814965f9cf3d9928376dd15dabcbcb784bb8769081092b4"}, - {file = "types_tqdm-4.67.0.20241119-py3-none-any.whl", hash = "sha256:a18d4eb62db0d35c52707ae13d821b5a57970755273ecb56e133ccc0ac7e7c79"}, -] - -[package.dependencies] -types-requests = "*" - [[package]] name = "typing-extensions" version = "4.12.2" @@ -9615,23 +9470,6 @@ files = [ {file = "websockets-14.1.tar.gz", hash = "sha256:398b10c77d471c0aab20a845e7a60076b6390bfdaac7a6d2edb0d2c59d75e8d8"}, ] -[[package]] -name = "werkzeug" -version = "3.1.3" -description = "The comprehensive WSGI web application library." -optional = false -python-versions = ">=3.9" -files = [ - {file = "werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e"}, - {file = "werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746"}, -] - -[package.dependencies] -MarkupSafe = ">=2.1.1" - -[package.extras] -watchdog = ["watchdog (>=2.3)"] - [[package]] name = "whatthepatch" version = "1.0.7" @@ -10064,4 +9902,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "c69863e861b84a4af1cae314f85076448f011d1e347a46ce9d63843121e595eb" +content-hash = "b82e3e70979818aebb2ff9a5c02b34c4c3f79936b4a4d97d11d3c09d8680264f" diff --git a/pyproject.toml b/pyproject.toml index 7a120c9f38b1..e633ab3ec817 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ uvicorn = "*" types-toml = "*" numpy = "*" json-repair = "*" -browsergym-core = "0.10.2" # integrate browsergym-core as the browsing interface +browsergym-core = { git = "https://github.com/ryanhoangt/BrowserGym", subdirectory = "browsergym/core", rev = "add-mouse-position" } # integrate browsergym-core as the browsing interface html2text = "*" e2b = ">=1.0.5,<1.1.0" pexpect = "*" @@ -64,7 +64,7 @@ modal = ">=0.66.26,<0.71.0" runloop-api-client = "0.11.0" pygithub = "^2.5.0" joblib = "*" -openhands-aci = "0.1.1" +openhands-aci = "0.1.5" astor = "*" python-socketio = "^5.11.4" redis = "^5.2.0" @@ -145,8 +145,8 @@ gdown = "*" matplotlib = "*" seaborn = "*" tabulate = "*" -browsergym-webarena = "0.10.2" -browsergym-miniwob = "0.10.2" +# browsergym-webarena = "0.10.2" +# browsergym-miniwob = "0.10.2" [tool.poetry-dynamic-versioning] enable = true From 054c1d603be23029216512c36610fd8856b9a246 Mon Sep 17 00:00:00 2001 From: Hoang Tran Date: Fri, 3 Jan 2025 23:57:48 +0700 Subject: [PATCH 14/14] fix bug --- openhands/controller/agent_controller.py | 8 ++++---- openhands/runtime/browser/utils.py | 8 ++++++-- poetry.lock | 2 +- pyproject.toml | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py index e76439f79457..b491e424ea7e 100644 --- a/openhands/controller/agent_controller.py +++ b/openhands/controller/agent_controller.py @@ -281,10 +281,10 @@ async def _handle_observation(self, observation: Observation) -> None: observation_to_print.content, self.agent.llm.config.max_message_chars ) # Use info level if LOG_ALL_EVENTS is set - log_level = 'info' if os.getenv('LOG_ALL_EVENTS') in ('true', '1') else 'debug' - self.log( - log_level, str(observation_to_print), extra={'msg_type': 'OBSERVATION'} - ) + # log_level = 'info' if os.getenv('LOG_ALL_EVENTS') in ('true', '1') else 'debug' + # self.log( + # log_level, str(observation_to_print), extra={'msg_type': 'OBSERVATION'} + # ) if observation.llm_metrics is not None: self.agent.llm.metrics.merge(observation.llm_metrics) diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index a5aa1fd80b63..0f9f1127fefb 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -36,6 +36,7 @@ async def browse( if _action_str == 'gui_use': # received action_str defined by Anthropic's Computer Use feature: see https://docs.anthropic.com/en/docs/build-with-claude/computer-use#computer-tool extra_args = action.extra_args + print(f'✅ extra_args: {extra_args}') try: validated_args = gui_use.validate_and_transform_args( @@ -71,11 +72,14 @@ async def browse( raise ValueError(f'Invalid action type: {action.action}') try: - # obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396 + # obs provided by BrowserGym: see + # https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396 obs = browser.step(action_str) mouse_position = obs.get('mouse_position', [0, 0]) scaled_mouse_position = gui_use.scale_coordinates( - ScalingSource.COMPUTER, int(mouse_position[0]), int(mouse_position[1]) + ScalingSource.COMPUTER, + int(mouse_position[0] or 0), + int(mouse_position[1] or 0), ) return BrowserOutputObservation( content=obs['text_content'], # text content of the page diff --git a/poetry.lock b/poetry.lock index 14ded293e2c6..3d5c2a9c309e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -9902,4 +9902,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "b82e3e70979818aebb2ff9a5c02b34c4c3f79936b4a4d97d11d3c09d8680264f" +content-hash = "1f3a26372473b5b6c842282765a61f0f7594f4d9bb3f4d00b1129c1829318943" diff --git a/pyproject.toml b/pyproject.toml index e633ab3ec817..8aa054b3e8af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ uvicorn = "*" types-toml = "*" numpy = "*" json-repair = "*" -browsergym-core = { git = "https://github.com/ryanhoangt/BrowserGym", subdirectory = "browsergym/core", rev = "add-mouse-position" } # integrate browsergym-core as the browsing interface +browsergym-core = { git = "https://github.com/ryanhoangt/BrowserGym", subdirectory = "browsergym/core", rev = "add-mouse-position-no-iframe" } # integrate browsergym-core as the browsing interface html2text = "*" e2b = ">=1.0.5,<1.1.0" pexpect = "*"