diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 367e170b..3982f754 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -1,4 +1,4 @@ -name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI +name: Build and Publish on: [push, workflow_dispatch] diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml new file mode 100644 index 00000000..9183c77e --- /dev/null +++ b/.github/workflows/unit_tests.yml @@ -0,0 +1,188 @@ +name: Unit tests + +on: + push: + branches: + - main + pull_request: + +jobs: + + code-format: + runs-on: ubuntu-latest + + defaults: + run: + shell: bash -l {0} + + steps: + + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' # caching pip dependencies + + - name: Pip install + run: pip install black[jupyter]==24.2.0 blacken-docs + + - name: Code Formatting + run: black . --check + + browsergym-core: + runs-on: ubuntu-latest + + defaults: + run: + shell: bash -l {0} + + steps: + + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' # caching pip dependencies + + - name: Pip install + working-directory: ./dev + run: pip install -r requirements.txt + + - name: Pip list + run: pip list + + - name: Install Playwright + run: playwright install --with-deps + + - name: Run browsergym-core Unit Tests + run: pytest -n 1 --durations=10 -m 'not pricy' -v core/tests + + browsergym-miniwob: + runs-on: ubuntu-latest + + defaults: + run: + shell: bash -l {0} + + steps: + + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' # caching pip dependencies + + - name: Pip install + working-directory: ./dev + run: pip install -r requirements.txt + + - name: Pip list + run: 
pip list + + - name: Install Playwright + run: playwright install --with-deps + + - name: Fetch MiniWob + uses: actions/checkout@v4 + with: + repository: "Farama-Foundation/miniwob-plusplus" + ref: "7fd85d71a4b60325c6585396ec4f48377d049838" + path: "miniwob-plusplus" + + - name: Serve MiniWob + uses: Eun/http-server-action@v1 + with: + directory: "${{ github.workspace }}/miniwob-plusplus/miniwob/html" + port: 8080 + + - name: Run browsergym-miniwob Unit Tests + env: + MINIWOB_URL: "http://localhost:8080/miniwob/" + run: pytest -n 5 --durations=10 -m 'not pricy' -v miniwob/tests + + browsergym-webarena-fast: + runs-on: ubuntu-latest + + defaults: + run: + shell: bash -l {0} + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' # caching pip dependencies + + - name: Pip install + working-directory: ./dev + run: pip install -r requirements.txt + + - name: Pip list + run: pip list + + - name: Install Playwright + run: playwright install --with-deps + + - name: Run browsergym-webarena not slow Unit Tests + env: + SHOPPING: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7770/" + SHOPPING_ADMIN: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7780/admin" + REDDIT: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:9999" + GITLAB: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8023" + WIKIPEDIA: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" + MAP: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:3000" + HOMEPAGE: "PASS:4399" + WEBARENA_PATH: "${{ github.workspace }}/webarena/" + run: pytest -n 5 --durations=10 -m 'not slow and not pricy' --slowmo 1000 -v webarena/tests + + browsergym-webarena-slow: + runs-on: ubuntu-latest + + defaults: + run: + shell: bash -l {0} + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 
+ + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' # caching pip dependencies + + - name: Pip install + working-directory: ./dev + run: pip install -r requirements.txt + + - name: Pip list + run: pip list + + - name: Install Playwright + run: playwright install --with-deps + + - name: Run browsergym-webarena slow Unit Tests + env: + SHOPPING: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7770/" + SHOPPING_ADMIN: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:7780/admin" + REDDIT: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:9999" + GITLAB: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8023" + WIKIPEDIA: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" + MAP: "http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:3000" + HOMEPAGE: "PASS:4399" + WEBARENA_PATH: "${{ github.workspace }}/webarena/" + run: pytest -n 5 --durations=10 -m 'slow and not pricy' --slowmo 1000 -v webarena/tests diff --git a/.gitignore b/.gitignore index 4a48538f..078c6d62 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ -.DS_store \ No newline at end of file +.DS_store +__pycache__/ +*.py[cod] \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..2540c440 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,44 @@ +fail_fast: false + +default_language_version: + python: python3 + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.2.0 + hooks: + - id: trailing-whitespace + exclude: ^(.*)\.md$ + - id: end-of-file-fixer + - id: check-yaml + exclude: ^(.circleci/recipe|recipe) # conda build recipes are templated + - id: check-added-large-files + - repo: https://github.com/pocc/pre-commit-hooks + rev: v1.1.1 + hooks: + - id: clang-format + args: [--style=file, -i] + - id: clang-tidy + args: [--fix, --fix-errors] + - 
repo: https://github.com/psf/black + rev: 24.2.0 + hooks: + - id: black + args: [--config=./pyproject.toml] + - repo: https://github.com/asottile/blacken-docs + rev: v1.12.1 + hooks: + - id: blacken-docs + args: [ '--line-length', '100' ] + additional_dependencies: [black] + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.5.5 + hooks: + - id: forbid-crlf + - id: remove-crlf + # Black does not clear tabs in docstrings + - id: forbid-tabs + files: '.*\.py$' + - id: remove-tabs + files: '.*\.py$' + args: [ '--whitespaces-count', '4' ] \ No newline at end of file diff --git a/core/src/browsergym/core/__init__.py b/core/src/browsergym/core/__init__.py index 31df8fe9..127356ae 100644 --- a/core/src/browsergym/core/__init__.py +++ b/core/src/browsergym/core/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.0rc7" +__version__ = "0.2.0" import playwright.sync_api diff --git a/core/src/browsergym/core/action/__init__.py b/core/src/browsergym/core/action/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/src/browsergym/core/action/functions.py b/core/src/browsergym/core/action/functions.py index 6b55f0ea..9eb06048 100644 --- a/core/src/browsergym/core/action/functions.py +++ b/core/src/browsergym/core/action/functions.py @@ -5,7 +5,6 @@ from .utils import ( add_demo_mode_effects, - check_for_overlay, get_elem_by_bid, highlight_by_box, smooth_move_visual_cursor_to, @@ -13,7 +12,7 @@ page: playwright.sync_api.Page = None send_message_to_user: callable = None -demo_mode: Literal["off", "default", "only_visible_elements"] = "off" +demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"] = None """IMPORTANT The following primitives are meant to be included in the browsergym action using @@ -51,16 +50,16 @@ def fill(bid: str, value: str): Examples: fill('237', 'example value') fill('45', "multi-line\\nexample") - fill('32-12', "example with \\"quotes\\"") + fill('a12', "example with \\"quotes\\"") """ - elem = 
get_elem_by_bid(page, bid, demo_mode) + elem = get_elem_by_bid(page, bid, demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False) if demo_mode != "off": - add_demo_mode_effects(page, elem, bid, demo_mode_type=demo_mode, move_cursor=False) elem.clear() delay = max(2000 / len(value), 10) elem.type(value, delay=delay) else: - elem.fill(value) + elem.fill(value, timeout=500) # https://playwright.dev/python/docs/api/class-locator#locator-check @@ -71,10 +70,9 @@ def check(bid: str): Examples: check('55') """ - elem = get_elem_by_bid(page, bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, elem, bid, demo_mode_type=demo_mode, move_cursor=True) - elem.check() + elem = get_elem_by_bid(page, bid, demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True) + elem.check(timeout=500) # https://playwright.dev/python/docs/api/class-locator#locator-uncheck @@ -83,12 +81,11 @@ def uncheck(bid: str): Ensure a checkbox or radio element is unchecked. 
Examples: - uncheck('65-5289') + uncheck('a5289') """ - elem = get_elem_by_bid(page, bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, elem, bid, demo_mode_type=demo_mode, move_cursor=True) - elem.uncheck() + elem = get_elem_by_bid(page, bid, demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True) + elem.uncheck(timeout=500) # https://playwright.dev/docs/input#select-options @@ -101,10 +98,9 @@ def select_option(bid: str, options: str | list[str]): select_option('48', "blue") select_option('48', ["red", "green", "blue"]) """ - elem = get_elem_by_bid(page, bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, elem, bid, demo_mode_type=demo_mode, move_cursor=False) - elem.select_option(options) + elem = get_elem_by_bid(page, bid, demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False) + elem.select_option(options, timeout=500) # https://playwright.dev/python/docs/api/class-locator#locator-click @@ -118,13 +114,11 @@ def click( Examples: click('51') - click('69-2', button="right") + click('b22', button="right") click('48', button="middle", modifiers=["Shift"]) """ - elem = get_elem_by_bid(page, bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, elem, bid, demo_mode_type=demo_mode, move_cursor=True) - + elem = get_elem_by_bid(page, bid, demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True) elem.click(button=button, modifiers=modifiers) @@ -139,13 +133,12 @@ def dblclick( Examples: dblclick('12') - dblclick('289-5-42', button="right") + dblclick('ca42', button="right") dblclick('178', button="middle", modifiers=["Shift"]) """ - elem = get_elem_by_bid(page, bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, elem, bid, demo_mode_type=demo_mode, move_cursor=True) - elem.dblclick(button=button, modifiers=modifiers, force=True) + elem = get_elem_by_bid(page, bid, 
demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True) + elem.dblclick(button=button, modifiers=modifiers, timeout=500) # https://playwright.dev/python/docs/api/class-locator#locator-hover @@ -154,15 +147,15 @@ def hover(bid: str): Hover over an element. Examples: - hover('12-8') + hover('b8') """ - elem = get_elem_by_bid(page, bid, demo_mode) + elem = get_elem_by_bid(page, bid, demo_mode != "off") if demo_mode != "off": box = elem.bounding_box() if box: center_x, center_y = box["x"] + box["width"] / 2, box["y"] + box["height"] / 2 smooth_move_visual_cursor_to(page, center_x, center_y) - elem.hover() + elem.hover(timeout=500) # https://playwright.dev/python/docs/input#keys-and-shortcuts @@ -179,13 +172,12 @@ def press(bid: str, key_comb: str): Examples: press('88', 'Backspace') - press('48-6', 'Control+a') - press('48-6', 'Meta+Shift+t') + press('a26', 'Control+a') + press('a61', 'Meta+Shift+t') """ - elem = get_elem_by_bid(page, bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, elem, bid, demo_mode_type=demo_mode, move_cursor=False) - elem.press(key_comb) + elem = get_elem_by_bid(page, bid, demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False) + elem.press(key_comb, timeout=500) # https://playwright.dev/python/docs/api/class-locator#locator-focus @@ -194,12 +186,11 @@ def focus(bid: str): Focus the matching element. 
Examples: - focus('87-455') + focus('b455') """ - elem = get_elem_by_bid(page, bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, elem, bid, demo_mode_type=demo_mode, move_cursor=False) - elem.focus() + elem = get_elem_by_bid(page, bid, demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False) + elem.focus(timeout=500) # https://playwright.dev/python/docs/api/class-locator#locator-clear @@ -210,10 +201,9 @@ def clear(bid: str): Examples: clear('996') """ - elem = get_elem_by_bid(page, bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, elem, bid, demo_mode_type=demo_mode, move_cursor=False) - elem.clear() + elem = get_elem_by_bid(page, bid, demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False) + elem.clear(timeout=500) # https://playwright.dev/python/docs/input#drag-and-drop @@ -226,23 +216,21 @@ def drag_and_drop(from_bid: str, to_bid: str): Examples: drag_and_drop('56', '498') """ - from_elem = get_elem_by_bid(page, from_bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, from_elem, from_bid, move_cursor=True) - from_elem.hover() + from_elem = get_elem_by_bid(page, from_bid, demo_mode != "off") + add_demo_mode_effects(page, from_elem, from_bid, demo_mode=demo_mode, move_cursor=True) + from_elem.hover(timeout=500) page.mouse.down() - to_elem = get_elem_by_bid(page, to_bid, demo_mode) - if demo_mode != "off": - add_demo_mode_effects(page, to_elem, to_bid, move_cursor=True) - to_elem.hover() + to_elem = get_elem_by_bid(page, to_bid, demo_mode != "off") + add_demo_mode_effects(page, to_elem, to_bid, demo_mode=demo_mode, move_cursor=True) + to_elem.hover(timeout=500) page.mouse.up() # https://playwright.dev/python/docs/api/class-mouse#mouse-wheel def scroll(delta_x: float, delta_y: float): """ - Scroll horizontally and vertically. Amounts in pixels. Dispatches a wheel event. + Scroll horizontally and vertically. 
Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event. Examples: scroll(0, 200) @@ -370,6 +358,7 @@ def keyboard_press(key: str): keyboard_press('Backspace') keyboard_press('Control+a') keyboard_press('Meta+Shift+t') + page.keyboard.press("PageDown") """ page.keyboard.press(key) @@ -521,3 +510,48 @@ def tab_focus(index: int): page = page.context.pages[index] # trigger the callback that sets this page as active in browsergym page.locate("html").dispatch_event("pageshow") + + +# https://playwright.dev/python/docs/input#upload-files +def upload_file(bid: str, file: str | list[str]): + """ + Click an element and wait for a "filechooser" event, then select one + or multiple input files for upload. Relative file paths are resolved + relative to the current working directory. An empty list clears the + selected files. + + Examples: + upload_file("572", "my_receipt.pdf") + upload_file("63", ["/home/bob/Documents/image.jpg", "/home/bob/Documents/file.zip"]) + """ + elem = get_elem_by_bid(page, bid, demo_mode != "off") + add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True) + + with page.expect_file_chooser() as fc_info: + elem.click(timeout=500) + + file_chooser = fc_info.value + file_chooser.set_files(file) + + +# https://playwright.dev/python/docs/input#upload-files +def mouse_upload_file(x: float, y: float, file: str | list[str]): + """ + Click a location and wait for a "filechooser" event, then select one + or multiple input files for upload. Relative file paths are resolved + relative to the current working directory. An empty list clears the + selected files. 
+ + Examples: + mouse_upload_file(132.1, 547, "my_receipt.pdf") + mouse_upload_file(328, 812, ["/home/bob/Documents/image.jpg", "/home/bob/Documents/file.zip"]) + """ + if demo_mode != "off": + smooth_move_visual_cursor_to(page, x, y) + highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1}) + + with page.expect_file_chooser() as fc_info: + page.mouse.click(x, y) + + file_chooser = fc_info.value + file_chooser.set_files(file) diff --git a/core/src/browsergym/core/action/highlevel.py b/core/src/browsergym/core/action/highlevel.py index a5209acb..fc0b0826 100644 --- a/core/src/browsergym/core/action/highlevel.py +++ b/core/src/browsergym/core/action/highlevel.py @@ -20,6 +20,7 @@ focus, clear, drag_and_drop, + upload_file, scroll, mouse_move, mouse_up, @@ -27,6 +28,7 @@ mouse_click, mouse_dblclick, mouse_drag_and_drop, + mouse_upload_file, keyboard_down, keyboard_up, keyboard_press, @@ -58,6 +60,7 @@ focus, clear, drag_and_drop, + upload_file, ] COORD_ACTIONS = [ @@ -68,6 +71,7 @@ mouse_click, mouse_dblclick, mouse_drag_and_drop, + mouse_upload_file, keyboard_down, keyboard_up, keyboard_press, @@ -106,7 +110,7 @@ def __init__( ], custom_actions: Optional[list[callable]] = None, multiaction: bool = True, - demo_mode: Literal["off", "default", "only_visible_elements"] = "off", + demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"] = "off", strict: bool = False, ): super().__init__(strict) @@ -209,7 +213,7 @@ def __init__( examples=examples, ) - def example_action(self, abstract: bool) -> str: + def example_action(self, abstract: bool, max_examples: int = 3) -> str: """ Returns an example action as a string. 
""" @@ -224,22 +228,21 @@ def example_action(self, abstract: bool) -> str: picked_examples = [] # use fill and click examples if action is present - if "fill" in self.action_set: - picked_examples.extend(self.action_set["fill"].examples) - if "click" in self.action_set: - picked_examples.extend(self.action_set["click"].examples) + for action_name in ["fill", "click", "mouse_click", "keyboard_type"]: + if action_name in self.action_set: + picked_examples.extend(self.action_set[action_name].examples) - # last resort, use all examples + # last resort, use all action examples if not picked_examples: for _, action in self.action_set.items(): - all_examples += action.examples + picked_examples += action.examples # shuffle examples rng = random.Random(1) rng.shuffle(picked_examples) if self.multiaction: - return "\n".join(picked_examples[:3]) + return "\n".join(picked_examples[:max_examples]) else: return picked_examples[0] diff --git a/core/src/browsergym/core/action/python.py b/core/src/browsergym/core/action/python.py index 6c1c9927..487c8038 100644 --- a/core/src/browsergym/core/action/python.py +++ b/core/src/browsergym/core/action/python.py @@ -24,17 +24,17 @@ def describe(self, with_long_description: bool = True, with_examples: bool = Tru ``` Here is another example: ``` -frame = page.get_by_test_id("35").frame_locator(":scope") -frame.get_by_test_id("35-776").click() +frame = page.get_by_test_id("a").frame_locator(":scope") +frame.get_by_test_id("a776").click() ``` Note that Playwright's `get_by_test_id()` method is configured to use the `bid` attribute to locate HTML elements, instead of the default `data-testid`. Also, Playwright's locators can not traverse iframes, so you have to locate parent iframes first in order to locate an element in an iframe. The `bid` attribute contains all the information -required to recursively locate an element. For example, an element with `bid="23-557-2"` can be retrieved as follows: +required to recursively locate an element. 
For example, an element with `bid="ac2"` can be retrieved as follows: ``` -frame = page.get_by_test_id("23").frame_locator(":scope") -frame = frame.get_by_test_id("23-557").frame_locator(":scope") -elem = frame.get_by_test_id("23-557-2") +frame = page.get_by_test_id("a").frame_locator(":scope") +frame = frame.get_by_test_id("ac").frame_locator(":scope") +elem = frame.get_by_test_id("ac2") ``` """ else: @@ -77,10 +77,10 @@ def example_action(self, abstract: bool) -> str: One single bloc of Python code. Do not include any explanation, only valid Python code.""" else: return """\ -frame = page.get_by_test_id("23").frame_locator(":scope") -frame = page.get_by_test_id("23-557").frame_locator(":scope") -frame.get_by_test_id("23-557-2").fill("Hello world!") -frame.get_by_test_id("23-557-3").click() +frame = page.get_by_test_id("b").frame_locator(":scope") +frame = page.get_by_test_id("ba").frame_locator(":scope") +frame.get_by_test_id("ba2").fill("Hello world!") +frame.get_by_test_id("ba3").click() """ def to_python_code(self, action): diff --git a/core/src/browsergym/core/action/utils.py b/core/src/browsergym/core/action/utils.py index f71a55f2..48540d6f 100644 --- a/core/src/browsergym/core/action/utils.py +++ b/core/src/browsergym/core/action/utils.py @@ -1,4 +1,5 @@ import playwright.sync_api +from typing import Literal def get_elem_by_bid( @@ -6,9 +7,9 @@ def get_elem_by_bid( ) -> playwright.sync_api.Locator: """ Parse the given bid to sequentially locate every nested frame leading to the bid, then - locate the bid element. Bids are expected to take the form "XX-...-YY-ZZ", which means - the element ZZ is located inside frame YY, which is located inside frame ..., which is - located inside frame XX, which is located inside the page's main frame. + locate the bid element. 
Bids are expected to take the form "abb123", which means + the element abb123 is located inside frame abb, which is located inside frame ab, which is + located inside frame a, which is located inside the page's main frame. Args: bid: the browsergym id (playwright testid) of the page element. @@ -24,33 +25,42 @@ def get_elem_by_bid( current_frame = page # dive into each nested frame, to the frame where the element is located - for i in range(bid.count("-")): - frame_bid = "-".join(bid.split("-")[: i + 1]) + i = 0 + while bid[i:] and not bid[i:].isnumeric(): + i += 1 + frame_bid = bid[:i] # bid of the next frame to select frame_elem = current_frame.get_by_test_id(frame_bid) + if not frame_elem.count(): + raise ValueError(f'could not find element with bid "{frame_bid}"') if scroll_into_view: - frame_elem.scroll_into_view_if_needed() + frame_elem.scroll_into_view_if_needed(timeout=500) current_frame = frame_elem.frame_locator(":scope") # finally, we should have selected the frame where the target element is elem = current_frame.get_by_test_id(bid) + if not elem.count(): + raise ValueError(f'Could not find element with bid "{bid}".') if scroll_into_view: - elem.scroll_into_view_if_needed() + elem.scroll_into_view_if_needed(timeout=500) return elem -def highlight_by_box(page: playwright.sync_api.Page, box: dict, is_visible: bool = True): +def highlight_by_box( + page: playwright.sync_api.Page, box: dict, color: Literal["blue", "red"] = "blue" +): """Highlights the target element based on its bounding box attributes.""" + assert color in ("blue", "red") + if box: left, top, width, height = box["x"], box["y"], box["width"], box["height"] - color = "blue" if is_visible else "red" page.evaluate( f"""\ const overlay = document.createElement('div'); document.body.appendChild(overlay); overlay.setAttribute('style', ` all: initial; - position: absolute; + position: fixed; border: 2px solid transparent; /* Start with transparent border */ borderRadius: 10px; /* Add rounded 
corners */ boxShadow: 0 0 0px {color}; /* Initial boxShadow with 0px spread */ @@ -111,7 +121,7 @@ def smooth_move_visual_cursor_to( `; cursor.setAttribute('style', ` all: initial; - position: absolute; + position: fixed; opacity: 0.7; /* Slightly transparent */ z-index: 2147483647; /* Maximum value */ pointer-events: none; /* Ensures the SVG doesn't interfere with page interactions */ @@ -188,35 +198,39 @@ def smooth_move_visual_cursor_to( def check_for_overlay( - page: playwright.sync_api.Page, - bid: str, - element: playwright.sync_api.ElementHandle, + page: playwright.sync_api.Page, bid: str, element: playwright.sync_api.ElementHandle, box: dict ): - """Checks in a given element is the topmost element at its center position by default. + if not element: + return False + + visibility = element.get_attribute("browsergym_visibility_ratio") + if visibility is not None: + return float(visibility) >= 0.5 + + """Checks if a given element is the topmost element at its center position by default. If check_corners is True, it checks if any of the corners is visible.""" - if element: - box = element.bounding_box() - if box: - # corners - points_to_check = [ - (box["x"], box["y"]), - (box["x"] + box["width"], box["y"]), - (box["x"], box["y"] + box["height"]), - (box["x"] + box["width"], box["y"] + box["height"]), - ] - - for x, y in points_to_check: - # Execute JavaScript to find the topmost element at the point. - top_element = page.evaluate( - f"""() => {{ - const el = document.elementFromPoint({x}, {y}); - return el ? el.outerHTML : ''; - }}""" - ) - - # Check if the topmost element is the element we're interested in. - if top_element and bid in top_element: - return True + if box: + # corners + points_to_check = [ + (box["x"], box["y"]), + (box["x"] + box["width"], box["y"]), + (box["x"], box["y"] + box["height"]), + (box["x"] + box["width"], box["y"] + box["height"]), + ] + + for x, y in points_to_check: + # Execute JavaScript to find the topmost element at the point. 
+ top_element = page.evaluate( + f"""() => {{ + const el = document.elementFromPoint({x}, {y}); + return el ? el.outerHTML : ''; + }}""" + ) + + # Check if the topmost element is the element we're interested in. + if top_element and bid in top_element: + return True + return False @@ -224,16 +238,34 @@ def add_demo_mode_effects( page: playwright.sync_api.Page, elem: playwright.sync_api.ElementHandle, bid: str, + demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"], move_cursor: bool = True, - demo_mode_type: str = "default", ): + if demo_mode == "off": + return + """Adds visual effects to the target element""" box = elem.bounding_box() + # box = extract_bounds_cdp(page, bid) if box: center_x, center_y = box["x"] + box["width"] / 2, box["y"] + box["height"] / 2 - is_top_element = check_for_overlay(page, bid, elem) - - if is_top_element or demo_mode_type == "default": - if move_cursor: - smooth_move_visual_cursor_to(page, center_x, center_y) - highlight_by_box(page, box, is_visible=is_top_element) + is_top_element = check_for_overlay(page, bid, elem, box) + + if demo_mode == "only_visible_elements": + if not is_top_element: + return + else: + color = "blue" + + elif demo_mode == "default": + if is_top_element: + color = "blue" + else: + color = "red" + + elif demo_mode == "all_blue": + color = "blue" + + if move_cursor: + smooth_move_visual_cursor_to(page, center_x, center_y) + highlight_by_box(page, box, color=color) diff --git a/core/src/browsergym/core/chat.py b/core/src/browsergym/core/chat.py index 758d1489..f9221444 100644 --- a/core/src/browsergym/core/chat.py +++ b/core/src/browsergym/core/chat.py @@ -4,18 +4,20 @@ import logging import playwright.sync_api import re +import time from importlib import resources from . 
import _get_global_playwright, chat_files -CHATBOX_HTML_PATH = str(resources.files(chat_files).joinpath("chatbox.html")) -ASSISTANT_IMG_PATH = str(resources.files(chat_files).joinpath("assistant.png")) +CHATBOX_DIR = resources.files(chat_files) class Chat: - def __init__(self, headless: bool, chat_size=(500, 800), record_video_dir=None) -> None: + def __init__( + self, headless: bool, chat_size=(500, 800), record_video_dir=None, modern=True + ) -> None: self.messages = [] # create a new browser, browser context and page for the chat @@ -29,13 +31,17 @@ def __init__(self, headless: bool, chat_size=(500, 800), record_video_dir=None) record_video_size=dict(width=chat_size[0], height=chat_size[1]), ) self.page = self.context.new_page() + self.recording_start_time = time.time() if record_video_dir else None # setup the chat page self.page.expose_function( "send_user_message", lambda msg: self.add_message(role="user", msg=msg, from_js=True) ) - self.page.set_content(get_chatbox_html()) + if modern: + self.page.set_content(get_chatbox_modern(CHATBOX_DIR)) + else: + self.page.set_content(get_chatbox_classic(CHATBOX_DIR)) def add_message( self, role: Literal["user", "assistant", "info"], msg: str, from_js: bool = False @@ -46,8 +52,6 @@ def add_message( if role in ("user", "assistant"): self.messages.append({"role": role, "message": msg}) if not from_js: - # change new lines to html - msg = msg.replace("\n", "
") self.page.evaluate(f"addChatMessage({repr(role)}, {repr(msg)});") def wait_for_user_message(self): @@ -63,16 +67,19 @@ def close(self): self.browser.close() -def get_chatbox_html() -> str: - with open(CHATBOX_HTML_PATH, "r") as file: +def get_chatbox_modern(chatbox_dir) -> str: + with open(chatbox_dir / "chatbox_modern.html", "r") as file: chatbox_html = file.read() - with open(ASSISTANT_IMG_PATH, "rb") as f: - # image = Image.open(f) + return chatbox_html + + +def get_chatbox_classic(chatbox_dir) -> str: + with open(chatbox_dir / "chatbox.html", "r") as file: + chatbox_html = file.read() + with open(chatbox_dir / "assistant.png", "rb") as f: image_base64 = base64.b64encode(f.read()).decode("utf-8") - # hard-code the assistant image in the HTML assistant_image_url = f"data:image/png;base64,{image_base64}" chatbox_html = re.sub("", assistant_image_url, chatbox_html) - return chatbox_html diff --git a/core/src/browsergym/core/chat_files/chatbox.html b/core/src/browsergym/core/chat_files/chatbox.html index 8424ad08..2afe6e59 100644 --- a/core/src/browsergym/core/chat_files/chatbox.html +++ b/core/src/browsergym/core/chat_files/chatbox.html @@ -133,7 +133,7 @@
-

UI Assistant

+

BrowserGym

@@ -151,6 +151,15 @@

UI Assistant

var USER_MESSAGE_RECEIVED = false; + function escapeHtml(unsafe) { + return unsafe + .replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """) + .replace(/'/g, "'"); + } + function addChatMessage(role, msg) { const chatBody = document.getElementById('chatBody'); const chatDebug = document.getElementById('chatDebug'); @@ -158,7 +167,7 @@

UI Assistant

msgContainer.className = 'message'; const text = document.createElement('p'); - text.innerHTML = msg; + text.innerHTML = escapeHtml(msg); const assistant_img = document.createElement('img'); assistant_img.src = assistant_image_data; diff --git a/core/src/browsergym/core/chat_files/chatbox_modern.html b/core/src/browsergym/core/chat_files/chatbox_modern.html new file mode 100644 index 00000000..63b9d4ac --- /dev/null +++ b/core/src/browsergym/core/chat_files/chatbox_modern.html @@ -0,0 +1,354 @@ + + + + + + + UI Assistant Chat + + + + +
+
+
+
+ +
+
+
+
+
+
+ + + +
+
+
+
+ + + + + + \ No newline at end of file diff --git a/core/src/browsergym/core/chat_files/img/send.svg b/core/src/browsergym/core/chat_files/img/send.svg new file mode 100644 index 00000000..7d5705f5 --- /dev/null +++ b/core/src/browsergym/core/chat_files/img/send.svg @@ -0,0 +1,3 @@ + + + diff --git a/core/src/browsergym/core/constants.py b/core/src/browsergym/core/constants.py index e1e59a7b..3169920d 100644 --- a/core/src/browsergym/core/constants.py +++ b/core/src/browsergym/core/constants.py @@ -1,4 +1,7 @@ TEXT_MAX_LENGTH = 2**32 - 1 BROWSERGYM_ID_ATTRIBUTE = "bid" # Playwright's default is "data-testid" +BROWSERGYM_VISIBILITY_ATTRIBUTE = "browsergym_visibility_ratio" +BROWSERGYM_SETOFMARKS_ATTRIBUTE = "browsergym_set_of_marks" + EXTRACT_OBS_MAX_TRIES = 5 diff --git a/core/src/browsergym/core/env.py b/core/src/browsergym/core/env.py index 1476536a..0898e23b 100644 --- a/core/src/browsergym/core/env.py +++ b/core/src/browsergym/core/env.py @@ -4,22 +4,25 @@ import numpy as np import playwright.sync_api import time +import re from abc import ABC from pathlib import Path -from typing import Optional, Literal +from typing import Optional from .chat import Chat from .task import AbstractBrowserTask -from .spaces import Unicode, AnyDict +from .spaces import Unicode, AnyDict, AnyBox from .constants import TEXT_MAX_LENGTH, BROWSERGYM_ID_ATTRIBUTE, EXTRACT_OBS_MAX_TRIES from .observation import ( _pre_extract, _post_extract, extract_screenshot, extract_dom_snapshot, + extract_dom_extra_properties, extract_merged_axtree, extract_focused_element_bid, + MarkingError, ) from .action.base import execute_python_code from .action.highlevel import HighLevelActionSet @@ -28,45 +31,65 @@ class BrowserEnv(gym.Env, ABC): + """The main BrowserGym class, which encapsulates instruction-following Web browsing into a Gymnasium environment.""" + # gym metadata metadata = {"render_modes": None} def __init__( self, + # task-related arguments task_entrypoint: type[AbstractBrowserTask], + 
task_kwargs: dict = {}, + viewport: Optional[dict] = None, # will override the task's viewport + slow_mo: Optional[int] = None, # will override the task's slow_mo + timeout: Optional[int] = None, # will override the task's timeout + # interactive / debugging arguments headless: bool = True, - viewport: dict = {"width": 1280, "height": 720}, - slow_mo: int = 1000, # in milliseconds - timeout: int = 5000, wait_for_user_message: bool = False, - demo_mode: Literal["off", "default", "only_visible_elements"] = "off", - record_video_dir: str = None, - playwright_kwargs: dict = {}, + resizeable_window: bool = False, + record_video_dir: Optional[str] = None, + pw_chromium_kwargs: dict = {}, + pw_context_kwargs: dict = {}, + # agent-related arguments action_mapping: Optional[callable] = HighLevelActionSet().to_python_code, - **task_kwargs, ): + """ + Instantiate a ready to use BrowserEnv gym environment. + + Args: + task_entrypoint: a callable that returns a new task object from a seed. Used for creating a new task during `reset()`. + task_kwargs: additional arguments passed to `task_entrypoint`. + viewport: desired viewport size. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing. + slow_mo: desired slow_mo value for Playwright. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing. + timeout: desired timeout value for Playwright. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing. + headless: whether the browser should run in headless mode or not. This will affect the viewport size, which might change the behaviour and difficulty of the task. Headless mode should only be disabled for debugging/testing. 
+ wait_for_user_message: whether the environment should pause and wait for a user message in the chat after a new message is sent by the agent. Useful for running agents in interactive mode. + resizeable_window: whether the browser window should be resizeable or not. This will affect the viewport size, which might change the behaviour and difficulty of the task. Should only be set for debugging/testing. + record_video_dir: if set, indicates a directory to which viewport videos will be recorded. + pw_chromium_kwargs: extra parameters for the playwright Browser. Should only be used for debugging/testing. + pw_context_kwargs: extra parameters for the playwright BrowserContext. Should only be used for debugging/testing. + action_mapping: if set, the environment will use this function to map every received action to executable Python code. + + """ super().__init__() self.task_entrypoint = task_entrypoint - self.task_kwargs = task_kwargs - self.headless = headless + self.task_kwargs = dict(**task_kwargs) self.viewport = viewport self.slow_mo = slow_mo self.timeout = timeout + self.headless = headless self.wait_for_user_message = wait_for_user_message - self.demo_mode = demo_mode - self.action_mapping = action_mapping + self.resizeable_window = resizeable_window self.record_video_dir = record_video_dir + self.pw_chromium_kwargs = pw_chromium_kwargs + self.pw_context_kwargs = pw_context_kwargs + self.action_mapping = action_mapping # task self.task = None # playwright - self.playwright_kwargs = playwright_kwargs - self.playwright_kwargs.setdefault("headless", self.headless) - self.playwright_kwargs.setdefault("slow_mo", self.slow_mo) - self.playwright_kwargs.setdefault( - "args", [f"--window-size={self.viewport['width']},{self.viewport['height']}"] - ) self.browser: playwright.sync_api.Browser = None self.context: playwright.sync_api.BrowserContext = None self.page: playwright.sync_api.Page = None @@ -93,14 +116,15 @@ def __init__( ), "active_page_index": 
gym.spaces.Box(low=0, high=255, dtype=int), "url": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH), - "screenshot": gym.spaces.Box( - 0, - 255, - shape=(viewport["height"], viewport["width"], 3), + "screenshot": AnyBox( + low=0, + high=255, + shape=(-1, -1, 3), dtype=np.uint8, - ), # swapped axes (height first) + ), # swapped axes (height, width, RGB) "dom_object": AnyDict(), "axtree_object": AnyDict(), + "extra_element_properties": AnyDict(), "focused_element_bid": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH), "last_action": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH), "last_action_error": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH), @@ -124,39 +148,67 @@ def close(self): self.task = None def reset(self, seed=None, *args, **kwargs): - # we need the following line to seed self.np_random super().reset(seed=seed, *args, **kwargs) + self.np_random = None # make sure all randomness is handled by the task if self.task: self.task.teardown() self.context.close() self.chat.close() - else: - pw: playwright.sync_api.Playwright = _get_global_playwright() - # important: change playwright's test id attribute from "data-testid" to "bid" - pw.selectors.set_test_id_attribute(BROWSERGYM_ID_ATTRIBUTE) - self.browser = pw.chromium.launch(**self.playwright_kwargs) + self.browser.close() + + # create a new task + self.task = self.task_entrypoint(seed=seed, **self.task_kwargs) + + def override_property(task, env, property): + """Extract property value from env if not None, otherwise from task.""" + env_value = getattr(env, property) + task_value = getattr(task, property) + if env_value is None: + return task_value + else: + logging.warning( + f"Overriding the task's {property} parameter ({repr(task_value)} => {repr(env_value)}). This might change the task's behaviour and difficulty." 
+ ) + return env_value + + # fetch task's desired parameters for browser setup + viewport = override_property(self.task, self, "viewport") + slow_mo = override_property(self.task, self, "slow_mo") + timeout = override_property(self.task, self, "timeout") + + # use the global Playwright instance + pw: playwright.sync_api.Playwright = _get_global_playwright() + # important: change playwright's test id attribute from "data-testid" to "bid" + pw.selectors.set_test_id_attribute(BROWSERGYM_ID_ATTRIBUTE) + + # create a new browser + self.browser = pw.chromium.launch( + headless=self.headless, + slow_mo=slow_mo, + args=( + [f"--window-size={viewport['width']},{viewport['height']}"] + if self.resizeable_window + else None + ), + # will raise an Exception if above args are overridden + **self.pw_chromium_kwargs, + ) # create a new browser context for pages - t_before = time.time() self.context = self.browser.new_context( - no_viewport=True, + no_viewport=True if self.resizeable_window else None, + viewport=viewport, record_video_dir=( Path(self.record_video_dir) / "task_video" if self.record_video_dir else None ), - record_video_size=self.viewport, - ) - # create the chat at the same time to make sure videos are synced - self.chat = Chat( - headless=self.playwright_kwargs["headless"], - chat_size=(500, max(self.viewport["height"], 800)), - record_video_dir=self.record_video_dir, + record_video_size=viewport, + # will raise an Exception if above args are overridden + **self.pw_context_kwargs, ) - t_after = time.time() - recording_start_time = (t_before + t_after) / 2 # recording start time # set default timeout - self.context.set_default_timeout(self.timeout) + self.context.set_default_timeout(timeout) # hack: keep track of the active page with a javascript callback # there is no concept of active page in playwright @@ -188,13 +240,19 @@ def reset(self, seed=None, *args, **kwargs): """ ) + # create the chat + self.chat = Chat( + headless=self.headless, + chat_size=(500,
max(viewport["height"], 800)), + record_video_dir=self.record_video_dir, + ) + # create a new page self.page = self.context.new_page() + recording_start_time = time.time() - # create and setup a new task - task_seed = self.np_random.integers(np.iinfo(np.int32).max + 1) - self.task = self.task_entrypoint(**self.task_kwargs) - goal, info = self.task.setup(seed=task_seed, page=self.page) + # setup the task + goal, task_info = self.task.setup(page=self.page) # initialize the chat self.chat.add_message( @@ -224,14 +282,27 @@ def reset(self, seed=None, *args, **kwargs): # extract obs and info from environment obs = self._get_obs() + info = {} + info["task_info"] = task_info + + # TODO this is a bit hacky, find a better solution to record videos if self.record_video_dir: info["recording_start_time"] = recording_start_time + info["recording_file"] = str(self.page.video.path()) + info["chat"] = { + "recording_start_time": self.chat.recording_start_time, + "recording_file": str(self.chat.page.video.path()), + } return obs, info def step(self, action: str) -> tuple: self.last_action = action + info = {} + info["action_exec_start"] = time.time() + info["action_exec_timeout"] = 0 + # try to execute the action try: if self.action_mapping: @@ -246,6 +317,11 @@ def step(self, action: str) -> tuple: self.last_action_error = "" except Exception as e: self.last_action_error = f"{type(e).__name__}: {e}" + match = re.match("TimeoutError: Timeout ([0-9]+)ms exceeded.", self.last_action_error) + if match: + info["action_exec_timeout"] = float(match.groups()[0]) / 1000 # ms to sec + + info["action_exec_stop"] = time.time() # wait a bit (for the JavaScript callback to set the active page) time.sleep(0.5) # wait for JS events to be fired (half a second) @@ -262,7 +338,8 @@ def step(self, action: str) -> tuple: self._wait_for_user_message() # extract reward, done, user_message, info (task-specific) - reward, done, user_message, info = self._task_validate() + reward, done, user_message, 
task_info = self._task_validate() + info["task_info"] = task_info # add any user message sent by the task to the chat if user_message: @@ -287,7 +364,7 @@ def _task_validate(self): # safety fix, in case validate() did mess up the active page and/or page history if prev_active_page != self.page or prev_page_history != self.page_history: - logging.warning( + logging.info( "The active page and / or page history has changed during task.validate(). A recovery fix will be applied." ) self.page = prev_active_page @@ -363,13 +440,16 @@ def _get_obs(self): dom = extract_dom_snapshot(self.page) axtree = extract_merged_axtree(self.page) focused_element_bid = extract_focused_element_bid(self.page) - except playwright.sync_api.Error as e: + extra_properties = extract_dom_extra_properties(dom) + except (playwright.sync_api.Error, MarkingError) as e: err_msg = str(e) # try to add robustness to async events (detached / deleted frames) if retries_left > 0 and ( "Frame was detached" in err_msg or "Frame with the given frameId is not found" in err_msg or "Execution context was destroyed" in err_msg + or "Frame has been detached" in err_msg + or "Cannot mark a child frame without a bid" in err_msg ): logging.warning( f"An error occured while extracting the dom and axtree. 
Retrying ({retries_left}/{EXTRACT_OBS_MAX_TRIES} tries left).\n{repr(e)}" @@ -402,6 +482,7 @@ def _get_obs(self): "screenshot": extract_screenshot(self.page), "dom_object": dom, "axtree_object": axtree, + "extra_element_properties": extra_properties, "focused_element_bid": focused_element_bid, "last_action": self.last_action, "last_action_error": self.last_action_error, diff --git a/core/src/browsergym/core/javascript/frame_mark_elements.js b/core/src/browsergym/core/javascript/frame_mark_elements.js index 311ecb5c..3358810d 100644 --- a/core/src/browsergym/core/javascript/frame_mark_elements.js +++ b/core/src/browsergym/core/javascript/frame_mark_elements.js @@ -2,11 +2,7 @@ * Go through all DOM elements in the frame (including shadowDOMs), give them unique browsergym * identifiers (bid), and store custom data in the aria-roledescription attribute. */ -var { innerWidth: windowWidth, innerHeight: windowHeight } = window; -var scrollX = window.scrollX || document.documentElement.scrollLeft; -var scrollY = window.scrollY || document.documentElement.scrollTop; - -([parent_bid, bid_attr_name, iframe_position, super_iframe_offset]) => { +async ([parent_bid, bid_attr_name]) => { // standard html tags // https://www.w3schools.com/tags/ @@ -25,30 +21,39 @@ var scrollY = window.scrollY || document.documentElement.scrollTop; "svg", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr" ]; - - if (super_iframe_offset == null) { - - iframe_offset = { x: scrollX, y: scrollY, right: windowWidth, bottom: windowHeight }; - } - else { - [super_x, super_y, super_right, super_bottom] = [super_iframe_offset["x"], super_iframe_offset["y"], super_iframe_offset["right"], super_iframe_offset["bottom"]]; - - x = Math.max(-iframe_position.x, 0); - y = Math.max(-iframe_position.y, 0); - right = Math.min(...[super_right, windowWidth, super_right - iframe_position.x]); - bottom = Math.min(...[super_bottom, 
windowHeight, super_bottom - iframe_position.y]); - iframe_offset = { x: x, y: y, right: right, bottom: bottom }; - } + const set_of_marks_tags = [ + "input", "textarea", "select", "button", "a", "iframe", "video", "li", "td", "option" + ]; let browsergym_first_visit = false; // if no yet set, set the frame (local) element counter to 0 - if (!("browsergym_frame_elem_counter" in window)) { - window.browsergym_frame_elem_counter = 0; + if (!("browsergym_elem_counter" in window)) { + window.browsergym_elem_counter = 0; + window.browsergym_frame_id_generator = new IFrameIdGenerator(); browsergym_first_visit = true; } + // mechanism for computing all element's visibility + // the intersection observer will set the visibility ratio of elements entering / exiting the viewport + // a set is used to keep track of not-yet-visited elements + let elems_to_be_visited = new Set() + let intersection_observer = new IntersectionObserver( + entries => { + entries.forEach(entry => { + let elem = entry.target; + elem.setAttribute('browsergym_visibility_ratio', Math.round(entry.intersectionRatio * 100) / 100); + if (elems_to_be_visited.has(elem)) { + elems_to_be_visited.delete(elem); + } + }) + }, + { + threshold: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + } + ) // get all DOM elements in the current frame (does not include elements in shadowDOMs) let elements = Array.from(document.querySelectorAll('*')); + let som_buttons = []; i = 0; while (i < elements.length) { const elem = elements[i]; @@ -64,10 +69,14 @@ var scrollY = window.scrollY || document.documentElement.scrollTop; i++; // we will mark only standard HTML tags if (!elem.tagName || !html_tags.includes(elem.tagName.toLowerCase())) { - // console.log(`Skipping element ${elem.outerHTML}`) + // Skipping element continue; // stop and move on to the next element } - // console.log(`Processing element ${elem.outerHTML}`) + // Processing element + // register intersection callback on element, and keep track of element 
for waiting later + elem.setAttribute('browsergym_visibility_ratio', 0); + elems_to_be_visited.add(elem); + intersection_observer.observe(elem); // write dynamic element values to the DOM if (typeof elem.value !== 'undefined') { elem.setAttribute("value", elem.value); @@ -81,7 +90,7 @@ var scrollY = window.scrollY || document.documentElement.scrollTop; elem.removeAttribute("checked"); } } - // add the element global id to a custom HTML attribute + // add the element global id (browsergym id) to a custom HTML attribute // https://playwright.dev/docs/locators#locate-by-test-id // recover the element id if it has one already, else compute a new element id let elem_global_bid; @@ -93,100 +102,169 @@ var scrollY = window.scrollY || document.documentElement.scrollTop; elem_global_bid = elem.getAttribute(bid_attr_name); } else { - let elem_local_id = window.browsergym_frame_elem_counter++; + let elem_local_id = null; + // iFrames get alphabetical ids: 'a', 'b', ..., 'z'. + // if more than 26 iFrames are present, raise an Error + if (['iframe', 'frame'].includes(elem.tagName.toLowerCase())) { + elem_local_id = `${window.browsergym_frame_id_generator.next()}`; + if (elem_local_id.length > 1) { + throw new Error(`More than 26? Such iFrames. BrowserGym not like.`); + } + } + // other elements get numerical ids: '0', '1', '2', ... + else { + elem_local_id = `${window.browsergym_elem_counter++}`; + } if (parent_bid == "") { elem_global_bid = `${elem_local_id}`; } else { - elem_global_bid = `${parent_bid}-${elem_local_id}`; + elem_global_bid = `${parent_bid}${elem_local_id}`; } elem.setAttribute(bid_attr_name, `${elem_global_bid}`); } + // Hack: store custom data inside the aria-roledescription attribute (will be available in DOM and AXTree) // - elem_global_bid: global element identifier (unique over multiple frames) // TODO: add more data if needed (x, y coordinates, bounding box, is_visible, is_clickable etc.) 
- - let [rect, is_in_viewport] = getElementPositionInfo(elem, iframe_offset, iframe_position); - let left = (rect.left + iframe_position.x).toString(); - let top = (rect.top + iframe_position.y ).toString(); - let right = (rect.right + iframe_position.x ).toString(); - let bottom = (rect.bottom + iframe_position.y).toString(); - let center_x = ((rect.left + rect.right) / 2 + iframe_position.x).toString(); - let center_y = ((rect.top + rect.bottom) / 2 + iframe_position.y).toString(); - - elem.setAttribute("browsergym_center", `(${center_x}, ${center_y})`); - elem.setAttribute("browsergym_bounding_box", `(${left}, ${top}, ${right}, ${bottom})`); - elem.setAttribute("browsergym_is_in_viewport", `${is_in_viewport}`); - let original_content = ""; if (elem.hasAttribute("aria-roledescription")) { original_content = elem.getAttribute("aria-roledescription"); } - let new_content = `${elem_global_bid}_${left}_${top}_${center_x}_${center_y}_${right}_${bottom}_${is_in_viewport}_${original_content}` + let new_content = `${elem_global_bid}_${original_content}` elem.setAttribute("aria-roledescription", new_content); + // set-of-marks flag (He et al. 
2024) + // https://github.com/MinorJerry/WebVoyager/blob/main/utils.py + elem.setAttribute("browsergym_set_of_marks", "0"); + // click at center activates self or a child + if (["self", "child"].includes(whoCapturesCenterClick(elem))) { + // has valid tag name, or has click event, or triggers a pointer cursor + if (set_of_marks_tags.includes(elem.tagName.toLowerCase()) || (elem.onclick != null) || (window.getComputedStyle(elem).cursor == "pointer")) { + let rect = elem.getBoundingClientRect(); + let area = (rect.right - rect.left) * (rect.bottom - rect.top); + // area is large enough + if (area >= 20) { + // is not a child of a button (role, type, tag) set to be marked + if (som_buttons.every(button => !button.contains(elem))) { + // is not the sole child of span that has a role and is set to be marked + let parent = elem.parentElement; + if (!(parent && parent.tagName.toLowerCase() == "span" && parent.children.length === 1 && parent.getAttribute("role") && parent.getAttribute("browsergym_set_of_marks") === "1")) { + // all checks have passed, flag the element for inclusion in set-of-marks + elem.setAttribute("browsergym_set_of_marks", "1"); + if (elem.matches('button, a, input[type="button"], div[role="button"]')) { + som_buttons.push(elem) + } + // lastly, remove the set-of-marks flag from all parents, if any + while (parent) { + if (parent.getAttribute("browsergym_set_of_marks") === "1") { + parent.setAttribute("browsergym_set_of_marks", "0") + } + parent = parent.parentElement; + } + } + } + } + } + } + } + + warning_msgs = new Array(); + + // wait for all elements to be visited for visibility + let visibility_marking_timeout = 1000; // ms + try { + await until(() => elems_to_be_visited.size == 0, visibility_marking_timeout); + } catch { + warning_msgs.push(`Frame marking: not all elements have been visited by the intersection_observer after ${visibility_marking_timeout} ms`); } - return iframe_offset; + // disconnect intersection observer + 
intersection_observer.disconnect(); + return warning_msgs; } -function getElementPositionInfo(element, iframe_offset, iframe_position) { - var rect = element.getBoundingClientRect(); - let x = (rect.left + rect.right) / 2 ; - let y = (rect.top + rect.bottom) / 2 ; - //loop over element ancestors (parent) and refine iframe offset to be the most precise possible - var parent = element.parentElement; - parent_iframe_offset = { x: 0, y: 0, right: windowWidth, bottom: windowHeight }; - while (parent !== null) { - var parent_rect = parent.getBoundingClientRect(); - parent_iframe_offset["x"] = Math.max(parent_rect.left , parent_iframe_offset["x"] ); - parent_iframe_offset["y"] = Math.max(parent_rect.top , parent_iframe_offset["y"] ); - parent_iframe_offset["right"] = Math.min(parent_rect.right , parent_iframe_offset["right"] ); - parent_iframe_offset["bottom"] = Math.min(parent_rect.bottom , parent_iframe_offset["bottom"] ); - parent = parent.parentElement; - } - var is_in_viewport = ( - x >= iframe_offset["x"] && - y >= iframe_offset["y"] && - x <= iframe_offset["right"] && - y <= iframe_offset["bottom"] - ); - //this features is broken for the moment - var NotBehindParent = ( - x >= parent_iframe_offset["x"] && - y >= parent_iframe_offset["y"] && - x <= parent_iframe_offset["right"] && - y <= parent_iframe_offset["bottom"] - ); - - var isVisible = (typeof element.offsetWidth === 'undefined' || typeof element.offsetHeight === 'undefined') || (element.offsetWidth > 0 && element.offsetHeight > 0); - - // Return true if the element is both in the viewport and has non-zero dimensions - return [rect, (is_in_viewport && isVisible && IsInFront(element))? 
1 : 0]; +async function until(f, timeout, interval=40) { + return new Promise((resolve, reject) => { + const start_time = Date.now(); + // immediate check + if (f()) { + resolve(); + } + // loop check + const wait = setInterval(() => { + if (f()) { + clearInterval(wait); + resolve(); + } else if (Date.now() - start_time > timeout) { + clearInterval(wait); + reject(); + } + }, interval); + }); } -function IsInFront(element){ +function whoCapturesCenterClick(element){ var rect = element.getBoundingClientRect(); var x = (rect.left + rect.right) / 2 ; var y = (rect.top + rect.bottom) / 2 ; - var newElement = elementFromPoint(x, y); //return the element in the foreground at position (x,y) - if(newElement){ - if(newElement === element) - return true; + var element_at_center = elementFromPoint(x, y); // return the element in the foreground at position (x,y) + if (!element_at_center) { + return "nobody"; + } else if (element_at_center === element) { + return "self"; + } else if (element.contains(element_at_center)) { + return "child"; + } else { + return "non-descendant"; } - return false; } function elementFromPoint(x, y) { - let node = document.elementFromPoint(x, y); + let dom = document; + let last_elem = null; + let elem = null; - let child = node?.shadowRoot?.elementFromPoint(x, y); + do { + last_elem = elem; + elem = dom.elementFromPoint(x, y); + dom = elem?.shadowRoot; + } while(dom && elem !== last_elem); - while (child && child !== node) { - node = child; - child = node?.shadowRoot?.elementFromPoint(x, y); + return elem; +} + +// https://stackoverflow.com/questions/12504042/what-is-a-method-that-can-be-used-to-increment-letters#answer-12504061 +class IFrameIdGenerator { + constructor(chars = 'abcdefghijklmnopqrstuvwxyz') { + this._chars = chars; + this._nextId = [0]; + } + + next() { + const r = []; + for (const char of this._nextId) { + r.unshift(this._chars[char]); + } + this._increment(); + return r.join(''); } - return child || node; + _increment() { + for 
(let i = 0; i < this._nextId.length; i++) { + const val = ++this._nextId[i]; + if (val < this._chars.length) { + return; + } + this._nextId[i] = 0; + } + this._nextId.push(0); + } + + *[Symbol.iterator]() { + while (true) { + yield this.next(); + } + } } diff --git a/core/src/browsergym/core/javascript/frame_unmark_elements.js b/core/src/browsergym/core/javascript/frame_unmark_elements.js index 4a29f15f..578a47b9 100644 --- a/core/src/browsergym/core/javascript/frame_unmark_elements.js +++ b/core/src/browsergym/core/javascript/frame_unmark_elements.js @@ -23,7 +23,7 @@ if (elem.hasAttribute("aria-roledescription")) { let content = elem.getAttribute("aria-roledescription"); // TODO: handle more data if needed - let n_data_items = 8; // bid, bbox_left, bbox_top, center_x, center_y, bbox_right, bbox_bottom, is_in_viewport + let n_data_items = 1; // bid let post_data_index = 0; for (let j = 0 ; j < n_data_items ; j++) { post_data_index = content.indexOf("_", post_data_index) + 1; @@ -35,7 +35,6 @@ else { elem.removeAttribute("aria-roledescription"); } - } } } diff --git a/core/src/browsergym/core/observation.py b/core/src/browsergym/core/observation.py index 3ea9d8ac..e5e63114 100644 --- a/core/src/browsergym/core/observation.py +++ b/core/src/browsergym/core/observation.py @@ -8,10 +8,16 @@ import re from .constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR +from .constants import BROWSERGYM_VISIBILITY_ATTRIBUTE as VIS_ATTR +from .constants import BROWSERGYM_SETOFMARKS_ATTRIBUTE as SOM_ATTR MARK_FRAMES_MAX_TRIES = 3 +class MarkingError(Exception): + pass + + def _pre_extract(page: playwright.sync_api.Page): """ pre-extraction routine, marks dom elements (set bid and dynamic attributes like value and checked) @@ -22,47 +28,41 @@ def _pre_extract(page: playwright.sync_api.Page): # we can't run this loop in JS due to Same-Origin Policy # (can't access the content of an iframe from a another one) - def mark_frames_recursive( - frame, - global_iframe_position, - 
iframe_offset=None, - ): - # get the bid of the parent frame element - try: - parent_bid = frame.frame_element().get_attribute(BID_ATTR) - except: - parent_bid = "" + def mark_frames_recursive(frame, frame_bid: str): + assert frame_bid == "" or (frame_bid.islower() and frame_bid.isalpha()) + # mark all DOM elements in the frame (it will use the parent frame element's bid as a prefix) - super_iframe_offset = frame.evaluate( + warning_msgs = frame.evaluate( js_frame_mark_elements, - [ - parent_bid, - BID_ATTR, - global_iframe_position, - iframe_offset, - ], + [frame_bid, BID_ATTR], ) + # print warning messages if any + for msg in warning_msgs: + logging.warning(msg) # recursively mark all descendant frames - for _, sub_frame in enumerate(frame.child_frames): - if not sub_frame.is_detached(): - is_frame_hidden = sub_frame.evaluate( - """ () => { - const style = window.getComputedStyle(document.documentElement); - const is_null_size = document.documentElement.offsetWidth <= 0 || document.documentElement.offsetHeight <= 0; - return style.display === 'none' || style.visibility === 'hidden' || is_null_size; -}""" + for child_frame in frame.child_frames: + # deal with detached frames + if child_frame.is_detached(): + continue + # deal with weird frames (pdf viewer in <embed>) + child_frame_elem = child_frame.frame_element() + if not child_frame_elem.content_frame() == child_frame: + logging.warning( + f"Skipping frame '{child_frame.name}' for marking, seems problematic."
) - if not is_frame_hidden: - sub_iframe_position = { - key: sub_frame.frame_element().bounding_box()[key] for key in ["x", "y"] - } - mark_frames_recursive(sub_frame, sub_iframe_position, super_iframe_offset) + continue + # deal with sandboxed frames with blocked script execution + sandbox_attr = child_frame_elem.get_attribute("sandbox") + if sandbox_attr is not None and "allow-scripts" not in sandbox_attr.split(): + continue + child_frame_bid = child_frame_elem.get_attribute(BID_ATTR) + if child_frame_bid is None: + raise MarkingError("Cannot mark a child frame without a bid.") + mark_frames_recursive(child_frame, frame_bid=child_frame_bid) # mark all frames recursively - global_iframe_position = {"x": 0, "y": 0} - - mark_frames_recursive(page.main_frame, global_iframe_position) + mark_frames_recursive(page.main_frame, frame_bid="") def _post_extract(page: playwright.sync_api.Page): @@ -73,6 +73,16 @@ def _post_extract(page: playwright.sync_api.Page): # we can't run this loop in JS due to Same-Origin Policy # (can't access the content of an iframe from a another one) for frame in page.frames: + if not frame == page.main_frame: + # deal with weird frames (pdf viewer in <embed>) + if not frame.frame_element().content_frame() == frame: + logging.warning(f"Skipping frame '{frame.name}' for unmarking, seems problematic.") + continue + # deal with sandboxed frames with blocked script execution + sandbox_attr = frame.frame_element().get_attribute("sandbox") + if sandbox_attr is not None and "allow-scripts" not in sandbox_attr.split(): + continue + try: frame.evaluate(js_frame_unmark_elements) except playwright.sync_api.Error as e: @@ -118,29 +128,11 @@ def extract_screenshot(page: playwright.sync_api.Page): # TODO: handle more data items if needed -__BID_EXPR = r"([-0-9]+)" +__BID_EXPR = r"([a-z0-9]+)" __FLOAT_EXPR = r"([+-]?(?:[0-9]*[.])?[0-9]+)" __BOOL_EXPR = r"([01])" # bid, bbox_left, bbox_top, center_x, center_y, bbox_right, bbox_bottom, is_in_viewport -__DATA_REGEXP =
re.compile( - __BID_EXPR - + r"_" - + __FLOAT_EXPR - + r"_" - + __FLOAT_EXPR - + r"_" - + __FLOAT_EXPR - + r"_" - + __FLOAT_EXPR - + r"_" - + __FLOAT_EXPR - + r"_" - + __FLOAT_EXPR - + r"_" - + __BOOL_EXPR - + r"_" - + r"(.*)" -) +__DATA_REGEXP = re.compile(__BID_EXPR + r"_" + r"(.*)") def extract_data_items_from_aria(string): @@ -209,7 +201,7 @@ def extract_dom_snapshot( for node_attributes in document["nodes"]["attributes"]: i = 0 # find the "aria-roledescription" attribute, if any - while i < len(node_attributes): + for i in range(0, len(node_attributes), 2): attr_name_id = node_attributes[i] attr_value_id = node_attributes[i + 1] if attr_name_id == target_attr_name_id: @@ -223,16 +215,179 @@ def extract_dom_snapshot( processed_string_ids.add( attr_value_id ) # mark string as processed (in case several "aria-roledescription" attributes share the same value string) + attr_value = new_attr_value # remove "aria-roledescription" attribute (name and value) if empty - if new_attr_value == "": + if attr_value == "": del node_attributes[i : i + 2] # once "aria-roledescription" is found, exit the search break - i += 2 return dom_snapshot +def extract_dom_extra_properties(dom_snapshot): + def to_string(idx): + if idx == -1: + return None + else: + return dom_snapshot["strings"][idx] + + # pre-locate important string ids + try: + bid_string_id = dom_snapshot["strings"].index(BID_ATTR) + except ValueError: + bid_string_id = -1 + try: + vis_string_id = dom_snapshot["strings"].index(VIS_ATTR) + except ValueError: + vis_string_id = -1 + try: + som_string_id = dom_snapshot["strings"].index(SOM_ATTR) + except ValueError: + som_string_id = -1 + + # build the iframe tree (DFS from the first frame) + doc_properties = { + 0: { + "parent": None, + } + } + + docs_to_process = [0] + while docs_to_process: + doc = docs_to_process.pop(-1) # DFS + + children = dom_snapshot["documents"][doc]["nodes"]["contentDocumentIndex"] + for node, child_doc in zip(children["index"], 
children["value"]): + doc_properties[child_doc] = { + "parent": { + "doc": doc, # parent frame index + "node": node, # node index within the parent frame + } + } + docs_to_process.append(child_doc) + + # recover the absolute x and y position of the frame node in the parent (if any) + parent = doc_properties[doc]["parent"] + if parent: + parent_doc = parent["doc"] + parent_node = parent["node"] + try: + node_layout_idx = dom_snapshot["documents"][parent_doc]["layout"][ + "nodeIndex" + ].index(parent_node) + except ValueError: + node_layout_idx = -1 + if node_layout_idx >= 0: + node_bounds = dom_snapshot["documents"][parent_doc]["layout"]["bounds"][ + node_layout_idx + ] # can be empty? + # absolute position of parent + relative position of frame node within parent + parent_node_abs_x = doc_properties[parent_doc]["abs_pos"]["x"] + node_bounds[0] + parent_node_abs_y = doc_properties[parent_doc]["abs_pos"]["y"] + node_bounds[1] + else: + parent_node_abs_x = 0 + parent_node_abs_y = 0 + else: + parent_node_abs_x = 0 + parent_node_abs_y = 0 + + # get the frame's absolute position, by adding any scrolling offset if any + doc_properties[doc]["abs_pos"] = { + "x": parent_node_abs_x - dom_snapshot["documents"][doc]["scrollOffsetX"], + "y": parent_node_abs_y - dom_snapshot["documents"][doc]["scrollOffsetY"], + } + + document = dom_snapshot["documents"][doc] + doc_properties[doc]["nodes"] = [ + { + "bid": None, # default value, to be filled (str) + "visibility": None, # default value, to be filled (float) + "bbox": None, # default value, to be filled (list) + "clickable": False, # default value, to be filled (bool) + "set_of_marks": None, # default value, to be filled (bool) + } + for _ in enumerate(document["nodes"]["parentIndex"]) + ] # all nodes in document + + # extract clickable property + for node_idx in document["nodes"]["isClickable"]["index"]: + doc_properties[doc]["nodes"][node_idx]["clickable"] = True + + # extract bid and visibility properties (attribute-based) + 
for node_idx, node_attrs in enumerate(document["nodes"]["attributes"]): + i = 0 + # loop over all attributes + for i in range(0, len(node_attrs), 2): + name_string_id = node_attrs[i] + value_string_id = node_attrs[i + 1] + if name_string_id == bid_string_id: + doc_properties[doc]["nodes"][node_idx]["bid"] = to_string(value_string_id) + if name_string_id == vis_string_id: + doc_properties[doc]["nodes"][node_idx]["visibility"] = float( + to_string(value_string_id) + ) + if name_string_id == som_string_id: + doc_properties[doc]["nodes"][node_idx]["set_of_marks"] = ( + to_string(value_string_id) == "1" + ) + + # extract bbox property (in absolute coordinates) + for node_idx, bounds, client_rect in zip( + document["layout"]["nodeIndex"], + document["layout"]["bounds"], + document["layout"]["clientRects"], + ): + # empty clientRect means element is not actually rendered + if not client_rect: + doc_properties[doc]["nodes"][node_idx]["bbox"] = None + else: + # bounds gives the relative position within the document + doc_properties[doc]["nodes"][node_idx]["bbox"] = bounds.copy() + # adjust for absolute document position + doc_properties[doc]["nodes"][node_idx]["bbox"][0] += doc_properties[doc]["abs_pos"][ + "x" + ] + doc_properties[doc]["nodes"][node_idx]["bbox"][1] += doc_properties[doc]["abs_pos"][ + "y" + ] + + # Note: other interesting fields + # document["nodes"]["parentIndex"] # parent node + # document["nodes"]["nodeType"] + # document["nodes"]["nodeName"] + # document["nodes"]["nodeValue"] + # document["nodes"]["textValue"] + # document["nodes"]["inputValue"] + # document["nodes"]["inputChecked"] + # document["nodes"]["optionSelected"] + # document["nodes"]["pseudoType"] + # document["nodes"]["pseudoIdentifier"] + # document["nodes"]["isClickable"] + # document["textBoxes"] + # document["layout"]["nodeIndex"] + # document["layout"]["bounds"] + # document["layout"]["offsetRects"] + # document["layout"]["scrollRects"] + # document["layout"]["clientRects"] + # 
document["layout"]["paintOrders"] + + # collect the extra properties of all nodes with a browsergym_id attribute + extra_properties = {} + for doc in doc_properties.keys(): + for node in doc_properties[doc]["nodes"]: + bid = node["bid"] + if bid: + if bid in extra_properties: + logging.warning(f"duplicate {BID_ATTR}={repr(bid)} attribute detected") + extra_properties[bid] = { + extra_prop: node[extra_prop] + for extra_prop in ("visibility", "bbox", "clickable", "set_of_marks") + } + + return extra_properties + + def extract_all_frame_axtrees(page: playwright.sync_api.Page): """ Extracts the AXTree of all frames (main document and iframes) of a Playwright page using Chrome DevTools Protocol. @@ -289,16 +444,7 @@ def extract_all_frame_axtrees(page: playwright.sync_api.Page): del node["properties"][i] # add all extracted "browsergym" properties to the AXTree if data_items: - ( - browsergym_id, - left, - top, - center_x, - center_y, - right, - bottom, - is_in_viewport, - ) = data_items + (browsergym_id,) = data_items node["properties"].append( { "name": "browsergym_id", @@ -308,38 +454,6 @@ def extract_all_frame_axtrees(page: playwright.sync_api.Page): }, } ) - node["properties"].append( - { - "name": "browsergym_center", - "value": { - "type": "list", - "value": (float(center_x), float(center_y)), - }, - } - ) - node["properties"].append( - { - "name": "browsergym_bounding_box", - "value": { - "type": "list", - "value": ( - float(left), - float(top), - float(right), - float(bottom), - ), - }, - } - ) - node["properties"].append( - { - "name": "browsergym_is_in_viewport", - "value": { - "type": "boolean", - "value": bool(is_in_viewport == "1"), - }, - } - ) return frame_axtrees diff --git a/core/src/browsergym/core/registration.py b/core/src/browsergym/core/registration.py index a8c3350d..dd0e36ed 100644 --- a/core/src/browsergym/core/registration.py +++ b/core/src/browsergym/core/registration.py @@ -5,18 +5,24 @@ from .task import AbstractBrowserTask -def 
register_task(id: str, task_class: Type[AbstractBrowserTask], *args, **kwargs): +def register_task( + id: str, task_class: Type[AbstractBrowserTask], nondeterministic: bool = True, *args, **kwargs +): """ Registers a browser task as a gym environment with its unique id. Args: - task: the task class to register. + id: the id of the task to register (will be prepended by "browsergym/"). + task_class: the task class to register. + nondeterministic: whether the task cannot be guaranteed deterministic transitions. + *args: additional arguments for the browsergym environment. + *kwargs: additional arguments for the browsergym environment. """ gym.register( id=f"browsergym/{id}", - entry_point=lambda *args, **kwargs: BrowserEnv(task_class, *args, **kwargs), - nondeterministic=task_class.nondeterministic, + entry_point=lambda *env_args, **env_kwargs: BrowserEnv(task_class, *env_args, **env_kwargs), + nondeterministic=nondeterministic, *args, **kwargs, ) diff --git a/core/src/browsergym/core/spaces.py b/core/src/browsergym/core/spaces.py index f5e1f005..177959e5 100644 --- a/core/src/browsergym/core/spaces.py +++ b/core/src/browsergym/core/spaces.py @@ -77,3 +77,42 @@ def __repr__(self) -> str: def __eq__(self, other: Any) -> bool: """Check whether ``other`` is equivalent to this instance.""" return isinstance(other, AnyDict) + + +class AnyBox(Space[NDArray[Any]]): + """A space representing an arbitrary dictionary object.""" + + def __init__(self, low, high, shape, dtype): + super().__init__(shape, dtype) + self.low = low + self.high = high + + def contains(self, x: Any) -> bool: + """Return boolean specifying if x is a valid member of this space.""" + if not isinstance(x, np.ndarray): + try: + x = np.asarray(x, dtype=self.dtype) + except (ValueError, TypeError): + return False + + return bool( + np.can_cast(x.dtype, self.dtype) + and len(x.shape) == len(self.shape) + and all([dim in (xdim, -1) for xdim, dim in zip(x.shape, self.shape)]) + and np.all(x >= self.low) + and 
np.all(x <= self.high) + ) + + def __repr__(self) -> str: + """Gives a string representation of this space.""" + return f"AnyBox(low={repr(self.low)}, high={repr(self.high)}, shape={repr(self.shape)}, dtype={repr(self.dtype)})" + + def __eq__(self, other: Any) -> bool: + """Check whether ``other`` is equivalent to this instance.""" + return ( + isinstance(other, AnyBox) + and self.low == other.low + and self.high == other.high + and self.shape == other.shape + and self.dtype == other.dtype + ) diff --git a/core/src/browsergym/core/task.py b/core/src/browsergym/core/task.py index e3ad8720..6555223d 100644 --- a/core/src/browsergym/core/task.py +++ b/core/src/browsergym/core/task.py @@ -1,3 +1,4 @@ +import numpy as np import playwright.sync_api from abc import ABC, abstractmethod @@ -10,21 +11,27 @@ class AbstractBrowserTask(ABC): """ - # gym metadata (default values, can be overloaded) - nondeterministic: bool = True - @classmethod @abstractmethod def get_task_id(cls): pass + def __init__(self, seed: int) -> None: + # initiate a random number generator + self.random = np.random.RandomState(seed) + + # task properties, will be used to set up the browsergym environment + # default values, can be overriden in children classes + self.viewport = {"width": 1280, "height": 720} + self.slow_mo = 1000 # ms + self.timeout = 5000 # ms + @abstractmethod - def setup(self, seed: int, page: playwright.sync_api.Page) -> tuple[str, dict]: + def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: """ Set up everything needed to execute the task. Args: - seed: a seed for the task's randomness. page: the active playwright page. Returns: @@ -71,17 +78,19 @@ class OpenEndedTask(AbstractBrowserTask): def get_task_id(cls): return "openended" - def __init__(self, start_url: str, goal: str = None) -> None: + def __init__(self, seed: int, start_url: str, goal: str = None) -> None: """ Args: + seed: random seed. start_url: str, the url for the starting page. 
goal: str, the initial goal. """ + super().__init__(seed) self.start_url = start_url self.goal = goal - def setup(self, seed: int, page: playwright.sync_api.Page) -> tuple[str, dict]: + def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: page.goto(self.start_url, timeout=10000) return self.goal, {} diff --git a/core/src/browsergym/utils/obs.py b/core/src/browsergym/utils/obs.py index 795179ee..f63ff2b7 100644 --- a/core/src/browsergym/utils/obs.py +++ b/core/src/browsergym/utils/obs.py @@ -1,8 +1,16 @@ import ast +import numpy as np +import PIL.Image +import PIL.ImageDraw +import PIL.ImageFont from collections import defaultdict from bs4 import BeautifulSoup +from browsergym.core.constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR +from browsergym.core.constants import BROWSERGYM_VISIBILITY_ATTRIBUTE as VIS_ATTR +from browsergym.core.constants import BROWSERGYM_SETOFMARKS_ATTRIBUTE as SOM_ATTR + IGNORED_AXTREE_ROLES = ["LineBreak"] IGNORED_AXTREE_PROPERTIES = ( @@ -17,104 +25,139 @@ def flatten_dom_to_str( - DOM_tree, + dom_snapshot, + extra_properties: dict = None, with_visible: bool = False, + with_clickable: bool = False, with_center_coords: bool = False, with_bounding_box_coords: bool = False, + with_som: bool = False, filter_visible_only: bool = False, + filter_with_bid_only: bool = False, + filter_som_only: bool = False, coord_decimals: int = 0, ) -> str: """Formats a DOM snapshot into a string text""" - coord_format = f":0.{coord_decimals}f" + def to_string(idx): + if idx == -1: + return None + else: + return dom_snapshot["strings"][idx] - def parse_DOM(document_idx) -> str: - # adopted from [natbot](https://github.com/nat/natbot) + def parse_document(document_idx) -> str: + # adapted from [natbot](https://github.com/nat/natbot) - strings = DOM_tree["strings"] - nodes = DOM_tree["documents"][document_idx]["nodes"] - node_iframe_link = nodes["contentDocumentIndex"] - graph = defaultdict(lambda: []) + nodes = 
dom_snapshot["documents"][document_idx]["nodes"] + node_children = defaultdict(lambda: []) for node_idx in range(len(nodes["nodeName"])): parent_idx = nodes["parentIndex"][node_idx] if parent_idx != -1: - graph[parent_idx].append(node_idx) - - def dfs(idx: int) -> str: - node_name = strings[nodes["nodeName"][idx]] - can_skip = ( - "#" in node_name or "::" in node_name or node_name == "html" - ) # We skip the root node - node_name = node_name.lower().strip() - html = "" - - # print node opening tag - if not can_skip: - html += f"<{node_name}" - node_attr_idxs = nodes["attributes"][idx] - if node_attr_idxs: - for i in range(0, len(node_attr_idxs), 2): - attr_name = strings[node_attr_idxs[i]] - - # filter visible elements if requested - if ( - filter_visible_only - and attr_name == "browsergym_is_in_viewport" - and strings[node_attr_idxs[i + 1]] == "0" - ): - can_skip = True - break - - # print browsergym attributes if requested (with new names) - if attr_name == "browsergym_is_in_viewport": - if with_visible: - attr_value = strings[node_attr_idxs[i + 1]] - html += f' is_visible="{attr_value}"' - elif attr_name == "browsergym_center": - if with_center_coords: - attr_value = strings[node_attr_idxs[i + 1]] - html += f' center="{_get_coord_str(attr_value, coord_decimals)}"' - - elif attr_name == "browsergym_bounding_box": - if with_bounding_box_coords: - attr_value = strings[node_attr_idxs[i + 1]] - html += f' box="{_get_coord_str(attr_value, coord_decimals)}"' - - # print other attributes + node_children[parent_idx].append(node_idx) + + def dfs(node_idx: int, parent_node_skipped: bool) -> str: + + # https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType + # https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeName + # https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeValue + + node_type = nodes["nodeType"][node_idx] + node_name = to_string(nodes["nodeName"][node_idx]) + node_value = to_string(nodes["nodeValue"][node_idx]) + html_before = "" + 
html_after = "" + skip_node = False + + # text nodes: print text content only if parent was not skipped + if node_type == 3: # node_name == "#text" + if not parent_node_skipped and node_value is not None: + html_before += node_value + + # CData nodes: print content only if parent was not skipped + elif node_type == 4: # node_name == "#cdata-section": + if not parent_node_skipped and node_value is not None: + html_before += f"" + + # processing instructions, comments, documents, doctypes, document fragments: don't print + elif node_type in (7, 8, 9, 10, 11): + skip_node = True + + # now we should have an element node + else: + assert node_type == 1 + + tag_name = node_name.lower().strip() + attributes = [] # to be printed as attributes with the tag + bid = None + + # parse node attributes + node_attr_idxs = nodes["attributes"][node_idx] + for i in range(0, len(node_attr_idxs), 2): + attr_name = to_string(node_attr_idxs[i]) + attr_value = to_string(node_attr_idxs[i + 1]) + + # extract and print bid + if attr_name == BID_ATTR: + bid = attr_value + # ignore browsergym attributes + elif attr_name in (VIS_ATTR, SOM_ATTR): + pass + # print other attributes + else: + if attr_value is None: + # attribute value missing + attributes.append(f"{attr_name}") else: - if node_attr_idxs[i + 1] >= 0: - attr_value = strings[node_attr_idxs[i + 1]] - # attribute value present - html += f' {attr_name}="{attr_value}"' - else: - # attribute value missing - html += f" {attr_name}" - - html += f">" - if can_skip: - html = "" - # print inner text - node_value_idx = nodes["nodeValue"][idx] - if node_value_idx >= 0: - html += " ".join(strings[node_value_idx].split()) + # attribute value present + attributes.append(f'{attr_name}="{attr_value}"') + + skip_node, extra_attributes_to_print = _process_bid( + bid, + extra_properties=extra_properties, + with_visible=with_visible, + with_clickable=with_clickable, + with_center_coords=with_center_coords, + 
with_bounding_box_coords=with_bounding_box_coords, + with_som=with_som, + filter_visible_only=filter_visible_only, + filter_with_bid_only=filter_with_bid_only, + filter_som_only=filter_som_only, + coord_decimals=coord_decimals, + ) + + # insert extra attributes before regular attributes + attributes = extra_attributes_to_print + attributes + + # insert bid as first attribute + if bid is not None: + attributes.insert(0, f'bid="{bid}"') + + if not skip_node: + # print node opening tag, with its attributes + html_before += f"<{tag_name}" + " ".join([""] + attributes) + ">" + # print node closing tag + html_after += f"" + + html = "" + html += html_before # recursively print iframe nodes if any - if idx in node_iframe_link["index"]: - sub_document_idx = node_iframe_link["value"][node_iframe_link["index"].index(idx)] - html += parse_DOM(document_idx=sub_document_idx) + if node_idx in nodes["contentDocumentIndex"]["index"]: + sub_document_idx = nodes["contentDocumentIndex"]["value"][ + nodes["contentDocumentIndex"]["index"].index(node_idx) + ] + html += parse_document(document_idx=sub_document_idx) - # recursively print children nodes - for child_idx in graph[idx]: - html += dfs(child_idx) + # recursively print children nodes if any + for child_idx in node_children[node_idx]: + html += dfs(node_idx=child_idx, parent_node_skipped=skip_node) - # print node closing tag - if not can_skip: - html += f"" + html += html_after return html - html = dfs(idx=0) + html = dfs(node_idx=0, parent_node_skipped=False) # Format the HTML document with indentation soup = BeautifulSoup(html, "lxml") @@ -122,7 +165,7 @@ def dfs(idx: int) -> str: return html - html = parse_DOM(0) + html = parse_document(document_idx=0) return html @@ -149,96 +192,305 @@ def _get_coord_str(coord, decimals): return f"({coord_str})" +def _process_bid( + bid, + extra_properties: dict = None, + with_visible: bool = False, + with_clickable: bool = False, + with_center_coords: bool = False, + 
with_bounding_box_coords: bool = False, + with_som: bool = False, + filter_visible_only: bool = False, + filter_with_bid_only: bool = False, + filter_som_only: bool = False, + coord_decimals: int = 0, +): + """ + Process extra attributes and attribute-based filters, for the element with the given bid. + + Returns: + A flag indicating if the element should be skipped or not (due to filters). + Attributes to be printed, as a list of "x=y" strings. + """ + + if extra_properties is None: + if any( + ( + with_visible, + with_clickable, + with_center_coords, + with_bounding_box_coords, + with_som, + filter_visible_only, + filter_with_bid_only, + filter_som_only, + ) + ): + raise ValueError("extra_properties argument required") + else: + extra_properties = {} + + skip_element = False + attributes_to_print = [] + + if bid is None: + # skip nodes without a bid (if requested) + if filter_with_bid_only: + skip_element = True + if filter_som_only: + skip_element = True + if filter_visible_only: + # element without bid have no visibility mark, they could be visible or non-visible + pass # TODO: we consider them as visible. Is this what we want? 
+ + # parse extra browsergym properties, if node has a bid + else: + if bid in extra_properties: + node_vis = extra_properties[bid]["visibility"] + node_bbox = extra_properties[bid]["bbox"] + node_is_clickable = extra_properties[bid]["clickable"] + node_in_som = extra_properties[bid]["set_of_marks"] + node_is_visible = node_vis >= 0.5 + # skip non-visible nodes (if requested) + if filter_visible_only and not node_is_visible: + skip_element = True + if filter_som_only and not node_in_som: + skip_element = True + # print extra attributes if requested (with new names) + if with_som and node_in_som: + attributes_to_print.insert(0, f'som="{int(node_in_som)}"') + if with_visible: + attributes_to_print.insert(0, f'visible="{int(node_is_visible)}"') + if with_clickable and node_is_clickable: + attributes_to_print.insert(0, f'clickable="{int(node_is_clickable)}"') + if with_center_coords and node_bbox is not None: + x, y, width, height = node_bbox + center = (x + width / 2, y + height / 2) + attributes_to_print.insert(0, f'center="{_get_coord_str(center, coord_decimals)}"') + if with_bounding_box_coords and node_bbox is not None: + x, y, width, height = node_bbox + box = (x, y, x + width, y + height) + attributes_to_print.insert(0, f'box="{_get_coord_str(box, coord_decimals)}"') + + return skip_element, attributes_to_print + + def flatten_axtree_to_str( AX_tree, + extra_properties: dict = None, with_visible: bool = False, + with_clickable: bool = False, with_center_coords: bool = False, with_bounding_box_coords: bool = False, + with_som: bool = False, filter_visible_only: bool = False, + filter_with_bid_only: bool = False, + filter_som_only: bool = False, coord_decimals: int = 0, - ignore_roles=IGNORED_AXTREE_ROLES, + ignored_roles=IGNORED_AXTREE_ROLES, ignored_properties=IGNORED_AXTREE_PROPERTIES, - remove_rdundant_static_text: bool = True, + remove_redundant_static_text: bool = True, ) -> str: """Formats the accessibility tree into a string text""" node_id_to_idx = {} for 
idx, node in enumerate(AX_tree["nodes"]): node_id_to_idx[node["nodeId"]] = idx - def dfs(idx: int, depth: int) -> str: + def dfs(node_idx: int, depth: int, parent_node_filtered: bool) -> str: tree_str = "" - node = AX_tree["nodes"][idx] + node = AX_tree["nodes"][node_idx] indent = "\t" * depth - valid_node = True - role = node["role"]["value"] + skip_node = False + filter_node = False + node_role = node["role"]["value"] - if role in ignore_roles: + if node_role in ignored_roles: + skip_node = True pass elif "name" not in node: + skip_node = True pass else: - print_node = True - name = node["name"]["value"] - node_str = f"{role} {repr(name.strip())}" - + node_name = node["name"]["value"] if "value" in node and "value" in node["value"]: - node_str += f' value: {repr(node["value"]["value"])}' + node_value = node["value"]["value"] + else: + node_value = None - properties = [] + attributes = [] + bid = None for property in node.get("properties", []): if not "value" in property: continue if not "value" in property["value"]: continue - prop_name, value = property["name"], property["value"]["value"] + prop_name = property["name"] + prop_value = property["value"]["value"] + if prop_name == "browsergym_id": - node_str = f"[{value}] " + node_str - elif prop_name == "browsergym_center": - if with_center_coords: - coord_str = _get_coord_str(value, coord_decimals) - node_str += f", center={coord_str}" - elif prop_name == "browsergym_bounding_box": - if with_bounding_box_coords: - coord_str = _get_coord_str(value, coord_decimals) - node_str += f", box={coord_str}" - elif prop_name == "browsergym_is_in_viewport": - # filter visible elements if requested - if filter_visible_only and not value: - print_node = False - break - if with_visible: - visible_str = "visible" if value else "hidden" - node_str += f", {visible_str}" + bid = prop_value + elif prop_name in ignored_properties: + continue elif prop_name in ("required", "focused", "atomic"): - if value: - 
properties.append(prop_name) - elif prop_name not in ignored_properties: - properties.append(f"{prop_name}={repr(value)}") + if prop_value: + attributes.append(prop_name) + else: + attributes.append(f"{prop_name}={repr(prop_value)}") + + if node_role == "generic" and not attributes: + skip_node = True + + if node_role == "StaticText": + if parent_node_filtered: + skip_node = True + else: + filter_node, extra_attributes_to_print = _process_bid( + bid, + extra_properties=extra_properties, + with_visible=with_visible, + with_clickable=with_clickable, + with_center_coords=with_center_coords, + with_bounding_box_coords=with_bounding_box_coords, + with_som=with_som, + filter_visible_only=filter_visible_only, + filter_with_bid_only=filter_with_bid_only, + filter_som_only=filter_som_only, + coord_decimals=coord_decimals, + ) + + # if either is True, skip the node + skip_node = skip_node or filter_node + + # insert extra attributes before regular attributes + attributes = extra_attributes_to_print + attributes + + # actually print the node string + if not skip_node: + node_str = f"{node_role} {repr(node_name.strip())}" + + if bid is not None: + node_str = f"[{bid}] " + node_str + + if node_value is not None: + node_str += f' value={repr(node["value"]["value"])}' + + if attributes: + node_str += ", ".join([""] + attributes) - if role in ["generic"] and not properties: - print_node = False - - if properties: - node_str += " " + ", ".join(properties) - - if print_node: tree_str += f"{indent}{node_str}" - for _, child_node_id in enumerate(node["childIds"]): + for child_node_id in node["childIds"]: if child_node_id not in node_id_to_idx or child_node_id == node["nodeId"]: continue # mark this to save some tokens - child_depth = depth + 1 if valid_node else depth - child_str = dfs(node_id_to_idx[child_node_id], child_depth) - if child_str.strip(): - if tree_str.strip(): + child_depth = depth if skip_node else (depth + 1) + child_str = dfs( + node_id_to_idx[child_node_id], 
child_depth, parent_node_filtered=filter_node + ) + if child_str: + if tree_str: tree_str += "\n" tree_str += child_str return tree_str - tree_str = dfs(0, 0) - if remove_rdundant_static_text: + tree_str = dfs(0, 0, False) + if remove_redundant_static_text: tree_str = _remove_redundant_static_text(tree_str) return tree_str + + +def overlay_som( + screenshot: np.typing.ArrayLike, + extra_properties: dict, + fontsize: int = 12, + linewidth: int = 2, + tag_margin: int = 2, +): + img = PIL.Image.fromarray(screenshot).copy() # make a copy + img = img.convert(mode="RGBA") + draw = PIL.ImageDraw.Draw(img) + + font = PIL.ImageFont.load_default(size=fontsize) + + # https://stackoverflow.com/questions/51908563/dotted-or-dashed-line-with-python-pillow/58885306#58885306 + import math # math has the fastest sqrt + + def linedashed(draw: PIL.ImageDraw.Draw, x0, y0, x1, y1, fill, width, dashlen=4, ratio=3): + dx = x1 - x0 # delta x + dy = y1 - y0 # delta y + # check whether we can avoid sqrt + if dy == 0: + vlen = dx + elif dx == 0: + vlen = dy + else: + vlen = math.sqrt(dx * dx + dy * dy) # length of line + xa = dx / vlen # x add for 1px line length + ya = dy / vlen # y add for 1px line length + step = dashlen * ratio # step to the next dash + a0 = 0 + while a0 < vlen: + a1 = a0 + dashlen + if a1 > vlen: + a1 = vlen + draw.line( + (x0 + xa * a0, y0 + ya * a0, x0 + xa * a1, y0 + ya * a1), fill=fill, width=width + ) + a0 += step + + for bid, properties in extra_properties.items(): + if properties["set_of_marks"] and properties["bbox"]: + x, y, width, height = properties["bbox"] + + # draw bounding box with dashed lines + linedashed(draw, x, y, x + width, y, fill=(0, 0, 0, 255), width=linewidth) + linedashed( + draw, x + width, y, x + width, y + height, fill=(0, 0, 0, 255), width=linewidth + ) + linedashed( + draw, x, y + height, x + width, y + height, fill=(0, 0, 0, 255), width=linewidth + ) + linedashed(draw, x, y, x, y + height, fill=(0, 0, 0, 255), width=linewidth) + + # get 
text box size (left, top, right, bottom) + tag_box = font.getbbox( + bid, + ) + + # set tag size, including margins + tag_size = ( + (tag_box[2] - tag_box[0] + 2 * (tag_margin + 1)), + (tag_box[3] - tag_box[1] + 2 * (tag_margin + 1)), + ) + + # create tag image with correct size and black background + tag_img = PIL.Image.new("RGBA", tag_size, "black") + tag_draw = PIL.ImageDraw.Draw(tag_img) + # write text with 1px horizontal margin + tag_draw.text( + (-tag_box[0] + tag_margin + 1, -tag_box[1] + tag_margin + 1), + bid, + font=font, + fill=(255, 255, 255, 255), + spacing=0, + ) + tag_draw.rectangle( + (0, 0, tag_size[0] - 1, tag_size[1] - 1), + fill=None, + outline=(255, 255, 255, 255), + width=1, + ) + + # draw tag in the source image, upper left of the bounding box + tag_pos = (x + 0, y - tag_size[1] / 2 + 4) + tag_pos = list(map(round, tag_pos)) + img.paste(tag_img, tag_pos) + + # convert to RGB (3 channels) + img = img.convert(mode="RGB") + # convert to a numpy array + img = np.array(img) + + return img diff --git a/core/tests/data/test_page_2.html b/core/tests/data/test_page_2.html new file mode 100644 index 00000000..250a8170 --- /dev/null +++ b/core/tests/data/test_page_2.html @@ -0,0 +1,63 @@ + + + + + Simple Form + + + +

Simple Form

+ +
+ +

+ + +

+ + +

+ +
+

+ + +

+ + + +
+ + + Text within in non-html tag + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Text that should not be visible

+ + + diff --git a/core/tests/test_actions_highlevel.py b/core/tests/test_actions_highlevel.py index b1723048..779746c2 100644 --- a/core/tests/test_actions_highlevel.py +++ b/core/tests/test_actions_highlevel.py @@ -97,7 +97,7 @@ def test_valid_action(): env = gym.make( "browsergym/openended", - start_url=CHECKBOX_URL, + task_kwargs={"start_url": CHECKBOX_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -288,7 +288,7 @@ def test_invalid_action(): env = gym.make( "browsergym/openended", - start_url=CHECKBOX_URL, + task_kwargs={"start_url": CHECKBOX_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -304,7 +304,7 @@ def test_invalid_action(): obs, reward, term, trunc, info = env.step(action) # error - assert "TimeoutError" in obs["last_action_error"] + assert "ValueError" in obs["last_action_error"] # invalid bid value type action = f"""\ @@ -450,7 +450,7 @@ def test_click_through_frames(): env = gym.make( "browsergym/openended", - start_url=MULTI_IFRAME_URL, + task_kwargs={"start_url": MULTI_IFRAME_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -490,7 +490,7 @@ def test_fill_through_iframe(): env = gym.make( "browsergym/openended", - start_url=MULTI_IFRAME_URL, + task_kwargs={"start_url": MULTI_IFRAME_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -534,7 +534,7 @@ def test_click(): env = gym.make( "browsergym/openended", - start_url=CHECKBOX_URL, + task_kwargs={"start_url": CHECKBOX_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -606,7 +606,7 @@ def test_hover(): env = gym.make( "browsergym/openended", - start_url=HOVER_URL, + task_kwargs={"start_url": HOVER_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -651,7 +651,7 @@ def test_fill_type_press(): action_set = HighLevelActionSet(subsets=["bid", "coord"]) env = gym.make( "browsergym/openended", - start_url=TEXT_INPUT_URL, + task_kwargs={"start_url": TEXT_INPUT_URL}, headless=__HEADLESS, 
slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -806,7 +806,7 @@ def test_key_press(): env = gym.make( "browsergym/openended", - start_url=TEXT_INPUT_URL, + task_kwargs={"start_url": TEXT_INPUT_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -848,7 +848,7 @@ def test_goto(): env = gym.make( "browsergym/openended", - start_url=url1, + task_kwargs={"start_url": url1}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -896,26 +896,22 @@ def test_scroll(): env = gym.make( "browsergym/openended", - start_url=LONG_PAGE_URL, - headless=False, + task_kwargs={"start_url": LONG_PAGE_URL}, + headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, action_mapping=action_set.to_python_code, ) def extract_coords_from_elem(elem): - x, y = map( - float, - re.search( - r"\(([-+]?[0-9\.]+),[\s]*([-+]?[0-9\.]+)\)", - elem.get("center"), - ).groups(), - ) - return x, y + return ast.literal_eval(elem.get("center")) def get_top_bottom_elems(obs): soup = bs4.BeautifulSoup( - flatten_dom_to_str(obs["dom_object"], with_center_coords=True), "lxml" + flatten_dom_to_str( + obs["dom_object"], obs["extra_element_properties"], with_center_coords=True + ), + "lxml", ) top = soup.find("input", attrs={"type": "checkbox", "id": "top"}) bottom = soup.find("input", attrs={"type": "checkbox", "id": "bottom"}) @@ -1009,7 +1005,7 @@ def get_top_bottom_elems(obs): # def test_meta_action(): # env = BrowserEnv( # task_entrypoint=OpenEndedTask, -# start_url=TEXT_INPUT_URL, +# task_kwargs={"start_url": TEXT_INPUT_URL}, # headless=__HEADLESS__, # ) # obs, info = env.reset() @@ -1143,7 +1139,7 @@ def get_top_bottom_elems(obs): # def test_clear_success(): # env = BrowserEnv( # task_entrypoint=OpenEndedTask, -# start_url=TEXT_INPUT_URL, +# task_kwargs={"start_url": TEXT_INPUT_URL}, # headless=__HEADLESS__, # ) # obs, info = env.reset() @@ -1213,7 +1209,7 @@ def get_top_bottom_elems(obs): # """In this test, we try to build a ClearAction but we use invalid args, and we check that the 
action fails when executed in the environment""" # env = BrowserEnv( # task_entrypoint=OpenEndedTask, -# start_url=TEXT_INPUT_URL, +# task_kwargs={"start_url": TEXT_INPUT_URL}, # headless=__HEADLESS__, # ) # obs, info = env.reset() @@ -1307,7 +1303,7 @@ def test_mouse_down_up(): env = gym.make( "browsergym/openended", - start_url=CHECKBOX_URL, + task_kwargs={"start_url": CHECKBOX_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -1316,7 +1312,10 @@ def test_mouse_down_up(): def get_checkbox_elem(obs): soup = bs4.BeautifulSoup( - flatten_dom_to_str(obs["dom_object"], with_center_coords=True), "lxml" + flatten_dom_to_str( + obs["dom_object"], obs["extra_element_properties"], with_center_coords=True + ), + "lxml", ) checkbox = soup.find("input", attrs={"type": "checkbox", "id": "vehicle1"}) return checkbox diff --git a/core/tests/test_gym_envs.py b/core/tests/test_gym_envs.py index 0ebea442..968cf525 100644 --- a/core/tests/test_gym_envs.py +++ b/core/tests/test_gym_envs.py @@ -23,7 +23,7 @@ __DATA_DIR = pathlib.Path(__file__).resolve().parent / "data" TEST_PAGE = f"file://{__DATA_DIR}/test_page.html" BASIC_IFRAME_PAGE = f"file://{__DATA_DIR}/basic_iframe_site/basic_iframe_2.html" -DEMO_MODES = ["default", "only_visible_elements"] +DEMO_MODES = ["default", "only_visible_elements", "all_blue"] def test_gym_env(): @@ -31,7 +31,7 @@ def test_gym_env(): env = gym.make( "browsergym/openended", - start_url=TEST_PAGE, + task_kwargs={"start_url": TEST_PAGE}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -92,7 +92,7 @@ def test_max_episode_steps(): # no max_steps env = gym.make( "browsergym/openended", - start_url=TEST_PAGE, + task_kwargs={"start_url": TEST_PAGE}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -112,7 +112,7 @@ def test_max_episode_steps(): # max_steps = 2 env = gym.make( "browsergym/openended", - start_url=TEST_PAGE, + task_kwargs={"start_url": TEST_PAGE}, headless=__HEADLESS, slow_mo=__SLOW_MO, 
timeout=__TIMEOUT, @@ -137,7 +137,7 @@ def test_active_page(): action_set = PythonActionSet() env = gym.make( "browsergym/openended", - start_url=TEST_PAGE, + task_kwargs={"start_url": TEST_PAGE}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -191,11 +191,10 @@ def test_nested_iframes_default_demo_mode(): action_set = HighLevelActionSet(demo_mode=demo_mode) env = gym.make( "browsergym/openended", - start_url=BASIC_IFRAME_PAGE, + task_kwargs={"start_url": BASIC_IFRAME_PAGE}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, - demo_mode=demo_mode, action_mapping=action_set.to_python_code, ) obs, info = env.reset() @@ -228,12 +227,11 @@ def test_demo_mode(demo_mode): action_set = HighLevelActionSet(demo_mode=demo_mode) env = gym.make( "browsergym/openended", - start_url=TEST_PAGE, + task_kwargs={"start_url": TEST_PAGE}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, action_mapping=action_set.to_python_code, - demo_mode=demo_mode, ) obs, info = env.reset() assert not obs["last_action_error"] @@ -282,3 +280,23 @@ def test_demo_mode(demo_mode): assert checkbox.has_attr("checked") env.close() + + +@pytest.mark.parametrize("resizeable_window", (True, False)) +@pytest.mark.parametrize("size", ((1600, 1200), (800, 800))) +def test_resizeable_window(resizeable_window, size): + env = gym.make( + "browsergym/openended", + task_kwargs={"start_url": TEST_PAGE}, + headless=__HEADLESS, + slow_mo=__SLOW_MO, + timeout=__TIMEOUT, + viewport={"width": size[0], "height": size[1]}, + resizeable_window=resizeable_window, + ) + obs, info = env.reset() + assert not obs["last_action_error"] + + assert (obs["screenshot"].shape[1], obs["screenshot"].shape[0]) == size + + env.close() diff --git a/core/tests/test_observation.py b/core/tests/test_observation.py index 2d594752..a488dbc0 100644 --- a/core/tests/test_observation.py +++ b/core/tests/test_observation.py @@ -17,7 +17,7 @@ ) # bugfix: use same playwright instance in browsergym and pytest - +from 
utils import setup_playwright from browsergym.core.observation import ( _pre_extract, @@ -37,6 +37,7 @@ __DATA_DIR = Path(__file__).resolve().parent / "data" TEST_PAGE = f"file://{__DATA_DIR}/test_page.html" +TEST_PAGE_2 = f"file://{__DATA_DIR}/test_page_2.html" MULTI_IFRAME_URL = f"file://{__DATA_DIR}/basic_iframe_site/basic_iframe_2.html" SHADOW_DOM_URL = f"file://{__DATA_DIR}/basic_shadow_dom_site/basic_shadow_dom.html" SIMPLE_SHADOW_DOM_URL = f"file://{__DATA_DIR}/basic_shadow_dom_site/simple_shadow_dom.html" @@ -52,7 +53,7 @@ def test_extract_screenshot(): env = gym.make( "browsergym/openended", - start_url=TEST_PAGE, + task_kwargs={"start_url": TEST_PAGE}, headless=__HEADLESS, slow_mo=__SLOW_MO, viewport=__VIEWPORT, @@ -78,7 +79,7 @@ def test_extract_screenshot(): def test_extract_axtree_simple(): env = gym.make( "browsergym/openended", - start_url=TEST_PAGE, + task_kwargs={"start_url": TEST_PAGE}, headless=__HEADLESS, slow_mo=__SLOW_MO, viewport=__VIEWPORT, @@ -101,7 +102,7 @@ def test_extract_axtree_simple(): def test_extract_axtree_multi_iframe(): env = gym.make( "browsergym/openended", - start_url=MULTI_IFRAME_URL, + task_kwargs={"start_url": MULTI_IFRAME_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, viewport=__VIEWPORT, @@ -130,7 +131,7 @@ def test_extract_axtree_multi_iframe(): def test_extract_dom_simple(): env = gym.make( "browsergym/openended", - start_url=TEST_PAGE, + task_kwargs={"start_url": TEST_PAGE}, headless=__HEADLESS, slow_mo=__SLOW_MO, viewport=__VIEWPORT, @@ -151,7 +152,7 @@ def test_extract_dom_simple(): def test_extract_dom_multi_iframe(): env = gym.make( "browsergym/openended", - start_url=MULTI_IFRAME_URL, + task_kwargs={"start_url": MULTI_IFRAME_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, viewport=__VIEWPORT, @@ -172,7 +173,7 @@ def test_extract_dom_multi_iframe(): def test_simple_shadowdom(): env = gym.make( "browsergym/openended", - start_url=SIMPLE_SHADOW_DOM_URL, + task_kwargs={"start_url": SIMPLE_SHADOW_DOM_URL}, 
headless=__HEADLESS, slow_mo=__SLOW_MO, viewport=__VIEWPORT, @@ -206,7 +207,7 @@ def test_simple_shadowdom(): def test_nested_shadowdom(): env = gym.make( "browsergym/openended", - start_url=SHADOW_DOM_URL, + task_kwargs={"start_url": SHADOW_DOM_URL}, headless=__HEADLESS, slow_mo=__SLOW_MO, viewport=__VIEWPORT, @@ -252,7 +253,7 @@ def test_nested_shadowdom(): def test_dom_has_bids_no_aria(url): env = gym.make( "browsergym/openended", - start_url=url, + task_kwargs={"start_url": url}, headless=__HEADLESS, slow_mo=__SLOW_MO, viewport=__VIEWPORT, @@ -325,7 +326,7 @@ def test_dom_has_bids_no_aria(url): def test_dom_to_text(): env = gym.make( "browsergym/openended", - start_url=TEST_PAGE, + task_kwargs={"start_url": TEST_PAGE_2}, headless=__HEADLESS, slow_mo=__SLOW_MO, timeout=__TIMEOUT, @@ -354,13 +355,58 @@ def test_dom_to_text(): assert "Janice" in dom assert "janice@mail.com" in dom + dom = flatten_dom_to_str( + obs["dom_object"], + extra_properties=obs["extra_element_properties"], + with_visible=True, + with_clickable=True, + with_center_coords=True, + with_bounding_box_coords=True, + with_som=True, + ) + assert 'box="(' in dom + assert 'center="(' in dom + assert 'clickable="1" som="1" type="submit" value="Submit" visible="1"' in dom + assert 'head bid="1" visible="0"' in dom + assert 'clickable="1" for="email" visible="1"' in dom + assert "Text within in non-html tag" in dom + assert "Text that should not be visible" in dom + + dom = flatten_dom_to_str( + obs["dom_object"], extra_properties=obs["extra_element_properties"], filter_som_only=True + ) + assert 'for="email"' not in dom + assert 'type="submit" value="Submit"' in dom + assert "Text within in non-html tag" not in dom + assert "Text that should not be visible" not in dom + + dom = flatten_dom_to_str( + obs["dom_object"], + extra_properties=obs["extra_element_properties"], + filter_visible_only=True, + ) + assert "=3.10 + - pip + + - pip: + - -r requirements.txt \ No newline at end of file diff --git 
a/dev/requirements.txt b/dev/requirements.txt new file mode 100644 index 00000000..749af812 --- /dev/null +++ b/dev/requirements.txt @@ -0,0 +1,10 @@ +black[jupyter]==24.2.0 +blacken-docs +pre-commit +pytest==7.3.2 +pytest-xdist +pytest-playwright +tenacity +-e ../core # local package +-e ../miniwob # local package +-e ../webarena # local package diff --git a/miniwob/requirements.txt b/miniwob/requirements.txt index 11fd6522..984338fe 100644 --- a/miniwob/requirements.txt +++ b/miniwob/requirements.txt @@ -1 +1 @@ -browsergym-core==0.1.0rc7 +browsergym-core==0.2.0 diff --git a/miniwob/src/browsergym/miniwob/__init__.py b/miniwob/src/browsergym/miniwob/__init__.py index 84ba88f0..9ca3a3f2 100644 --- a/miniwob/src/browsergym/miniwob/__init__.py +++ b/miniwob/src/browsergym/miniwob/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.0rc7" +__version__ = "0.2.0" from browsergym.core.registration import register_task @@ -137,5 +137,5 @@ register_task( task.get_task_id(), task, - kwargs={"viewport": {"width": 500, "height": 320}, "slow_mo": 100}, + nondeterministic=task.nondeterministic, ) diff --git a/miniwob/src/browsergym/miniwob/all.py b/miniwob/src/browsergym/miniwob/all.py index 062e7cdf..2e310509 100644 --- a/miniwob/src/browsergym/miniwob/all.py +++ b/miniwob/src/browsergym/miniwob/all.py @@ -137,6 +137,39 @@ class ClickMenu2Task(AbstractMiniwobTask): desc = "Find a specific item from a menu." 
subdomain = "click-menu-2" + def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: + goal, info = super().setup(page) + + if self.remove_human_display: + get_utterance_func = "getUtterance_legacy" + else: + get_utterance_func = "getUtterance" + + # this task requires specific treatment to recover the text goal + page.evaluate( + f"core.{get_utterance_func} = function () " + + r"""{ + query_div = document.getElementById('query'); + if (query_div.children.length > 0) { + utterance = ''; + utterance = utterance + query_div.childNodes[0].textContent.replace(/\s+/g, ' ').trim(); + utterance = utterance + ' "' + query_div.children[0].getAttribute('class').split(' ')[1] + '"'; + utterance = utterance + ' ' + query_div.childNodes[2].textContent.replace(/\s+/g, ' ').trim(); + } + else { + utterance = query_div.textContent.replace(/\s+/g, ' ').trim(); + } + return utterance; +}; +''; +""" + ) + + # re-extract the goal + goal = self._get_goal() + + return goal, info + class ClickOptionTask(AbstractMiniwobTask): desc = "Click option boxes." @@ -610,8 +643,8 @@ class UseColorwheel2Task(AbstractMiniwobTask): desc = "Use a color wheel given specific random color." 
subdomain = "use-colorwheel-2" - def setup(self, seed: int, page: playwright.sync_api.Page) -> tuple[str, dict]: - goal, info = super().setup(seed, page) + def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: + goal, info = super().setup(page) if self.remove_human_display: get_utterance_func = "getUtterance_legacy" diff --git a/miniwob/src/browsergym/miniwob/base.py b/miniwob/src/browsergym/miniwob/base.py index 1c394cf6..84bb8d3a 100644 --- a/miniwob/src/browsergym/miniwob/base.py +++ b/miniwob/src/browsergym/miniwob/base.py @@ -11,7 +11,8 @@ class AbstractMiniwobTask(AbstractBrowserTask): """ - nondeterministic = False + # gym metadata (default value, can be overloaded per task) + nondeterministic: bool = False @classmethod def get_task_id(cls): @@ -19,17 +20,26 @@ def get_task_id(cls): def __init__( self, + seed: int, base_url: Optional[str] = None, episode_max_time: int = 1000000, remove_human_display: bool = True, ) -> None: """ Args: + seed: random seed. base_url: str (optional), the base Miniwob URL where the task's HTML file is to be found. If not provided, the MINIWOB_URL environment variable will be used. episode_max_time: int, episode max time in milliseconds. Default: 1000000 ms. remove_human_display: bool, whether or not to remove the human display (goal, time left, last reward etc.) from the DOM. Default: True. 
""" + super().__init__(seed) + + # task properties, will be used to set up the browsergym environment + self.viewport = {"width": 500, "height": 320} + self.slow_mo = 100 # ms + self.timeout = 5000 # ms + assert episode_max_time > 0 # if not provided, try to get Miniwob URL from environment variable @@ -45,7 +55,7 @@ def __init__( self.episode_max_time = episode_max_time self.remove_human_display = remove_human_display - def setup(self, seed: int, page: playwright.sync_api.Page) -> tuple[str, dict]: + def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: self.page = page # navigate to the task's url @@ -118,7 +128,7 @@ def setup(self, seed: int, page: playwright.sync_api.Page) -> tuple[str, dict]: core.EPISODE_MAX_TIME = {episode_max_time}; core.startEpisodeReal(); """.format( - seed=seed, + seed=self.random.randint(0, 1000000), episode_max_time=self.episode_max_time, ) ) diff --git a/miniwob/tests/test_base.py b/miniwob/tests/test_base.py index baeacf90..9fb3ba1f 100644 --- a/miniwob/tests/test_base.py +++ b/miniwob/tests/test_base.py @@ -30,8 +30,8 @@ def test_validate_teardown(task_cls): context = browser.new_context() page = context.new_page() - task = task_cls() - task.setup(seed=42, page=page) + task = task_cls(seed=42) + task.setup(page=page) reward, done, msg, info = task.validate(page, []) @@ -51,8 +51,8 @@ def test_episode_max_time(task_cls): context = browser.new_context() page = context.new_page() - task = task_cls(episode_max_time=0.2) - task.setup(seed=42, page=page) + task = task_cls(seed=42, episode_max_time=0.2) + task.setup(page=page) time.sleep(0.5) @@ -78,8 +78,8 @@ def test_remove_human_display(task_cls): context = browser.new_context() page = context.new_page() - task = task_cls(remove_human_display=True) - task.setup(seed=42, page=page) + task = task_cls(seed=42, remove_human_display=True) + task.setup(page=page) for element_id in ["reward-display", "click-canvas", "sync-task-cover"]: element_in_dom = 
page.evaluate(f"!!document.getElementById('{element_id}')") @@ -100,8 +100,8 @@ def test_remove_human_display(task_cls): context = browser.new_context() page = context.new_page() - task = task_cls(remove_human_display=False) - task.setup(seed=42, page=page) + task = task_cls(seed=42, remove_human_display=False) + task.setup(page=page) for element_id in ["reward-display", "click-canvas", "sync-task-cover"]: element_in_dom = page.evaluate(f"!!document.getElementById('{element_id}')") diff --git a/miniwob/tests/test_click-menu-2.py b/miniwob/tests/test_click-menu-2.py new file mode 100644 index 00000000..3851da2f --- /dev/null +++ b/miniwob/tests/test_click-menu-2.py @@ -0,0 +1,84 @@ +import os +import gymnasium as gym +import re +import pytest + +# register gym environments +import browsergym.miniwob + +# bugfix: use same playwright instance in browsergym and pytest +from utils import setup_playwright + +__SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None +__HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True + + +@pytest.mark.parametrize("seed", range(5)) +def test_cheat(seed): + env = gym.make( + "browsergym/miniwob.click-menu-2", + headless=__HEADLESS, + slow_mo=__SLOW_MO, + action_mapping=None, + ) + obs, info = env.reset(seed=seed) + + assert obs["last_action_error"] == "" + + match1 = re.match( + 'Click the "Menu" button, and then find and click on the item labeled "(.+)".', obs["goal"] + ) + match2 = re.match( + 'Click the "Menu" button, and then find and click on the item with the "(.+)" icon.', + obs["goal"], + ) + + assert match1 or match2 + + if match1: + item_label = match1.groups()[0] + item_classname = { + "Save": "ui-icon-disk", + "Prev": "ui-icon-seek-start", + "Stop": "ui-icon-stop", + "Play": "ui-icon-play", + "Next": "ui-icon-seek-end", + "Zoom In": "ui-icon-zoomin", + "Zoom Out": "ui-icon-zoomout", + }[item_label] + else: + item_classname = match2.groups()[0] + + action = f"""\ +page.get_by_text("Menu").click() +""" + + obs, 
reward, term, trunc, info = env.step(action) + + assert obs["last_action_error"] == "" + assert reward == 0 + assert term == False + + if item_classname in ("ui-icon-seek-start", "ui-icon-stop", "ui-icon-play", "ui-icon-seek-end"): + + action = f"""\ +page.get_by_text("Playback").click() +""" + + obs, reward, term, trunc, info = env.step(action) + + assert obs["last_action_error"] == "" + assert reward == 0 + assert term == False + + action = f"""\ +page.locator(".{item_classname}").click() +""" + + obs, reward, term, trunc, info = env.step(action) + + assert obs["last_action_error"] == "" + assert reward == 1 + assert term == True + + env.close() diff --git a/pyproject.toml b/pyproject.toml index 917640fd..707a375b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,12 +25,12 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "License :: OSI Approved :: Apache Software License", ] -version="0.1.0rc7" +version="0.2.0" dependencies = [ - "browsergym-core==0.1.0rc7", - "browsergym-miniwob==0.1.0rc7", - "browsergym-webarena==0.1.0rc7", - "browsergym-workarena==0.1.0rc7", + "browsergym-core==0.2.0", + "browsergym-miniwob==0.2.0", + "browsergym-webarena==0.2.0", + "browsergym-workarena==0.2.0", ] [tool.setuptools] diff --git a/webarena/requirements.txt b/webarena/requirements.txt index bbd6fb1f..200725f0 100644 --- a/webarena/requirements.txt +++ b/webarena/requirements.txt @@ -1,2 +1,2 @@ -browsergym-core==0.1.0rc7 +browsergym-core==0.2.0 libwebarena==0.0.2 diff --git a/webarena/src/browsergym/webarena/__init__.py b/webarena/src/browsergym/webarena/__init__.py index ec79724e..1c5d0753 100644 --- a/webarena/src/browsergym/webarena/__init__.py +++ b/webarena/src/browsergym/webarena/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.0rc7" +__version__ = "0.2.0" from browsergym.core.registration import register_task @@ -14,6 +14,6 @@ register_task( gym_id, GenericWebArenaTask, - kwargs={"task_id": task_id, "viewport": {"width": 1280, 
"height": 720}, "timeout": 10000}, + kwargs={"task_kwargs": {"task_id": task_id}}, ) ALL_WEBARENA_TASK_IDS.append(gym_id) diff --git a/webarena/src/browsergym/webarena/instance.py b/webarena/src/browsergym/webarena/instance.py index 80f8ba10..70648bf9 100644 --- a/webarena/src/browsergym/webarena/instance.py +++ b/webarena/src/browsergym/webarena/instance.py @@ -99,3 +99,12 @@ def ui_login(self, site: str, page: playwright.sync_api.Page): page.get_by_label("Username").fill(username) page.get_by_label("Password").fill(password) page.get_by_role("button", name="Sign in").click() + + case "wikipedia": + page.goto(url) + + case "map": + page.goto(url) + + case _: + raise ValueError diff --git a/webarena/src/browsergym/webarena/task.py b/webarena/src/browsergym/webarena/task.py index 2d4e4338..a7683c8e 100644 --- a/webarena/src/browsergym/webarena/task.py +++ b/webarena/src/browsergym/webarena/task.py @@ -20,11 +20,19 @@ class GenericWebArenaTask(AbstractBrowserTask): def __init__( self, + seed: int, task_id: Optional[int] = None, intent_template_id: Optional[int] = None, with_na_hint: bool = False, with_homepage_hint: bool = False, ) -> None: + super().__init__(seed) + + # task properties, will be used to set up the browsergym environment + self.viewport = {"width": 1280, "height": 720} + self.slow_mo = 1000 # ms + self.timeout = 10000 # ms + self.webarena_instance = WebArenaInstance() self.config_file: str = None self.with_na_hint = with_na_hint @@ -74,12 +82,10 @@ def __init__( self.task_configs = task_configs - def setup(self, seed: int, page: playwright.sync_api.Page) -> tuple[str, dict]: + def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # import webarena on instanciation from webarena.evaluation_harness.evaluators import evaluator_router - self.random = np.random.RandomState(seed) - # pick a task at random self.config = self.random.choice(self.task_configs)