+
+
+
+
+
+
\ No newline at end of file
diff --git a/core/src/browsergym/core/chat_files/img/send.svg b/core/src/browsergym/core/chat_files/img/send.svg
new file mode 100644
index 00000000..7d5705f5
--- /dev/null
+++ b/core/src/browsergym/core/chat_files/img/send.svg
@@ -0,0 +1,3 @@
+
diff --git a/core/src/browsergym/core/constants.py b/core/src/browsergym/core/constants.py
index e1e59a7b..3169920d 100644
--- a/core/src/browsergym/core/constants.py
+++ b/core/src/browsergym/core/constants.py
@@ -1,4 +1,7 @@
TEXT_MAX_LENGTH = 2**32 - 1
BROWSERGYM_ID_ATTRIBUTE = "bid" # Playwright's default is "data-testid"
+BROWSERGYM_VISIBILITY_ATTRIBUTE = "browsergym_visibility_ratio"
+BROWSERGYM_SETOFMARKS_ATTRIBUTE = "browsergym_set_of_marks"
+
EXTRACT_OBS_MAX_TRIES = 5
diff --git a/core/src/browsergym/core/env.py b/core/src/browsergym/core/env.py
index 1476536a..0898e23b 100644
--- a/core/src/browsergym/core/env.py
+++ b/core/src/browsergym/core/env.py
@@ -4,22 +4,25 @@
import numpy as np
import playwright.sync_api
import time
+import re
from abc import ABC
from pathlib import Path
-from typing import Optional, Literal
+from typing import Optional
from .chat import Chat
from .task import AbstractBrowserTask
-from .spaces import Unicode, AnyDict
+from .spaces import Unicode, AnyDict, AnyBox
from .constants import TEXT_MAX_LENGTH, BROWSERGYM_ID_ATTRIBUTE, EXTRACT_OBS_MAX_TRIES
from .observation import (
_pre_extract,
_post_extract,
extract_screenshot,
extract_dom_snapshot,
+ extract_dom_extra_properties,
extract_merged_axtree,
extract_focused_element_bid,
+ MarkingError,
)
from .action.base import execute_python_code
from .action.highlevel import HighLevelActionSet
@@ -28,45 +31,65 @@
class BrowserEnv(gym.Env, ABC):
+ """The main BrowserGym class, which encapsulates instruction-following Web browsing into a Gymnasium environment."""
+
# gym metadata
metadata = {"render_modes": None}
def __init__(
self,
+ # task-related arguments
task_entrypoint: type[AbstractBrowserTask],
+ task_kwargs: dict = {},
+ viewport: Optional[dict] = None, # will override the task's viewport
+ slow_mo: Optional[int] = None, # will override the task's slow_mo
+ timeout: Optional[int] = None, # will override the task's timeout
+ # interactive / debugging arguments
headless: bool = True,
- viewport: dict = {"width": 1280, "height": 720},
- slow_mo: int = 1000, # in milliseconds
- timeout: int = 5000,
wait_for_user_message: bool = False,
- demo_mode: Literal["off", "default", "only_visible_elements"] = "off",
- record_video_dir: str = None,
- playwright_kwargs: dict = {},
+ resizeable_window: bool = False,
+ record_video_dir: Optional[str] = None,
+ pw_chromium_kwargs: dict = {},
+ pw_context_kwargs: dict = {},
+ # agent-related arguments
action_mapping: Optional[callable] = HighLevelActionSet().to_python_code,
- **task_kwargs,
):
+ """
+ Instantiate a ready to use BrowserEnv gym environment.
+
+ Args:
+ task_entrypoint: a callable that returns a new task object from a seed. Used for creating a new task during `reset()`.
+ task_kwargs: additional arguments passed to `task_entrypoint`.
+ viewport: desired viewport size. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
+ slow_mo: desired slow_mo value for Playwright. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
+ timeout: desired timeout value for Playwright. This will override the value defined by the task, which might change its behaviour and difficulty. Should only be set for debugging/testing.
+ headless: whether the browser should run in headless mode or not. This will affect the viewport size, which might change the behaviour and difficulty of the task. Headless mode should only be disabled for debugging/testing.
+ wait_for_user_message: whether the environment should pause and wait for a user message in the chat after a new message is sent by the agent. Useful for running agents in interactive mode.
+ resizeable_window: whether the browser window should be resizeable or not. This will affect the viewport size, which might change the behaviour and difficulty of the task. Should only be set for debugging/testing.
+ record_video_dir: if set, indicates a directory to which viewport videos will be recorded.
+ pw_chromium_kwargs: extra parameters for the playwright Browser. Should only be used for debugging/testing.
+ pw_context_kwargs: extra parameters for the playwright BrowserContext. Should only be used for debugging/testing.
+ action_mapping: if set, the environment will use this function to map every received action to executable Python code.
+
+ """
super().__init__()
self.task_entrypoint = task_entrypoint
- self.task_kwargs = task_kwargs
- self.headless = headless
+ self.task_kwargs = dict(**task_kwargs)
self.viewport = viewport
self.slow_mo = slow_mo
self.timeout = timeout
+ self.headless = headless
self.wait_for_user_message = wait_for_user_message
- self.demo_mode = demo_mode
- self.action_mapping = action_mapping
+ self.resizeable_window = resizeable_window
self.record_video_dir = record_video_dir
+ self.pw_chromium_kwargs = pw_chromium_kwargs
+ self.pw_context_kwargs = pw_context_kwargs
+ self.action_mapping = action_mapping
# task
self.task = None
# playwright
- self.playwright_kwargs = playwright_kwargs
- self.playwright_kwargs.setdefault("headless", self.headless)
- self.playwright_kwargs.setdefault("slow_mo", self.slow_mo)
- self.playwright_kwargs.setdefault(
- "args", [f"--window-size={self.viewport['width']},{self.viewport['height']}"]
- )
self.browser: playwright.sync_api.Browser = None
self.context: playwright.sync_api.BrowserContext = None
self.page: playwright.sync_api.Page = None
@@ -93,14 +116,15 @@ def __init__(
),
"active_page_index": gym.spaces.Box(low=0, high=255, dtype=int),
"url": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH),
- "screenshot": gym.spaces.Box(
- 0,
- 255,
- shape=(viewport["height"], viewport["width"], 3),
+ "screenshot": AnyBox(
+ low=0,
+ high=255,
+ shape=(-1, -1, 3),
dtype=np.uint8,
- ), # swapped axes (height first)
+ ), # swapped axes (height, width, RGB)
"dom_object": AnyDict(),
"axtree_object": AnyDict(),
+ "extra_element_properties": AnyDict(),
"focused_element_bid": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH),
"last_action": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH),
"last_action_error": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH),
@@ -124,39 +148,67 @@ def close(self):
self.task = None
def reset(self, seed=None, *args, **kwargs):
- # we need the following line to seed self.np_random
super().reset(seed=seed, *args, **kwargs)
+ self.np_random = None # make sure all randomness is handled by the task
if self.task:
self.task.teardown()
self.context.close()
self.chat.close()
- else:
- pw: playwright.sync_api.Playwright = _get_global_playwright()
- # important: change playwright's test id attribute from "data-testid" to "bid"
- pw.selectors.set_test_id_attribute(BROWSERGYM_ID_ATTRIBUTE)
- self.browser = pw.chromium.launch(**self.playwright_kwargs)
+ self.browser.close()
+
+ # create a new task
+ self.task = self.task_entrypoint(seed=seed, **self.task_kwargs)
+
+        def override_property(task, env, property):
+            """Return env's value for `property` when it is not None (logging a warning about the override), otherwise the task's value."""
+            env_value = getattr(env, property)
+            task_value = getattr(task, property)
+            # nothing requested at the environment level: keep what the task asks for
+            if env_value is None:
+                return task_value
+            logging.warning(
+                f"Overriding the task's {property} parameter ({repr(task_value)} => {repr(env_value)}). This might change the task's behaviour and difficulty."
+            )
+            return env_value
+
+ # fetch task's desired parameters for browser setup
+ viewport = override_property(self.task, self, "viewport")
+ slow_mo = override_property(self.task, self, "slow_mo")
+ timeout = override_property(self.task, self, "timeout")
+
+ # use the global Playwright instance
+ pw: playwright.sync_api.Playwright = _get_global_playwright()
+ # important: change playwright's test id attribute from "data-testid" to "bid"
+ pw.selectors.set_test_id_attribute(BROWSERGYM_ID_ATTRIBUTE)
+
+ # create a new browser
+ self.browser = pw.chromium.launch(
+ headless=self.headless,
+ slow_mo=slow_mo,
+ args=(
+ [f"--window-size={viewport['width']},{viewport['height']}"]
+ if self.resizeable_window
+ else None
+ ),
+            # will raise an Exception if above args are overridden
+ **self.pw_chromium_kwargs,
+ )
# create a new browser context for pages
- t_before = time.time()
self.context = self.browser.new_context(
- no_viewport=True,
+ no_viewport=True if self.resizeable_window else None,
+ viewport=viewport,
record_video_dir=(
Path(self.record_video_dir) / "task_video" if self.record_video_dir else None
),
- record_video_size=self.viewport,
- )
- # create the chat at the same time to make sure videos are synced
- self.chat = Chat(
- headless=self.playwright_kwargs["headless"],
- chat_size=(500, max(self.viewport["height"], 800)),
- record_video_dir=self.record_video_dir,
+ record_video_size=viewport,
+            # will raise an Exception if above args are overridden
+ **self.pw_context_kwargs,
)
- t_after = time.time()
- recording_start_time = (t_before + t_after) / 2 # recording start time
# set default timeout
- self.context.set_default_timeout(self.timeout)
+ self.context.set_default_timeout(timeout)
# hack: keep track of the active page with a javascript callback
# there is no concept of active page in playwright
@@ -188,13 +240,19 @@ def reset(self, seed=None, *args, **kwargs):
"""
)
+ # create the chat
+ self.chat = Chat(
+ headless=self.headless,
+ chat_size=(500, max(viewport["height"], 800)),
+ record_video_dir=self.record_video_dir,
+ )
+
# create a new page
self.page = self.context.new_page()
+ recording_start_time = time.time()
- # create and setup a new task
- task_seed = self.np_random.integers(np.iinfo(np.int32).max + 1)
- self.task = self.task_entrypoint(**self.task_kwargs)
- goal, info = self.task.setup(seed=task_seed, page=self.page)
+ # setup the task
+ goal, task_info = self.task.setup(page=self.page)
# initialize the chat
self.chat.add_message(
@@ -224,14 +282,27 @@ def reset(self, seed=None, *args, **kwargs):
# extract obs and info from environment
obs = self._get_obs()
+ info = {}
+ info["task_info"] = task_info
+
+ # TODO this is a bit hacky, find a better solution to record videos
if self.record_video_dir:
info["recording_start_time"] = recording_start_time
+ info["recording_file"] = str(self.page.video.path())
+ info["chat"] = {
+ "recording_start_time": self.chat.recording_start_time,
+ "recording_file": str(self.chat.page.video.path()),
+ }
return obs, info
def step(self, action: str) -> tuple:
self.last_action = action
+ info = {}
+ info["action_exec_start"] = time.time()
+ info["action_exec_timeout"] = 0
+
# try to execute the action
try:
if self.action_mapping:
@@ -246,6 +317,11 @@ def step(self, action: str) -> tuple:
self.last_action_error = ""
except Exception as e:
self.last_action_error = f"{type(e).__name__}: {e}"
+ match = re.match("TimeoutError: Timeout ([0-9]+)ms exceeded.", self.last_action_error)
+ if match:
+ info["action_exec_timeout"] = float(match.groups()[0]) / 1000 # ms to sec
+
+ info["action_exec_stop"] = time.time()
# wait a bit (for the JavaScript callback to set the active page)
time.sleep(0.5) # wait for JS events to be fired (half a second)
@@ -262,7 +338,8 @@ def step(self, action: str) -> tuple:
self._wait_for_user_message()
# extract reward, done, user_message, info (task-specific)
- reward, done, user_message, info = self._task_validate()
+ reward, done, user_message, task_info = self._task_validate()
+ info["task_info"] = task_info
# add any user message sent by the task to the chat
if user_message:
@@ -287,7 +364,7 @@ def _task_validate(self):
# safety fix, in case validate() did mess up the active page and/or page history
if prev_active_page != self.page or prev_page_history != self.page_history:
- logging.warning(
+ logging.info(
"The active page and / or page history has changed during task.validate(). A recovery fix will be applied."
)
self.page = prev_active_page
@@ -363,13 +440,16 @@ def _get_obs(self):
dom = extract_dom_snapshot(self.page)
axtree = extract_merged_axtree(self.page)
focused_element_bid = extract_focused_element_bid(self.page)
- except playwright.sync_api.Error as e:
+ extra_properties = extract_dom_extra_properties(dom)
+ except (playwright.sync_api.Error, MarkingError) as e:
err_msg = str(e)
# try to add robustness to async events (detached / deleted frames)
if retries_left > 0 and (
"Frame was detached" in err_msg
or "Frame with the given frameId is not found" in err_msg
or "Execution context was destroyed" in err_msg
+ or "Frame has been detached" in err_msg
+ or "Cannot mark a child frame without a bid" in err_msg
):
logging.warning(
f"An error occured while extracting the dom and axtree. Retrying ({retries_left}/{EXTRACT_OBS_MAX_TRIES} tries left).\n{repr(e)}"
@@ -402,6 +482,7 @@ def _get_obs(self):
"screenshot": extract_screenshot(self.page),
"dom_object": dom,
"axtree_object": axtree,
+ "extra_element_properties": extra_properties,
"focused_element_bid": focused_element_bid,
"last_action": self.last_action,
"last_action_error": self.last_action_error,
diff --git a/core/src/browsergym/core/javascript/frame_mark_elements.js b/core/src/browsergym/core/javascript/frame_mark_elements.js
index 311ecb5c..3358810d 100644
--- a/core/src/browsergym/core/javascript/frame_mark_elements.js
+++ b/core/src/browsergym/core/javascript/frame_mark_elements.js
@@ -2,11 +2,7 @@
* Go through all DOM elements in the frame (including shadowDOMs), give them unique browsergym
* identifiers (bid), and store custom data in the aria-roledescription attribute.
*/
-var { innerWidth: windowWidth, innerHeight: windowHeight } = window;
-var scrollX = window.scrollX || document.documentElement.scrollLeft;
-var scrollY = window.scrollY || document.documentElement.scrollTop;
-
-([parent_bid, bid_attr_name, iframe_position, super_iframe_offset]) => {
+async ([parent_bid, bid_attr_name]) => {
// standard html tags
// https://www.w3schools.com/tags/
@@ -25,30 +21,39 @@ var scrollY = window.scrollY || document.documentElement.scrollTop;
"svg", "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead",
"time", "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"
];
-
- if (super_iframe_offset == null) {
-
- iframe_offset = { x: scrollX, y: scrollY, right: windowWidth, bottom: windowHeight };
- }
- else {
- [super_x, super_y, super_right, super_bottom] = [super_iframe_offset["x"], super_iframe_offset["y"], super_iframe_offset["right"], super_iframe_offset["bottom"]];
-
- x = Math.max(-iframe_position.x, 0);
- y = Math.max(-iframe_position.y, 0);
- right = Math.min(...[super_right, windowWidth, super_right - iframe_position.x]);
- bottom = Math.min(...[super_bottom, windowHeight, super_bottom - iframe_position.y]);
- iframe_offset = { x: x, y: y, right: right, bottom: bottom };
- }
+ const set_of_marks_tags = [
+ "input", "textarea", "select", "button", "a", "iframe", "video", "li", "td", "option"
+ ];
let browsergym_first_visit = false;
// if no yet set, set the frame (local) element counter to 0
- if (!("browsergym_frame_elem_counter" in window)) {
- window.browsergym_frame_elem_counter = 0;
+ if (!("browsergym_elem_counter" in window)) {
+ window.browsergym_elem_counter = 0;
+ window.browsergym_frame_id_generator = new IFrameIdGenerator();
browsergym_first_visit = true;
}
+ // mechanism for computing all element's visibility
+ // the intersection observer will set the visibility ratio of elements entering / exiting the viewport
+ // a set is used to keep track of not-yet-visited elements
+ let elems_to_be_visited = new Set()
+ let intersection_observer = new IntersectionObserver(
+ entries => {
+ entries.forEach(entry => {
+ let elem = entry.target;
+ elem.setAttribute('browsergym_visibility_ratio', Math.round(entry.intersectionRatio * 100) / 100);
+ if (elems_to_be_visited.has(elem)) {
+ elems_to_be_visited.delete(elem);
+ }
+ })
+ },
+ {
+ threshold: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
+ }
+ )
// get all DOM elements in the current frame (does not include elements in shadowDOMs)
let elements = Array.from(document.querySelectorAll('*'));
+ let som_buttons = [];
i = 0;
while (i < elements.length) {
const elem = elements[i];
@@ -64,10 +69,14 @@ var scrollY = window.scrollY || document.documentElement.scrollTop;
i++;
// we will mark only standard HTML tags
if (!elem.tagName || !html_tags.includes(elem.tagName.toLowerCase())) {
- // console.log(`Skipping element ${elem.outerHTML}`)
+ // Skipping element
continue; // stop and move on to the next element
}
- // console.log(`Processing element ${elem.outerHTML}`)
+ // Processing element
+ // register intersection callback on element, and keep track of element for waiting later
+ elem.setAttribute('browsergym_visibility_ratio', 0);
+ elems_to_be_visited.add(elem);
+ intersection_observer.observe(elem);
// write dynamic element values to the DOM
if (typeof elem.value !== 'undefined') {
elem.setAttribute("value", elem.value);
@@ -81,7 +90,7 @@ var scrollY = window.scrollY || document.documentElement.scrollTop;
elem.removeAttribute("checked");
}
}
- // add the element global id to a custom HTML attribute
+ // add the element global id (browsergym id) to a custom HTML attribute
// https://playwright.dev/docs/locators#locate-by-test-id
// recover the element id if it has one already, else compute a new element id
let elem_global_bid;
@@ -93,100 +102,169 @@ var scrollY = window.scrollY || document.documentElement.scrollTop;
elem_global_bid = elem.getAttribute(bid_attr_name);
}
else {
- let elem_local_id = window.browsergym_frame_elem_counter++;
+ let elem_local_id = null;
+ // iFrames get alphabetical ids: 'a', 'b', ..., 'z'.
+ // if more than 26 iFrames are present, raise an Error
+ if (['iframe', 'frame'].includes(elem.tagName.toLowerCase())) {
+ elem_local_id = `${window.browsergym_frame_id_generator.next()}`;
+ if (elem_local_id.length > 1) {
+ throw new Error(`More than 26? Such iFrames. BrowserGym not like.`);
+ }
+ }
+ // other elements get numerical ids: '0', '1', '2', ...
+ else {
+ elem_local_id = `${window.browsergym_elem_counter++}`;
+ }
if (parent_bid == "") {
elem_global_bid = `${elem_local_id}`;
}
else {
- elem_global_bid = `${parent_bid}-${elem_local_id}`;
+ elem_global_bid = `${parent_bid}${elem_local_id}`;
}
elem.setAttribute(bid_attr_name, `${elem_global_bid}`);
}
+
// Hack: store custom data inside the aria-roledescription attribute (will be available in DOM and AXTree)
// - elem_global_bid: global element identifier (unique over multiple frames)
// TODO: add more data if needed (x, y coordinates, bounding box, is_visible, is_clickable etc.)
-
- let [rect, is_in_viewport] = getElementPositionInfo(elem, iframe_offset, iframe_position);
- let left = (rect.left + iframe_position.x).toString();
- let top = (rect.top + iframe_position.y ).toString();
- let right = (rect.right + iframe_position.x ).toString();
- let bottom = (rect.bottom + iframe_position.y).toString();
- let center_x = ((rect.left + rect.right) / 2 + iframe_position.x).toString();
- let center_y = ((rect.top + rect.bottom) / 2 + iframe_position.y).toString();
-
- elem.setAttribute("browsergym_center", `(${center_x}, ${center_y})`);
- elem.setAttribute("browsergym_bounding_box", `(${left}, ${top}, ${right}, ${bottom})`);
- elem.setAttribute("browsergym_is_in_viewport", `${is_in_viewport}`);
-
let original_content = "";
if (elem.hasAttribute("aria-roledescription")) {
original_content = elem.getAttribute("aria-roledescription");
}
- let new_content = `${elem_global_bid}_${left}_${top}_${center_x}_${center_y}_${right}_${bottom}_${is_in_viewport}_${original_content}`
+ let new_content = `${elem_global_bid}_${original_content}`
elem.setAttribute("aria-roledescription", new_content);
+ // set-of-marks flag (He et al. 2024)
+ // https://github.com/MinorJerry/WebVoyager/blob/main/utils.py
+ elem.setAttribute("browsergym_set_of_marks", "0");
+ // click at center activates self or a child
+ if (["self", "child"].includes(whoCapturesCenterClick(elem))) {
+ // has valid tag name, or has click event, or triggers a pointer cursor
+ if (set_of_marks_tags.includes(elem.tagName.toLowerCase()) || (elem.onclick != null) || (window.getComputedStyle(elem).cursor == "pointer")) {
+ let rect = elem.getBoundingClientRect();
+ let area = (rect.right - rect.left) * (rect.bottom - rect.top);
+ // area is large enough
+ if (area >= 20) {
+ // is not a child of a button (role, type, tag) set to be marked
+ if (som_buttons.every(button => !button.contains(elem))) {
+ // is not the sole child of span that has a role and is set to be marked
+ let parent = elem.parentElement;
+ if (!(parent && parent.tagName.toLowerCase() == "span" && parent.children.length === 1 && parent.getAttribute("role") && parent.getAttribute("browsergym_set_of_marks") === "1")) {
+ // all checks have passed, flag the element for inclusion in set-of-marks
+ elem.setAttribute("browsergym_set_of_marks", "1");
+ if (elem.matches('button, a, input[type="button"], div[role="button"]')) {
+ som_buttons.push(elem)
+ }
+ // lastly, remove the set-of-marks flag from all parents, if any
+ while (parent) {
+ if (parent.getAttribute("browsergym_set_of_marks") === "1") {
+ parent.setAttribute("browsergym_set_of_marks", "0")
+ }
+ parent = parent.parentElement;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ warning_msgs = new Array();
+
+ // wait for all elements to be visited for visibility
+ let visibility_marking_timeout = 1000; // ms
+ try {
+ await until(() => elems_to_be_visited.size == 0, visibility_marking_timeout);
+ } catch {
+ warning_msgs.push(`Frame marking: not all elements have been visited by the intersection_observer after ${visibility_marking_timeout} ms`);
}
- return iframe_offset;
+ // disconnect intersection observer
+ intersection_observer.disconnect();
+ return warning_msgs;
}
-function getElementPositionInfo(element, iframe_offset, iframe_position) {
- var rect = element.getBoundingClientRect();
- let x = (rect.left + rect.right) / 2 ;
- let y = (rect.top + rect.bottom) / 2 ;
- //loop over element ancestors (parent) and refine iframe offset to be the most precise possible
- var parent = element.parentElement;
- parent_iframe_offset = { x: 0, y: 0, right: windowWidth, bottom: windowHeight };
- while (parent !== null) {
- var parent_rect = parent.getBoundingClientRect();
- parent_iframe_offset["x"] = Math.max(parent_rect.left , parent_iframe_offset["x"] );
- parent_iframe_offset["y"] = Math.max(parent_rect.top , parent_iframe_offset["y"] );
- parent_iframe_offset["right"] = Math.min(parent_rect.right , parent_iframe_offset["right"] );
- parent_iframe_offset["bottom"] = Math.min(parent_rect.bottom , parent_iframe_offset["bottom"] );
- parent = parent.parentElement;
- }
- var is_in_viewport = (
- x >= iframe_offset["x"] &&
- y >= iframe_offset["y"] &&
- x <= iframe_offset["right"] &&
- y <= iframe_offset["bottom"]
- );
- //this features is broken for the moment
- var NotBehindParent = (
- x >= parent_iframe_offset["x"] &&
- y >= parent_iframe_offset["y"] &&
- x <= parent_iframe_offset["right"] &&
- y <= parent_iframe_offset["bottom"]
- );
-
- var isVisible = (typeof element.offsetWidth === 'undefined' || typeof element.offsetHeight === 'undefined') || (element.offsetWidth > 0 && element.offsetHeight > 0);
-
- // Return true if the element is both in the viewport and has non-zero dimensions
- return [rect, (is_in_viewport && isVisible && IsInFront(element))? 1 : 0];
+async function until(f, timeout, interval=40) { // resolves once f() is truthy; rejects (without a reason) once more than `timeout` ms have elapsed
+    return new Promise((resolve, reject) => {
+        const start_time = Date.now();
+        // immediate check: when the condition already holds, settle now and do not start the polling timer
+        if (f()) {
+            return resolve();
+        }
+        // loop check: re-evaluate f() every `interval` ms until it holds or the timeout has elapsed
+        const wait = setInterval(() => {
+            if (f()) {
+                clearInterval(wait);
+                resolve();
+            } else if (Date.now() - start_time > timeout) {
+                clearInterval(wait);
+                reject();
+            }
+        }, interval);
+    });
+}
-function IsInFront(element){
+function whoCapturesCenterClick(element){
var rect = element.getBoundingClientRect();
var x = (rect.left + rect.right) / 2 ;
var y = (rect.top + rect.bottom) / 2 ;
- var newElement = elementFromPoint(x, y); //return the element in the foreground at position (x,y)
- if(newElement){
- if(newElement === element)
- return true;
+ var element_at_center = elementFromPoint(x, y); // return the element in the foreground at position (x,y)
+ if (!element_at_center) {
+ return "nobody";
+ } else if (element_at_center === element) {
+ return "self";
+ } else if (element.contains(element_at_center)) {
+ return "child";
+ } else {
+ return "non-descendant";
}
- return false;
}
function elementFromPoint(x, y) {
- let node = document.elementFromPoint(x, y);
+ let dom = document;
+ let last_elem = null;
+ let elem = null;
- let child = node?.shadowRoot?.elementFromPoint(x, y);
+ do {
+ last_elem = elem;
+ elem = dom.elementFromPoint(x, y);
+ dom = elem?.shadowRoot;
+ } while(dom && elem !== last_elem);
- while (child && child !== node) {
- node = child;
- child = node?.shadowRoot?.elementFromPoint(x, y);
+ return elem;
+}
+
+// https://stackoverflow.com/questions/12504042/what-is-a-method-that-can-be-used-to-increment-letters#answer-12504061
+class IFrameIdGenerator {
+ constructor(chars = 'abcdefghijklmnopqrstuvwxyz') {
+ this._chars = chars;
+ this._nextId = [0];
+ }
+
+ next() {
+ const r = [];
+ for (const char of this._nextId) {
+ r.unshift(this._chars[char]);
+ }
+ this._increment();
+ return r.join('');
}
- return child || node;
+ _increment() {
+ for (let i = 0; i < this._nextId.length; i++) {
+ const val = ++this._nextId[i];
+ if (val < this._chars.length) {
+ return;
+ }
+ this._nextId[i] = 0;
+ }
+ this._nextId.push(0);
+ }
+
+ *[Symbol.iterator]() {
+ while (true) {
+ yield this.next();
+ }
+ }
}
diff --git a/core/src/browsergym/core/javascript/frame_unmark_elements.js b/core/src/browsergym/core/javascript/frame_unmark_elements.js
index 4a29f15f..578a47b9 100644
--- a/core/src/browsergym/core/javascript/frame_unmark_elements.js
+++ b/core/src/browsergym/core/javascript/frame_unmark_elements.js
@@ -23,7 +23,7 @@
if (elem.hasAttribute("aria-roledescription")) {
let content = elem.getAttribute("aria-roledescription");
// TODO: handle more data if needed
- let n_data_items = 8; // bid, bbox_left, bbox_top, center_x, center_y, bbox_right, bbox_bottom, is_in_viewport
+ let n_data_items = 1; // bid
let post_data_index = 0;
for (let j = 0 ; j < n_data_items ; j++) {
post_data_index = content.indexOf("_", post_data_index) + 1;
@@ -35,7 +35,6 @@
else {
elem.removeAttribute("aria-roledescription");
}
-
}
}
}
diff --git a/core/src/browsergym/core/observation.py b/core/src/browsergym/core/observation.py
index 3ea9d8ac..e5e63114 100644
--- a/core/src/browsergym/core/observation.py
+++ b/core/src/browsergym/core/observation.py
@@ -8,10 +8,16 @@
import re
from .constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR
+from .constants import BROWSERGYM_VISIBILITY_ATTRIBUTE as VIS_ATTR
+from .constants import BROWSERGYM_SETOFMARKS_ATTRIBUTE as SOM_ATTR
MARK_FRAMES_MAX_TRIES = 3
+class MarkingError(Exception):  # error type imported by env.py and caught there next to playwright errors while extracting observations
+    pass  # carries no extra state; the handler in env.py matches on str(e) — NOTE(review): raise sites are not visible in this chunk
+
+
def _pre_extract(page: playwright.sync_api.Page):
"""
pre-extraction routine, marks dom elements (set bid and dynamic attributes like value and checked)
@@ -22,47 +28,41 @@ def _pre_extract(page: playwright.sync_api.Page):
# we can't run this loop in JS due to Same-Origin Policy
# (can't access the content of an iframe from a another one)
- def mark_frames_recursive(
- frame,
- global_iframe_position,
- iframe_offset=None,
- ):
- # get the bid of the parent frame element
- try:
- parent_bid = frame.frame_element().get_attribute(BID_ATTR)
- except:
- parent_bid = ""
+ def mark_frames_recursive(frame, frame_bid: str):
+ assert frame_bid == "" or (frame_bid.islower() and frame_bid.isalpha())
+
# mark all DOM elements in the frame (it will use the parent frame element's bid as a prefix)
- super_iframe_offset = frame.evaluate(
+ warning_msgs = frame.evaluate(
js_frame_mark_elements,
- [
- parent_bid,
- BID_ATTR,
- global_iframe_position,
- iframe_offset,
- ],
+ [frame_bid, BID_ATTR],
)
+ # print warning messages if any
+ for msg in warning_msgs:
+ logging.warning(msg)
# recursively mark all descendant frames
- for _, sub_frame in enumerate(frame.child_frames):
- if not sub_frame.is_detached():
- is_frame_hidden = sub_frame.evaluate(
- """ () => {
- const style = window.getComputedStyle(document.documentElement);
- const is_null_size = document.documentElement.offsetWidth <= 0 || document.documentElement.offsetHeight <= 0;
- return style.display === 'none' || style.visibility === 'hidden' || is_null_size;
-}"""
+ for child_frame in frame.child_frames:
+ # deal with detached frames
+ if child_frame.is_detached():
+ continue
+ # deal with weird frames (pdf viewer in