Skip to content

Commit

Permalink
File upload works robustly and can connect to an existing browser
Browse files Browse the repository at this point in the history
  • Loading branch information
MagMueller committed Dec 4, 2024
1 parent 2954a82 commit 05300b3
Show file tree
Hide file tree
Showing 10 changed files with 205 additions and 121 deletions.
4 changes: 2 additions & 2 deletions browser_use/agent/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def __init__(
system_prompt_class: Type[SystemPrompt] = SystemPrompt,
max_input_tokens: int = 128000,
validate_output: bool = False,
include_attributes: list[str] = [],
include_attributes: list[str] = ['title', 'type', 'name'],
max_error_length: int = 400,
max_actions_per_step: int = 10,
):
Expand Down Expand Up @@ -160,7 +160,7 @@ async def step(self) -> None:
)
self._last_result = result

if result[-1].is_done:
if len(result) > 0 and result[-1].is_done:
logger.info(f'📄 Result: {result[-1].extracted_content}')

self.consecutive_failures = 0
Expand Down
6 changes: 5 additions & 1 deletion browser_use/agent/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,11 @@ def final_result(self) -> None | str:

def is_done(self) -> bool:
"""Check if the agent is done"""
if self.history and self.history[-1].result[-1].is_done:
if (
self.history
and len(self.history[-1].result) > 0
and self.history[-1].result[-1].is_done
):
return self.history[-1].result[-1].is_done
return False

Expand Down
45 changes: 44 additions & 1 deletion browser_use/browser/browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,16 @@ class BrowserConfig:
wss_url: None
Connect to a browser instance via WebSocket
chrome_instance_path: None
Path to a Chrome instance to use to connect to your normal browser
e.g. '/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome'
"""

headless: bool = False
disable_security: bool = False
disable_security: bool = True
extra_chromium_args: list[str] = field(default_factory=list)
chrome_instance_path: str | None = None
wss_url: str | None = None

new_context_config: BrowserContextConfig = field(default_factory=BrowserContextConfig)
Expand Down Expand Up @@ -91,7 +96,44 @@ async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser:
if self.config.wss_url:
browser = await playwright.chromium.connect(self.config.wss_url)
return browser
elif self.config.chrome_instance_path:
import subprocess

import requests

try:
# Check if browser is already running
response = requests.get('http://localhost:9222/json/version', timeout=2)
if response.status_code == 200:
logger.info('Reusing existing Chrome instance')
browser = await playwright.chromium.connect_over_cdp(
endpoint_url='http://localhost:9222',
timeout=20000, # 20 second timeout for connection
)
return browser
except requests.ConnectionError:
logger.debug('No existing Chrome instance found, starting a new one')

# Start a new Chrome instance
subprocess.Popen(
[
self.config.chrome_instance_path,
'--remote-debugging-port=9222',
],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)

# Attempt to connect again after starting a new instance
try:
browser = await playwright.chromium.connect_over_cdp(
endpoint_url='http://localhost:9222',
timeout=20000, # 20 second timeout for connection
)
return browser
except Exception as e:
logger.error(f'Failed to start a new Chrome instance: {str(e)}')
raise
else:
try:
disable_security_args = []
Expand All @@ -118,6 +160,7 @@ async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser:
'--no-default-browser-check',
'--no-startup-window',
'--window-position=0,0',
'--window-size=3000,3000',
]
+ disable_security_args
+ self.config.extra_chromium_args,
Expand Down
70 changes: 56 additions & 14 deletions browser_use/browser/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,14 @@ class BrowserContextConfig:
maximum_wait_page_load_time: 5.0
Maximum time to wait for page load before proceeding anyway
wait_between_actions: 1.0
Time to wait between multiple per step actions
browser_window_size: {'width': 1280, 'height': 1024}
Default browser window size
no_viewport: False
Disable viewport
save_recording_path: None
Path to save video recordings
Expand All @@ -78,8 +83,8 @@ class BrowserContextConfig:

disable_security: bool = False

extra_chromium_args: list[str] = field(default_factory=list)
browser_window_size: Optional[BrowserContextWindowSize] = None
no_viewport: bool = True

save_recording_path: str | None = None
trace_path: str | None = None
Expand Down Expand Up @@ -203,18 +208,23 @@ async def get_current_page(self) -> Page:

async def _create_context(self, browser: PlaywrightBrowser):
"""Creates a new browser context with anti-detection measures and loads cookies if available."""
context = await browser.new_context(
viewport=self.config.browser_window_size,
no_viewport=True,
user_agent=(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
),
java_script_enabled=True,
bypass_csp=self.config.disable_security,
ignore_https_errors=self.config.disable_security,
record_video_dir=self.config.save_recording_path,
)
if self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
# Connect to existing Chrome instance instead of creating new one
context = browser.contexts[0]
else:
# Original code for creating new context
context = await browser.new_context(
viewport=self.config.browser_window_size,
no_viewport=self.config.no_viewport,
user_agent=(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
),
java_script_enabled=True,
bypass_csp=self.config.disable_security,
ignore_https_errors=self.config.disable_security,
record_video_dir=self.config.save_recording_path,
)

if self.config.trace_path:
await context.tracing.start(screenshots=True, snapshots=True, sources=True)
Expand Down Expand Up @@ -725,7 +735,7 @@ def _enhanced_css_selector_for_element(self, element: DOMElementNode) -> str:
tag_name = element.tag_name or '*'
return f"{tag_name}[highlight_index='{element.highlight_index}']"

async def get_locate_element(self, element: DOMElementNode):
async def get_locate_element(self, element: DOMElementNode) -> ElementHandle | None:
current_frame = await self.get_current_page()

# Start with the target element and collect all parents
Expand Down Expand Up @@ -868,3 +878,35 @@ async def save_cookies(self):
json.dump(cookies, f)
except Exception as e:
logger.warning(f'Failed to save cookies: {str(e)}')

async def is_file_uploader(
    self, element_node: DOMElementNode, max_depth: int = 3, current_depth: int = 0
) -> bool:
    """Report whether a node, or a nearby descendant, is a file-upload input.

    A node counts as an uploader when it is an ``input`` element whose
    ``type`` attribute equals ``'file'`` or which carries an ``accept``
    attribute. If the node itself does not match, its element children are
    searched recursively.

    Args:
        element_node: Node to inspect. Anything that is not a
            ``DOMElementNode`` yields ``False``.
        max_depth: Deepest recursion level whose nodes are still inspected.
        current_depth: Recursion level of ``element_node``; callers normally
            leave this at ``0``.

    Returns:
        ``True`` as soon as a matching input is found, otherwise ``False``.
    """
    # Guard clauses: past the depth budget, or not an element node at all.
    if current_depth > max_depth or not isinstance(element_node, DOMElementNode):
        return False

    # Only <input> elements can match directly.
    if element_node.tag_name == 'input':
        attrs = element_node.attributes
        if attrs.get('type') == 'file' or attrs.get('accept') is not None:
            return True

    # Nothing to descend into, or the depth budget is used up.
    if not element_node.children or current_depth >= max_depth:
        return False

    for descendant in element_node.children:
        if not isinstance(descendant, DOMElementNode):
            continue
        if await self.is_file_uploader(descendant, max_depth, current_depth + 1):
            return True

    return False
47 changes: 30 additions & 17 deletions browser_use/controller/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,19 @@ async def click_element(params: ClickElementAction, browser: BrowserContext):
element_node = state.selector_map[params.index]
initial_pages = len(session.context.pages)

# if element has file uploader then dont click
if await browser.is_file_uploader(element_node):
msg = f'Index {params.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files '
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True)

msg = None

try:
await browser._click_element_node(element_node)
msg = f'🖱️ Clicked index {params.index}'
logger.info(msg + f' - {element_node.xpath}')
logger.info(msg)
logger.debug(f'Element xpath: {element_node.xpath}')
if len(session.context.pages) > initial_pages:
new_tab_msg = 'New tab opened - switching to it'
msg += f' - {new_tab_msg}'
Expand Down Expand Up @@ -119,7 +126,8 @@ async def input_text(params: InputTextAction, browser: BrowserContext):
element_node = state.selector_map[params.index]
await browser._input_text_element_node(element_node, params.text)
msg = f'⌨️ Input "{params.text}" into index {params.index}'
logger.info(msg + f' - {element_node.xpath}')
logger.info(msg)
logger.debug(f'Element xpath: {element_node.xpath}')
return ActionResult(extracted_content=msg, include_in_memory=True)

# Tab Management Actions
Expand Down Expand Up @@ -500,10 +508,26 @@ async def multi_act(
await browser_context.remove_highlights()

for i, action in enumerate(actions):
if changed and action.get_index() is not None:
# next action requires index but there are new elements on the page
logger.info(f'Something new appeared after action {i}')
break
if action.get_index() is not None and i != 0:
new_state = await browser_context.get_state()
new_att_hashes = set(
e.hash.attributes_hash for e in new_state.selector_map.values()
)

if not new_att_hashes.issubset(cached_att_hashes):
logger.debug(f'Attributes changed - stopping after {i + 1} actions')
changed = True
new_path_hashes = set(
e.hash.branch_path_hash for e in new_state.selector_map.values()
)
if not new_path_hashes.issubset(cached_path_hashes):
logger.debug(f'Branch path changed - stopping after {i + 1} actions')
changed = True

if changed:
# next action requires index but there are new elements on the page
logger.info(f'Something new appeared after action {i}')
break

results.append(await self.act(action, browser_context))

Expand All @@ -514,17 +538,6 @@ async def multi_act(
await asyncio.sleep(browser_context.config.wait_between_actions)
# hash all elements. if it is a subset of cached_state its fine - else break (new elements on page)

new_state = await browser_context.get_state()
new_att_hashes = set(e.hash.attributes_hash for e in new_state.selector_map.values())

if not new_att_hashes.issubset(cached_att_hashes):
logger.debug(f'Attributes changed - stopping after {i + 1} actions')
changed = True
new_path_hashes = set(e.hash.branch_path_hash for e in new_state.selector_map.values())
if not new_path_hashes.issubset(cached_path_hashes):
logger.debug(f'Branch path changed - stopping after {i + 1} actions')
changed = True

return results

@time_execution_sync('--act')
Expand Down
10 changes: 10 additions & 0 deletions browser_use/dom/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,16 @@ def process_node(node: DOMBaseNode, depth: int) -> None:
process_node(self, 0)
return '\n'.join(formatted_text)

def get_file_upload_element(self) -> Optional['DOMElementNode']:
    """Return the first file input found in this subtree, or ``None``.

    The search is depth-first and pre-order: this node is tested first, then
    each element child is searched in order. A node matches when its tag is
    ``input`` and its ``type`` attribute equals ``'file'``.
    """
    is_file_input = self.tag_name == 'input' and self.attributes.get('type') == 'file'
    if is_file_input:
        return self

    # Lazily skip children that are not element nodes.
    element_children = (kid for kid in self.children if isinstance(kid, DOMElementNode))
    for element_child in element_children:
        found = element_child.get_file_upload_element()
        if found:
            return found

    return None


class ElementTreeSerializer:
@staticmethod
Expand Down
4 changes: 2 additions & 2 deletions examples/file_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ async def close_file_dialog(browser: BrowserContext):

async def main():
sites = [
'https://practice.expandtesting.com/upload',
'https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/',
]
task = f'go to {" ".join(sites)} each in new tabs and Upload my file then subbmit extract the page content and go to google and find elon musk and stop'
task = f'go to {" ".join(sites)} each in new tabs and Upload my file then subbmit and stop'

model = ChatOpenAI(model='gpt-4o')
agent = Agent(
Expand Down
59 changes: 15 additions & 44 deletions examples/find_and_apply_to_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,58 +69,29 @@ def read_cv():
return ActionResult(extracted_content=text, include_in_memory=True)


@controller.action(
'Upload cv to index - dont click the index - only call this function', requires_browser=True
)
@controller.action('Upload cv to element - call this function to upload ', requires_browser=True)
async def upload_cv(index: int, browser: BrowserContext):
page = await browser.get_current_page()
path = str(CV.absolute())
target_element = await browser.get_element_by_index(index)
selector_map = await browser.get_selector_map()
file_upload_dom_element = selector_map[index].get_file_upload_element()

if not target_element:
raise Exception(f'Could not find element at index {index}')
if file_upload_dom_element is None:
return ActionResult(error=f'No file upload element found at index {index}')

async def attempt_1():
is_visible = await target_element.is_visible()
if not is_visible:
return False
file_upload_element = await browser.get_locate_element(file_upload_dom_element)
if file_upload_element is None:
return ActionResult(error=f'No file upload element found at index {index}')

# First check if element is a file input
tag_name = await target_element.evaluate('el => el.tagName.toLowerCase()')
if tag_name == 'input' and await target_element.evaluate("el => el.type === 'file'"):
await target_element.set_input_files(path)
async def attempt_1():
try:
await file_upload_element.set_input_files(path)
return True
except Exception as e:
logger.debug(f'Error in set_input_files: {str(e)}')
return False

return False

async def attempt_2():
# Direct input[type="file"] approach using the target element
# Get all file inputs and find the one closest to our target element
file_inputs = await page.query_selector_all('input[type="file"]')

for input_element in file_inputs:
# Check if this input is associated with our target element
is_associated = await page.evaluate(
"""
([input, target]) => {
const inputRect = input.getBoundingClientRect();
const targetRect = target.getBoundingClientRect();
const distance = Math.hypot(
inputRect.left - targetRect.left,
inputRect.top - targetRect.top
);
return distance < 200;
}
""",
[input_element, target_element],
)

if is_associated:
await input_element.set_input_files(path)
return True
return False

for attempt_func in [attempt_1, attempt_2]:
for attempt_func in [attempt_1]:
try:
if await attempt_func():
logger.info(f'Successfully uploaded file to index {index}')
Expand Down
Loading

0 comments on commit 05300b3

Please sign in to comment.