File upload works robustly and can connect to an existing browser

shubhamgore2468 · Dec 4, 2024 · 05300b3
1 parent 2954a82
commit 05300b3
Show file tree

Hide file tree

Showing 10 changed files with 205 additions and 121 deletions.
diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py
@@ -66,7 +66,7 @@ def __init__(
 		system_prompt_class: Type[SystemPrompt] = SystemPrompt,
 		max_input_tokens: int = 128000,
 		validate_output: bool = False,
-		include_attributes: list[str] = [],
+		include_attributes: list[str] = ['title', 'type', 'name'],
 		max_error_length: int = 400,
 		max_actions_per_step: int = 10,
 	):
@@ -160,7 +160,7 @@ async def step(self) -> None:
 			)
 			self._last_result = result
 
-			if result[-1].is_done:
+			if len(result) > 0 and result[-1].is_done:
 				logger.info(f'📄 Result: {result[-1].extracted_content}')
 
 			self.consecutive_failures = 0

diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py
@@ -170,7 +170,11 @@ def final_result(self) -> None | str:
 
 	def is_done(self) -> bool:
 		"""Check if the agent is done"""
-		if self.history and self.history[-1].result[-1].is_done:
+		if (
+			self.history
+			and len(self.history[-1].result) > 0
+			and self.history[-1].result[-1].is_done
+		):
 			return self.history[-1].result[-1].is_done
 		return False
 

diff --git a/browser_use/browser/browser.py b/browser_use/browser/browser.py
@@ -34,11 +34,16 @@ class BrowserConfig:
 
 		wss_url: None
 			Connect to a browser instance via WebSocket
+
+		chrome_instance_path: None
+			Path to a Chrome instance to use to connect to your normal browser
+			e.g. '/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome'
 	"""
 
 	headless: bool = False
-	disable_security: bool = False
+	disable_security: bool = True
 	extra_chromium_args: list[str] = field(default_factory=list)
+	chrome_instance_path: str | None = None
 	wss_url: str | None = None
 
 	new_context_config: BrowserContextConfig = field(default_factory=BrowserContextConfig)
@@ -91,7 +96,44 @@ async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser:
 		if self.config.wss_url:
 			browser = await playwright.chromium.connect(self.config.wss_url)
 			return browser
+		elif self.config.chrome_instance_path:
+			import subprocess
+
+			import requests
 
+			try:
+				# Check if browser is already running
+				response = requests.get('http://localhost:9222/json/version', timeout=2)
+				if response.status_code == 200:
+					logger.info('Reusing existing Chrome instance')
+					browser = await playwright.chromium.connect_over_cdp(
+						endpoint_url='http://localhost:9222',
+						timeout=20000,  # 20 second timeout for connection
+					)
+					return browser
+			except requests.ConnectionError:
+				logger.debug('No existing Chrome instance found, starting a new one')
+
+			# Start a new Chrome instance
+			subprocess.Popen(
+				[
+					self.config.chrome_instance_path,
+					'--remote-debugging-port=9222',
+				],
+				stdout=subprocess.DEVNULL,
+				stderr=subprocess.DEVNULL,
+			)
+
+			# Attempt to connect again after starting a new instance
+			try:
+				browser = await playwright.chromium.connect_over_cdp(
+					endpoint_url='http://localhost:9222',
+					timeout=20000,  # 20 second timeout for connection
+				)
+				return browser
+			except Exception as e:
+				logger.error(f'Failed to start a new Chrome instance: {str(e)}')
+				raise
 		else:
 			try:
 				disable_security_args = []
@@ -118,6 +160,7 @@ async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser:
 						'--no-default-browser-check',
 						'--no-startup-window',
 						'--window-position=0,0',
+						'--window-size=3000,3000',
 					]
 					+ disable_security_args
 					+ self.config.extra_chromium_args,

diff --git a/browser_use/browser/context.py b/browser_use/browser/context.py
@@ -60,9 +60,14 @@ class BrowserContextConfig:
 		maximum_wait_page_load_time: 5.0
 			Maximum time to wait for page load before proceeding anyway
 
+		wait_between_actions: 1.0
+			Time to wait between multiple per step actions
+
 		browser_window_size: {'width': 1280, 'height': 1024}
 			Default browser window size
 
+		no_viewport: False
+			Disable viewport
 		save_recording_path: None
 			Path to save video recordings
 
@@ -78,8 +83,8 @@ class BrowserContextConfig:
 
 	disable_security: bool = False
 
-	extra_chromium_args: list[str] = field(default_factory=list)
 	browser_window_size: Optional[BrowserContextWindowSize] = None
+	no_viewport: bool = True
 
 	save_recording_path: str | None = None
 	trace_path: str | None = None
@@ -203,18 +208,23 @@ async def get_current_page(self) -> Page:
 
 	async def _create_context(self, browser: PlaywrightBrowser):
 		"""Creates a new browser context with anti-detection measures and loads cookies if available."""
-		context = await browser.new_context(
-			viewport=self.config.browser_window_size,
-			no_viewport=True,
-			user_agent=(
-				'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
-				'(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
-			),
-			java_script_enabled=True,
-			bypass_csp=self.config.disable_security,
-			ignore_https_errors=self.config.disable_security,
-			record_video_dir=self.config.save_recording_path,
-		)
+		if self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
+			# Connect to existing Chrome instance instead of creating new one
+			context = browser.contexts[0]
+		else:
+			# Original code for creating new context
+			context = await browser.new_context(
+				viewport=self.config.browser_window_size,
+				no_viewport=self.config.no_viewport,
+				user_agent=(
+					'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+					'(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
+				),
+				java_script_enabled=True,
+				bypass_csp=self.config.disable_security,
+				ignore_https_errors=self.config.disable_security,
+				record_video_dir=self.config.save_recording_path,
+			)
 
 		if self.config.trace_path:
 			await context.tracing.start(screenshots=True, snapshots=True, sources=True)
@@ -725,7 +735,7 @@ def _enhanced_css_selector_for_element(self, element: DOMElementNode) -> str:
 			tag_name = element.tag_name or '*'
 			return f"{tag_name}[highlight_index='{element.highlight_index}']"
 
-	async def get_locate_element(self, element: DOMElementNode):
+	async def get_locate_element(self, element: DOMElementNode) -> ElementHandle | None:
 		current_frame = await self.get_current_page()
 
 		# Start with the target element and collect all parents
@@ -868,3 +878,35 @@ async def save_cookies(self):
 					json.dump(cookies, f)
 			except Exception as e:
 				logger.warning(f'Failed to save cookies: {str(e)}')
+
+	async def is_file_uploader(
+		self, element_node: DOMElementNode, max_depth: int = 3, current_depth: int = 0
+	) -> bool:
+		"""Check if element or its children are file uploaders"""
+		if current_depth > max_depth:
+			return False
+
+		# Check current element
+		is_uploader = False
+
+		if not isinstance(element_node, DOMElementNode):
+			return False
+
+		# Check for file input attributes
+		if element_node.tag_name == 'input':
+			is_uploader = (
+				element_node.attributes.get('type') == 'file'
+				or element_node.attributes.get('accept') is not None
+			)
+
+		if is_uploader:
+			return True
+
+		# Recursively check children
+		if element_node.children and current_depth < max_depth:
+			for child in element_node.children:
+				if isinstance(child, DOMElementNode):
+					if await self.is_file_uploader(child, max_depth, current_depth + 1):
+						return True
+
+		return False
diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py
@@ -84,12 +84,19 @@ async def click_element(params: ClickElementAction, browser: BrowserContext):
 			element_node = state.selector_map[params.index]
 			initial_pages = len(session.context.pages)
 
+			# if element has file uploader then dont click
+			if await browser.is_file_uploader(element_node):
+				msg = f'Index {params.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files '
+				logger.info(msg)
+				return ActionResult(extracted_content=msg, include_in_memory=True)
+
 			msg = None
 
 			try:
 				await browser._click_element_node(element_node)
 				msg = f'🖱️  Clicked index {params.index}'
-				logger.info(msg + f' - {element_node.xpath}')
+				logger.info(msg)
+				logger.debug(f'Element xpath: {element_node.xpath}')
 				if len(session.context.pages) > initial_pages:
 					new_tab_msg = 'New tab opened - switching to it'
 					msg += f' - {new_tab_msg}'
@@ -119,7 +126,8 @@ async def input_text(params: InputTextAction, browser: BrowserContext):
 			element_node = state.selector_map[params.index]
 			await browser._input_text_element_node(element_node, params.text)
 			msg = f'⌨️  Input "{params.text}" into index {params.index}'
-			logger.info(msg + f' - {element_node.xpath}')
+			logger.info(msg)
+			logger.debug(f'Element xpath: {element_node.xpath}')
 			return ActionResult(extracted_content=msg, include_in_memory=True)
 
 		# Tab Management Actions
@@ -500,10 +508,26 @@ async def multi_act(
 		await browser_context.remove_highlights()
 
 		for i, action in enumerate(actions):
-			if changed and action.get_index() is not None:
-				# next action requires index but there are new elements on the page
-				logger.info(f'Something new appeared after action {i}')
-				break
+			if action.get_index() is not None and i != 0:
+				new_state = await browser_context.get_state()
+				new_att_hashes = set(
+					e.hash.attributes_hash for e in new_state.selector_map.values()
+				)
+
+				if not new_att_hashes.issubset(cached_att_hashes):
+					logger.debug(f'Attributes changed - stopping after {i + 1} actions')
+					changed = True
+				new_path_hashes = set(
+					e.hash.branch_path_hash for e in new_state.selector_map.values()
+				)
+				if not new_path_hashes.issubset(cached_path_hashes):
+					logger.debug(f'Branch path changed - stopping after {i + 1} actions')
+					changed = True
+
+				if changed:
+					# next action requires index but there are new elements on the page
+					logger.info(f'Something new appeared after action {i}')
+					break
 
 			results.append(await self.act(action, browser_context))
 
@@ -514,17 +538,6 @@ async def multi_act(
 			await asyncio.sleep(browser_context.config.wait_between_actions)
 			# hash all elements. if it is a subset of cached_state its fine - else break (new elements on page)
 
-			new_state = await browser_context.get_state()
-			new_att_hashes = set(e.hash.attributes_hash for e in new_state.selector_map.values())
-
-			if not new_att_hashes.issubset(cached_att_hashes):
-				logger.debug(f'Attributes changed - stopping after {i + 1} actions')
-				changed = True
-			new_path_hashes = set(e.hash.branch_path_hash for e in new_state.selector_map.values())
-			if not new_path_hashes.issubset(cached_path_hashes):
-				logger.debug(f'Branch path changed - stopping after {i + 1} actions')
-				changed = True
-
 		return results
 
 	@time_execution_sync('--act')

diff --git a/browser_use/dom/views.py b/browser_use/dom/views.py
@@ -130,6 +130,16 @@ def process_node(node: DOMBaseNode, depth: int) -> None:
 		process_node(self, 0)
 		return '\n'.join(formatted_text)
 
+	def get_file_upload_element(self) -> Optional['DOMElementNode']:
+		if self.tag_name == 'input' and self.attributes.get('type') == 'file':
+			return self
+		for child in self.children:
+			if isinstance(child, DOMElementNode):
+				result = child.get_file_upload_element()
+				if result:
+					return result
+		return None
+
 
 class ElementTreeSerializer:
 	@staticmethod

diff --git a/examples/file_upload.py b/examples/file_upload.py
@@ -38,9 +38,9 @@ async def close_file_dialog(browser: BrowserContext):
 
 async def main():
 	sites = [
-		'https://practice.expandtesting.com/upload',
+		'https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/',
 	]
-	task = f'go to {" ".join(sites)} each in new tabs and Upload my file then subbmit extract the page content and go to google and find elon musk and stop'
+	task = f'go to {" ".join(sites)} each in new tabs and Upload my file then subbmit and stop'
 
 	model = ChatOpenAI(model='gpt-4o')
 	agent = Agent(

diff --git a/examples/find_and_apply_to_jobs.py b/examples/find_and_apply_to_jobs.py
@@ -69,58 +69,29 @@ def read_cv():
 	return ActionResult(extracted_content=text, include_in_memory=True)
 
 
-@controller.action(
-	'Upload cv to index - dont click the index - only call this function', requires_browser=True
-)
+@controller.action('Upload cv to element - call this function to upload ', requires_browser=True)
 async def upload_cv(index: int, browser: BrowserContext):
 	page = await browser.get_current_page()
 	path = str(CV.absolute())
-	target_element = await browser.get_element_by_index(index)
+	selector_map = await browser.get_selector_map()
+	file_upload_dom_element = selector_map[index].get_file_upload_element()
 
-	if not target_element:
-		raise Exception(f'Could not find element at index {index}')
+	if file_upload_dom_element is None:
+		return ActionResult(error=f'No file upload element found at index {index}')
 
-	async def attempt_1():
-		is_visible = await target_element.is_visible()
-		if not is_visible:
-			return False
+	file_upload_element = await browser.get_locate_element(file_upload_dom_element)
+	if file_upload_element is None:
+		return ActionResult(error=f'No file upload element found at index {index}')
 
-		# First check if element is a file input
-		tag_name = await target_element.evaluate('el => el.tagName.toLowerCase()')
-		if tag_name == 'input' and await target_element.evaluate("el => el.type === 'file'"):
-			await target_element.set_input_files(path)
+	async def attempt_1():
+		try:
+			await file_upload_element.set_input_files(path)
 			return True
+		except Exception as e:
+			logger.debug(f'Error in set_input_files: {str(e)}')
+			return False
 
-		return False
-
-	async def attempt_2():
-		# Direct input[type="file"] approach using the target element
-		# Get all file inputs and find the one closest to our target element
-		file_inputs = await page.query_selector_all('input[type="file"]')
-
-		for input_element in file_inputs:
-			# Check if this input is associated with our target element
-			is_associated = await page.evaluate(
-				"""
-				([input, target]) => {
-					const inputRect = input.getBoundingClientRect();
-					const targetRect = target.getBoundingClientRect();
-					const distance = Math.hypot(
-						inputRect.left - targetRect.left,
-						inputRect.top - targetRect.top
-					);
-					return distance < 200;
-				}
-			""",
-				[input_element, target_element],
-			)
-
-			if is_associated:
-				await input_element.set_input_files(path)
-				return True
-		return False
-
-	for attempt_func in [attempt_1, attempt_2]:
+	for attempt_func in [attempt_1]:
 		try:
 			if await attempt_func():
 				logger.info(f'Successfully uploaded file to index {index}')