Skip to content

Commit

Permalink
Merge pull request #184 from vvincent1234/fix/adapt_latest_browser-use
Browse files Browse the repository at this point in the history
Fix/adapt latest browser use
  • Loading branch information
warmshao authored Jan 28, 2025
2 parents 566bca7 + 75ab505 commit 0c9cb9b
Show file tree
Hide file tree
Showing 10 changed files with 115 additions and 186 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ playwright install
- `--dark-mode`: Enables dark mode for the user interface.
3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
4. **Using Your Own Browser(Optional):**
- Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser.
- Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data.
- Windows
```env
CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
Expand All @@ -118,7 +118,7 @@ playwright install
- Mac
```env
CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
CHROME_USER_DATA="~/Library/Application Support/Google/Chrome/Profile 1"
CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
```
- Close all Chrome windows
- Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
browser-use==0.1.29
pyperclip==1.9.0
gradio==5.10.0
json-repair
50 changes: 32 additions & 18 deletions src/agent/custom_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
import base64
import io
import platform
from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.service import Agent
from browser_use.agent.views import (
ActionResult,
ActionModel,
AgentHistoryList,
AgentOutput,
AgentHistory,
Expand All @@ -30,6 +31,7 @@
from langchain_core.messages import (
BaseMessage,
)
from json_repair import repair_json
from src.utils.agent_state import AgentState

from .custom_massage_manager import CustomMassageManager
Expand All @@ -52,6 +54,7 @@ def __init__(
max_failures: int = 5,
retry_delay: int = 10,
system_prompt_class: Type[SystemPrompt] = SystemPrompt,
agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt,
max_input_tokens: int = 128000,
validate_output: bool = False,
include_attributes: list[str] = [
Expand Down Expand Up @@ -98,28 +101,31 @@ def __init__(
register_done_callback=register_done_callback,
tool_calling_method=tool_calling_method
)
if self.model_name in ["deepseek-reasoner"] or self.model_name.startswith("deepseek-r1"):
if self.model_name in ["deepseek-reasoner"] or "deepseek-r1" in self.model_name:
# deepseek-reasoner does not support function calling
self.use_deepseek_r1 = True
# deepseek-reasoner only supports a 64000-token context window
self.max_input_tokens = 64000
else:
self.use_deepseek_r1 = False

# record last actions
self._last_actions = None
# custom new info
self.add_infos = add_infos
# agent_state for Stop
self.agent_state = agent_state
self.agent_prompt_class = agent_prompt_class
self.message_manager = CustomMassageManager(
llm=self.llm,
task=self.task,
action_descriptions=self.controller.registry.get_prompt_description(),
system_prompt_class=self.system_prompt_class,
agent_prompt_class=agent_prompt_class,
max_input_tokens=self.max_input_tokens,
include_attributes=self.include_attributes,
max_error_length=self.max_error_length,
max_actions_per_step=self.max_actions_per_step,
use_deepseek_r1=self.use_deepseek_r1
max_actions_per_step=self.max_actions_per_step
)

def _setup_action_models(self) -> None:
Expand Down Expand Up @@ -186,9 +192,11 @@ async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutpu
logger.info(ai_message.reasoning_content)
logger.info(f"🤯 End Deep Thinking")
if isinstance(ai_message.content, list):
parsed_json = json.loads(ai_message.content[0].replace("```json", "").replace("```", ""))
ai_content = ai_message.content[0].replace("```json", "").replace("```", "")
else:
parsed_json = json.loads(ai_message.content.replace("```json", "").replace("```", ""))
ai_content = ai_message.content.replace("```json", "").replace("```", "")
ai_content = repair_json(ai_content)
parsed_json = json.loads(ai_content)
parsed: AgentOutput = self.AgentOutput(**parsed_json)
if parsed is None:
logger.debug(ai_message.content)
Expand All @@ -197,9 +205,11 @@ async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutpu
ai_message = self.llm.invoke(input_messages)
self.message_manager._add_message_with_tokens(ai_message)
if isinstance(ai_message.content, list):
parsed_json = json.loads(ai_message.content[0].replace("```json", "").replace("```", ""))
ai_content = ai_message.content[0].replace("```json", "").replace("```", "")
else:
parsed_json = json.loads(ai_message.content.replace("```json", "").replace("```", ""))
ai_content = ai_message.content.replace("```json", "").replace("```", "")
ai_content = repair_json(ai_content)
parsed_json = json.loads(ai_content)
parsed: AgentOutput = self.AgentOutput(**parsed_json)
if parsed is None:
logger.debug(ai_message.content)
Expand All @@ -222,7 +232,7 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:

try:
state = await self.browser_context.get_state(use_vision=self.use_vision)
self.message_manager.add_state_message(state, self._last_result, step_info)
self.message_manager.add_state_message(state, self._last_actions, self._last_result, step_info)
input_messages = self.message_manager.get_messages()
try:
model_output = await self.get_next_action(input_messages)
Expand All @@ -231,27 +241,31 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
self.update_step_info(model_output, step_info)
logger.info(f"🧠 All Memory: \n{step_info.memory}")
self._save_conversation(input_messages, model_output)
# should we remove last state message? at least, deepseek-reasoner cannot remove
if self.model_name != "deepseek-reasoner":
self.message_manager._remove_last_state_message()
# remove prev message
self.message_manager._remove_state_message_by_index(-1)
except Exception as e:
# model call failed, remove last state message from history
self.message_manager._remove_last_state_message()
self.message_manager._remove_state_message_by_index(-1)
raise e

actions: list[ActionModel] = model_output.action
result: list[ActionResult] = await self.controller.multi_act(
model_output.action, self.browser_context
actions, self.browser_context
)
if len(result) != len(model_output.action):
if len(result) != len(actions):
# Fewer results than planned actions means the page changed mid-sequence; surface this to the LLM
for ri in range(len(result), len(model_output.action)):
for ri in range(len(result), len(actions)):
result.append(ActionResult(extracted_content=None,
include_in_memory=True,
error=f"{model_output.action[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
Something new appeared after action {model_output.action[len(result) - 1].model_dump_json(exclude_unset=True)}",
error=f"{actions[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
Something new appeared after action {actions[len(result) - 1].model_dump_json(exclude_unset=True)}",
is_done=False))
if len(actions) == 0:
# TODO: fix no action case
result = [ActionResult(is_done=True, extracted_content=step_info.memory, include_in_memory=True)]
self._last_result = result

self._last_actions = actions
if len(result) > 0 and result[-1].is_done:
logger.info(f"📄 Result: {result[-1].extracted_content}")

Expand Down
27 changes: 20 additions & 7 deletions src/agent/custom_massage_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from browser_use.agent.message_manager.service import MessageManager
from browser_use.agent.message_manager.views import MessageHistory
from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.views import ActionResult, AgentStepInfo
from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.views import ActionResult, AgentStepInfo, ActionModel
from browser_use.browser.views import BrowserState
from langchain_core.language_models import BaseChatModel
from langchain_anthropic import ChatAnthropic
Expand All @@ -31,14 +31,14 @@ def __init__(
task: str,
action_descriptions: str,
system_prompt_class: Type[SystemPrompt],
agent_prompt_class: Type[AgentMessagePrompt],
max_input_tokens: int = 128000,
estimated_characters_per_token: int = 3,
image_tokens: int = 800,
include_attributes: list[str] = [],
max_error_length: int = 400,
max_actions_per_step: int = 10,
message_context: Optional[str] = None,
use_deepseek_r1: bool = False
message_context: Optional[str] = None
):
super().__init__(
llm=llm,
Expand All @@ -53,8 +53,7 @@ def __init__(
max_actions_per_step=max_actions_per_step,
message_context=message_context
)
self.tool_id = 1
self.use_deepseek_r1 = use_deepseek_r1
self.agent_prompt_class = agent_prompt_class
# Custom: Move Task info to state_message
self.history = MessageHistory()
self._add_message_with_tokens(self.system_prompt)
Expand All @@ -75,13 +74,15 @@ def cut_messages(self):
def add_state_message(
self,
state: BrowserState,
actions: Optional[List[ActionModel]] = None,
result: Optional[List[ActionResult]] = None,
step_info: Optional[AgentStepInfo] = None,
) -> None:
"""Add browser state as human message"""
# otherwise add state message and result to next message (which will not stay in memory)
state_message = CustomAgentMessagePrompt(
state_message = self.agent_prompt_class(
state,
actions,
result,
include_attributes=self.include_attributes,
max_error_length=self.max_error_length,
Expand All @@ -102,3 +103,15 @@ def _count_text_tokens(self, text: str) -> int:
len(text) // self.estimated_characters_per_token
) # Rough estimate if no tokenizer available
return tokens

def _remove_state_message_by_index(self, remove_ind=-1) -> None:
    """Remove the Nth-most-recent state (Human) message from history.

    Scans ``self.history.messages`` from newest to oldest and deletes the
    ``abs(remove_ind)``-th ``HumanMessage`` encountered, so ``-1`` removes
    the most recent state message, ``-2`` the one before it, and so on.
    If fewer matching messages exist, the history is left unchanged.

    :param remove_ind: index of the state message to remove, counted among
        HumanMessages from the end of the history; defaults to ``-1``
        (the most recent). The sign is ignored (``abs()`` is applied).
    """
    target = abs(remove_ind)
    if target == 0:
        # abs(0) can never match the 1-based counter below; bail out
        # early instead of scanning the whole history for nothing.
        return
    seen = 0
    # Walk backwards so "1st match" means "most recent HumanMessage".
    for i in range(len(self.history.messages) - 1, -1, -1):
        if isinstance(self.history.messages[i].message, HumanMessage):
            seen += 1
            if seen == target:
                self.history.remove_message(i)
                break
21 changes: 13 additions & 8 deletions src/agent/custom_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List, Optional

from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
from browser_use.agent.views import ActionResult
from browser_use.agent.views import ActionResult, ActionModel
from browser_use.browser.views import BrowserState
from langchain_core.messages import HumanMessage, SystemMessage

Expand Down Expand Up @@ -56,7 +56,7 @@ def important_rules(self) -> str:
- Use scroll to find elements you are looking for
5. TASK COMPLETION:
- If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the done action to terminate the operation process.
- If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the **Done** action to terminate the operation process.
- Don't hallucinate actions.
- If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
- If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
Expand Down Expand Up @@ -140,6 +140,7 @@ class CustomAgentMessagePrompt(AgentMessagePrompt):
def __init__(
self,
state: BrowserState,
actions: Optional[List[ActionModel]] = None,
result: Optional[List[ActionResult]] = None,
include_attributes: list[str] = [],
max_error_length: int = 400,
Expand All @@ -151,10 +152,11 @@ def __init__(
max_error_length=max_error_length,
step_info=step_info
)
self.actions = actions

def get_user_message(self) -> HumanMessage:
if self.step_info:
step_info_description = f'Current step: {self.step_info.step_number + 1}/{self.step_info.max_steps}'
step_info_description = f'Current step: {self.step_info.step_number}/{self.step_info.max_steps}\n'
else:
step_info_description = ''

Expand All @@ -181,7 +183,7 @@ def get_user_message(self) -> HumanMessage:

state_description = f"""
{step_info_description}
1. Task: {self.step_info.task}
1. Task: {self.step_info.task}.
2. Hints(Optional):
{self.step_info.add_infos}
3. Memory:
Expand All @@ -193,17 +195,20 @@ def get_user_message(self) -> HumanMessage:
{elements_text}
"""

if self.result:

if self.actions and self.result:
state_description += "\n **Previous Actions** \n"
state_description += f'Previous step: {self.step_info.step_number-1}/{self.step_info.max_steps} \n'
for i, result in enumerate(self.result):
action = self.actions[i]
state_description += f"Previous action {i + 1}/{len(self.result)}: {action.model_dump_json(exclude_unset=True)}\n"
if result.include_in_memory:
if result.extracted_content:
state_description += f"\nResult of previous action {i + 1}/{len(self.result)}: {result.extracted_content}"
state_description += f"Result of previous action {i + 1}/{len(self.result)}: {result.extracted_content}\n"
if result.error:
# only use last 300 characters of error
error = result.error[-self.max_error_length:]
state_description += (
f"\nError of previous action {i + 1}/{len(self.result)}: ...{error}"
f"Error of previous action {i + 1}/{len(self.result)}: ...{error}\n"
)

if self.state.screenshot:
Expand Down
2 changes: 1 addition & 1 deletion src/controller/custom_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pydantic import BaseModel
from browser_use.agent.views import ActionResult
from browser_use.browser.context import BrowserContext
from browser_use.controller.service import Controller
from browser_use.controller.service import Controller, DoneAction


class CustomController(Controller):
Expand Down
5 changes: 3 additions & 2 deletions src/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ def get_llm_model(provider: str, **kwargs):
else:
base_url = kwargs.get("base_url")

if kwargs.get("model_name", "qwen2.5:7b").startswith("deepseek-r1"):
if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
return DeepSeekR1ChatOllama(
model=kwargs.get("model_name", "deepseek-r1:7b"),
model=kwargs.get("model_name", "deepseek-r1:14b"),
temperature=kwargs.get("temperature", 0.0),
num_ctx=kwargs.get("num_ctx", 32000),
base_url=kwargs.get("base_url", base_url),
Expand All @@ -106,6 +106,7 @@ def get_llm_model(provider: str, **kwargs):
model=kwargs.get("model_name", "qwen2.5:7b"),
temperature=kwargs.get("temperature", 0.0),
num_ctx=kwargs.get("num_ctx", 32000),
num_predict=kwargs.get("num_predict", 1024),
base_url=kwargs.get("base_url", base_url),
)
elif provider == "azure_openai":
Expand Down
Loading

0 comments on commit 0c9cb9b

Please sign in to comment.