Use keyword matching for CodeAct microagents (#4568)

Co-authored-by: Xingyao Wang <[email protected]>
All-Hands-AI · Nov 9, 2024 · be82832
1 parent 67c8915
commit be82832
Show file tree

Hide file tree

Showing 18 changed files with 204 additions and 257 deletions.
diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
@@ -35,7 +35,8 @@ def codeact_user_response_eda(state: State) -> str:
 
     # retrieve the latest model message from history
     if state.history:
-        model_guess = state.get_last_agent_message()
+        last_agent_message = state.get_last_agent_message()
+        model_guess = last_agent_message.content if last_agent_message else ''
 
     assert game is not None, 'Game is not initialized.'
     msg = game.generate_user_response(model_guess)
@@ -140,7 +141,8 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')
 
-    final_message = state.get_last_agent_message()
+    last_agent_message = state.get_last_agent_message()
+    final_message = last_agent_message.content if last_agent_message else ''
 
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
     test_result = game.reward()

diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
@@ -102,7 +102,8 @@ def process_instance(
         raise ValueError('State should not be None.')
 
     # retrieve the last message from the agent
-    model_answer_raw = state.get_last_agent_message()
+    last_agent_message = state.get_last_agent_message()
+    model_answer_raw = last_agent_message.content if last_agent_message else ''
 
     # attempt to parse model_answer
     ast_eval_fn = instance['ast_eval']

diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
@@ -127,7 +127,8 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
         raise ValueError('State should not be None.')
 
     # retrieve the last message from the agent
-    model_answer_raw = state.get_last_agent_message()
+    last_agent_message = state.get_last_agent_message()
+    model_answer_raw = last_agent_message.content if last_agent_message else ''
 
     # attempt to parse model_answer
     correct = eval_answer(str(model_answer_raw), str(answer))

diff --git a/frontend/src/components/project-menu/ProjectMenuCard.tsx b/frontend/src/components/project-menu/ProjectMenuCard.tsx
@@ -43,10 +43,7 @@ export function ProjectMenuCard({
     posthog.capture("push_to_github_button_clicked");
     const rawEvent = {
       content: `
-Let's push the code to GitHub.
-If we're currently on the openhands-workspace branch, please create a new branch with a descriptive name.
-Commit any changes and push them to the remote repository.
-Finally, open up a pull request using the GitHub API and the token in the GITHUB_TOKEN environment variable, then show me the URL of the pull request.
+Please push the changes to GitHub and open a pull request.
 `,
       imageUrls: [],
       timestamp: new Date().toISOString(),

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -39,7 +39,6 @@
     JupyterRequirement,
     PluginRequirement,
 )
-from openhands.utils.microagent import MicroAgent
 from openhands.utils.prompt import PromptManager
 
 
@@ -86,16 +85,6 @@ def __init__(
         super().__init__(llm, config)
         self.reset()
 
-        self.micro_agent = (
-            MicroAgent(
-                os.path.join(
-                    os.path.dirname(__file__), 'micro', f'{config.micro_agent_name}.md'
-                )
-            )
-            if config.micro_agent_name
-            else None
-        )
-
         self.function_calling_active = self.config.function_calling
         if self.function_calling_active and not self.llm.is_function_calling_active():
             logger.warning(
@@ -105,7 +94,6 @@ def __init__(
             self.function_calling_active = False
 
         if self.function_calling_active:
-            # Function calling mode
             self.tools = codeact_function_calling.get_tools(
                 codeact_enable_browsing=self.config.codeact_enable_browsing,
                 codeact_enable_jupyter=self.config.codeact_enable_jupyter,
@@ -114,18 +102,17 @@ def __init__(
             logger.debug(
                 f'TOOLS loaded for CodeActAgent: {json.dumps(self.tools, indent=2)}'
             )
-            self.system_prompt = codeact_function_calling.SYSTEM_PROMPT
-            self.initial_user_message = None
+            self.prompt_manager = PromptManager(
+                microagent_dir=os.path.join(os.path.dirname(__file__), 'micro'),
+                prompt_dir=os.path.join(os.path.dirname(__file__), 'prompts', 'tools'),
+            )
         else:
-            # Non-function-calling mode
             self.action_parser = CodeActResponseParser()
             self.prompt_manager = PromptManager(
-                prompt_dir=os.path.join(os.path.dirname(__file__)),
+                microagent_dir=os.path.join(os.path.dirname(__file__), 'micro'),
+                prompt_dir=os.path.join(os.path.dirname(__file__), 'prompts', 'default'),
                 agent_skills_docs=AgentSkillsRequirement.documentation,
-                micro_agent=self.micro_agent,
             )
-            self.system_prompt = self.prompt_manager.system_message
-            self.initial_user_message = self.prompt_manager.initial_user_message
 
         self.pending_actions: deque[Action] = deque()
 
@@ -337,8 +324,8 @@ def step(self, state: State) -> Action:
             return self.pending_actions.popleft()
 
         # if we're done, go back
-        last_user_message = state.get_last_user_message()
-        if last_user_message and last_user_message.strip() == '/exit':
+        latest_user_message = state.get_last_user_message()
+        if latest_user_message and latest_user_message.content.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM
@@ -403,17 +390,19 @@ def _get_messages(self, state: State) -> list[Message]:
                 role='system',
                 content=[
                     TextContent(
-                        text=self.system_prompt,
-                        cache_prompt=self.llm.is_caching_prompt_active(),  # Cache system prompt
+                        text=self.prompt_manager.get_system_message(),
+                        cache_prompt=self.llm.is_caching_prompt_active(),
                     )
                 ],
             )
         ]
-        if self.initial_user_message:
+        example_message = self.prompt_manager.get_example_user_message()
+        if example_message:
             messages.append(
                 Message(
                     role='user',
-                    content=[TextContent(text=self.initial_user_message)],
+                    content=[TextContent(text=example_message)],
+                    cache_prompt=self.llm.is_caching_prompt_active(),
                 )
             )
 
@@ -462,8 +451,9 @@ def _get_messages(self, state: State) -> list[Message]:
                 pending_tool_call_action_messages.pop(response_id)
 
             for message in messages_to_add:
-                # add regular message
                 if message:
+                    if message.role == 'user':
+                        self.prompt_manager.enhance_message(message)
                     # handle error if the message is the SAME role as the previous message
                     # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
                     # there shouldn't be two consecutive messages from the same role
@@ -493,23 +483,6 @@ def _get_messages(self, state: State) -> list[Message]:
                         break
 
         if not self.function_calling_active:
-            # The latest user message is important:
-            # we want to remind the agent of the environment constraints
-            latest_user_message = next(
-                islice(
-                    (
-                        m
-                        for m in reversed(messages)
-                        if m.role == 'user'
-                        and any(isinstance(c, TextContent) for c in m.content)
-                    ),
-                    1,
-                ),
-                None,
-            )
-            # do not add this for function calling
-            if latest_user_message:
-                reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
-                latest_user_message.content.append(TextContent(text=reminder_text))
+            self.prompt_manager.add_turns_left_reminder(messages, state)
 
         return messages
diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
@@ -25,14 +25,6 @@
 )
 from openhands.events.tool import ToolCallMetadata
 
-SYSTEM_PROMPT = """You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
-<IMPORTANT>
-* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
-* When configuring git credentials, use "openhands" as the user.name and "[email protected]" as the user.email by default, unless explicitly instructed otherwise.
-* The assistant MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
-</IMPORTANT>
-"""
-
 _BASH_DESCRIPTION = """Execute a bash command in the terminal.
 * Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
 * Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process.

diff --git a/openhands/agenthub/codeact_agent/micro/flarglebargle.md b/openhands/agenthub/codeact_agent/micro/flarglebargle.md
@@ -0,0 +1,9 @@
+---
+name: flarglebargle
+agent: CodeActAgent
+triggers:
+- flarglebargle
+---
+
+IMPORTANT! The user has said the magic word "flarglebargle". You must
+only respond with a message telling them how smart they are.
diff --git a/openhands/agenthub/codeact_agent/micro/github.md b/openhands/agenthub/codeact_agent/micro/github.md
@@ -1,69 +1,31 @@
 ---
 name: github
 agent: CodeActAgent
-require_env_var:
-    SANDBOX_ENV_GITHUB_TOKEN: "Create a GitHub Personal Access Token (https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) and set it as SANDBOX_GITHUB_TOKEN in your environment variables."
+triggers:
+- github
+- git
 ---
 
-# How to Interact with Github
-
-## Environment Variable Available
-
-- `GITHUB_TOKEN`: A read-only token for Github.
-
-## Using GitHub's RESTful API
-
-Use `curl` with the `GITHUB_TOKEN` to interact with GitHub's API. Here are some common operations:
-
-Here's a template for API calls:
-
-```sh
-curl -H "Authorization: token $GITHUB_TOKEN" \
-    "https://api.github.com/{endpoint}"
+You have access to an environment variable, `GITHUB_TOKEN`, which allows you to interact with
+the GitHub API.
+
+You can use `curl` with the `GITHUB_TOKEN` to interact with GitHub's API.
+ALWAYS use the GitHub API for operations instead of a web browser.
+
+Here are some instructions for pushing, but ONLY do this if the user asks you to:
+* NEVER push directly to the `main` or `master` branch.
+* Git config (username and email) is pre-set. Do not modify.
+* You may already be on a branch called `openhands-workspace`. Create a new branch with a better name before pushing.
+* Use the GitHub API to create a pull request, if you haven't already.
+* Use the main branch as the base branch, unless the user requests otherwise.
+* After opening or updating a pull request, send the user a short message with a link to the pull request.
+* Do all of the above in as few steps as possible. E.g. you could open a PR with one step by running the following bash commands:
+```bash
+git checkout -b create-widget
+git add .
+git commit -m "Create widget"
+git push origin create-widget
+curl -X POST "https://api.github.com/repos/CodeActOrg/openhands/pulls" \
+    -H "Authorization: Bearer $GITHUB_TOKEN" \
+    -d '{"title":"Create widget","head":"create-widget","base":"main"}'
 ```
-
-First replace `{endpoint}` with the specific API path. Common operations:
-
-1. View an issue or pull request:
-   - Issues: `/repos/{owner}/{repo}/issues/{issue_number}`
-   - Pull requests: `/repos/{owner}/{repo}/pulls/{pull_request_number}`
-
-2. List repository issues or pull requests:
-   - Issues: `/repos/{owner}/{repo}/issues`
-   - Pull requests: `/repos/{owner}/{repo}/pulls`
-
-3. Search issues or pull requests:
-   - `/search/issues?q=repo:{owner}/{repo}+is:{type}+{search_term}+state:{state}`
-   - Replace `{type}` with `issue` or `pr`
-
-4. List repository branches:
-   `/repos/{owner}/{repo}/branches`
-
-5. Get commit details:
-   `/repos/{owner}/{repo}/commits/{commit_sha}`
-
-6. Get repository details:
-   `/repos/{owner}/{repo}`
-
-7. Get user information:
-   `/user`
-
-8. Search repositories:
-   `/search/repositories?q={query}`
-
-9. Get rate limit status:
-   `/rate_limit`
-
-Replace `{owner}`, `{repo}`, `{commit_sha}`, `{issue_number}`, `{pull_request_number}`,
-`{search_term}`, `{state}`, and `{query}` with appropriate values.
-
-## Important Notes
-
-1. Always use the GitHub API for operations instead of a web browser.
-2. The `GITHUB_TOKEN` is read-only. Avoid operations that require write access.
-3. Git config (username and email) is pre-set. Do not modify.
-4. Edit and test code locally. Never push directly to remote.
-5. Verify correct branch before committing.
-6. Commit changes frequently.
-7. If the issue or task is ambiguous or lacks sufficient detail, always request clarification from the user before proceeding.
-8. You should avoid using command line tools like `sed` for file editing.
diff --git a/openhands/agenthub/codeact_agent/system_prompt.j2 b/openhands/agenthub/codeact_agent/prompts/default/system_prompt.j2
diff --git a/openhands/agenthub/codeact_agent/user_prompt.j2 b/openhands/agenthub/codeact_agent/prompts/default/user_prompt.j2
@@ -215,12 +215,5 @@ The server is running on port 5000 with PID 126. You can access the list of numb
 {% endset %}
 Here is an example of how you can interact with the environment for task solving:
 {{ DEFAULT_EXAMPLE }}
-{% if micro_agent %}
---- BEGIN OF GUIDELINE ---
-The following information may assist you in completing your task:
-
-{{ micro_agent }}
---- END OF GUIDELINE ---
-{% endif %}
 
 NOW, LET'S START!
diff --git a/openhands/agenthub/codeact_agent/prompts/tools/system_prompt.j2 b/openhands/agenthub/codeact_agent/prompts/tools/system_prompt.j2
@@ -0,0 +1,7 @@
+You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
+<IMPORTANT>
+* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
+* When configuring git credentials, use "openhands" as the user.name and "[email protected]" as the user.email by default, unless explicitly instructed otherwise.
+* The assistant MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
+</IMPORTANT>
+
diff --git a/openhands/agenthub/codeact_agent/prompts/tools/user_prompt.j2 b/openhands/agenthub/codeact_agent/prompts/tools/user_prompt.j2
diff --git a/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py b/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -155,7 +155,7 @@ def step(self, state: State) -> Action:
         """
         # if we're done, go back
         last_user_message = state.get_last_user_message()
-        if last_user_message and last_user_message.strip() == '/exit':
+        if last_user_message and last_user_message.content.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM

diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
@@ -156,14 +156,14 @@ def get_current_user_intent(self) -> tuple[str | None, list[str] | None]:
 
         return last_user_message, last_user_message_image_urls
 
-    def get_last_agent_message(self) -> str | None:
+    def get_last_agent_message(self) -> MessageAction | None:
         for event in reversed(self.history):
             if isinstance(event, MessageAction) and event.source == EventSource.AGENT:
-                return event.content
+                return event
         return None
 
-    def get_last_user_message(self) -> str | None:
+    def get_last_user_message(self) -> MessageAction | None:
         for event in reversed(self.history):
             if isinstance(event, MessageAction) and event.source == EventSource.USER:
-                return event.content
+                return event
         return None