From 3775ce213c9f1431b9c0bf991a4e44ca536de97c Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 13 Oct 2024 18:39:48 +0200
Subject: [PATCH 01/89] add memory-enabled agent

---
 openhands/agenthub/memcodeact_agent/README.md |  74 +++++
 .../agenthub/memcodeact_agent/__init__.py     |   3 +
 .../memcodeact_agent/action_parser.py         | 180 +++++++++++
 .../memcodeact_agent/memcodeact_agent.py      | 280 ++++++++++++++++++
 .../agenthub/memcodeact_agent/micro/github.md |  69 +++++
 .../memcodeact_agent/system_prompt.j2         |  53 ++++
 .../agenthub/memcodeact_agent/user_prompt.j2  | 205 +++++++++++++
 7 files changed, 864 insertions(+)
 create mode 100644 openhands/agenthub/memcodeact_agent/README.md
 create mode 100644 openhands/agenthub/memcodeact_agent/__init__.py
 create mode 100644 openhands/agenthub/memcodeact_agent/action_parser.py
 create mode 100644 openhands/agenthub/memcodeact_agent/memcodeact_agent.py
 create mode 100644 openhands/agenthub/memcodeact_agent/micro/github.md
 create mode 100644 openhands/agenthub/memcodeact_agent/system_prompt.j2
 create mode 100644 openhands/agenthub/memcodeact_agent/user_prompt.j2

diff --git a/openhands/agenthub/memcodeact_agent/README.md b/openhands/agenthub/memcodeact_agent/README.md
new file mode 100644
index 000000000000..1d7014d22119
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/README.md
@@ -0,0 +1,74 @@
+# MemCodeAct Agent
+
+## Introduction
+
+`memcodeact_agent` is a memory-enabled experimental agent built upon the foundation of the existing `codeact_agent`. Designed to enhance the capabilities of autonomous agents, `memcodeact_agent` incorporates advanced memory functionalities inspired by recent advancements in generative AI research. This agent leverages memory to improve task execution, context retention, and overall performance, making it more adept at handling complex and extended interactions.
+
+## Features
+
+- **Memory Integration:** Retains context across multiple interactions, allowing for more coherent and contextually aware responses.
+- **Enhanced Action Parsing:** Utilizes a memory-augmented action parser to interpret and execute complex commands effectively.
+- **Improved Task Management:** Manages and retrieves past actions and observations to inform current decision-making processes.
+- **Experimental Capabilities:** Serves as a platform for testing and refining memory-related functionalities in AI agents.
+
+## Inspiration and Research
+
+The development of `memcodeact_agent` is inspired by two pivotal research papers in the field of generative AI and memory-augmented models:
+
+1. **Extending Generative AI with Memory**
+   - **Paper:** [Extending Generative AI with Memory](https://arxiv.org/pdf/2304.03442)
+   - **Summary:** This paper explores methods to integrate long-term memory into generative AI models, enabling them to retain and utilize information from past interactions. The approach enhances the model's ability to maintain context over extended conversations, leading to more accurate and relevant outputs. Techniques such as memory slots, retrieval mechanisms, and memory encoding strategies are discussed to facilitate effective information storage and retrieval.
+
+2. **MemGPT: Memory-Enhanced GPT Models**
+   - **Paper:** [MemGPT: Memory-Enhanced GPT Models](https://arxiv.org/pdf/2310.08560)
+   - **Summary:** MemGPT introduces a novel architecture that incorporates external memory modules into GPT models. This integration allows the model to access and update its memory dynamically during interactions. The paper details the implementation of memory layers, attention mechanisms for memory retrieval, and training methodologies that enable the model to learn from both its internal parameters and external memory. The results demonstrate significant improvements in tasks requiring long-term dependency understanding and information recall.
+
+## Getting Started
+
+### Prerequisites
+
+- Python 3.10+
+- Required environment variables:
+  - `SANDBOX_ENV_GITHUB_TOKEN`: GitHub Personal Access Token with read-only permissions.
+
+### Installation
+
+1. **Clone the Repository:**
+   ```bash
+   git clone https://github.com/All-Hands-AI/OpenHands.git
+   ```
+
+2. **Navigate to the Agent Directory:**
+   ```bash
+   cd OpenHands/odie/openhands/agenthub/memcodeact_agent
+   ```
+
+3. **Install Dependencies:**
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+### Usage
+
+1. **Configure Environment Variables:**
+   Set the `SANDBOX_ENV_GITHUB_TOKEN` in your environment variables to enable GitHub interactions.
+
+2. **Run the Agent:**
+   ```bash
+   python memcodeact_agent.py
+   ```
+
+3. **Interact with the Agent:**
+   Follow the prompts to execute tasks. The agent will utilize its memory capabilities to provide more coherent and contextually aware responses.
+
+## Documentation
+
+For detailed information on how to interact with the agent, refer to the [User Prompt](user_prompt.j2) and [System Prompt](system_prompt.j2) templates located within the agent's directory. These templates define the conversational flow and the agent's capabilities.
+
+## Contribution
+
+`memcodeact_agent` is an experimental agent designed for research and development purposes. Contributions are welcome! Please ensure that any changes adhere to the project's coding standards and are accompanied by appropriate tests.
+
+## License
+
+This project is licensed under the MIT License. See the [LICENSE](../LICENSE) file for details.
diff --git a/openhands/agenthub/memcodeact_agent/__init__.py b/openhands/agenthub/memcodeact_agent/__init__.py
new file mode 100644
index 000000000000..900cdef86086
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/__init__.py
@@ -0,0 +1,3 @@
+from .memcodeact_agent import MemCodeActAgent
+
+__all__ = ['MemCodeActAgent']
diff --git a/openhands/agenthub/memcodeact_agent/action_parser.py b/openhands/agenthub/memcodeact_agent/action_parser.py
new file mode 100644
index 000000000000..c6bd55ff3c0a
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/action_parser.py
@@ -0,0 +1,180 @@
+import re
+
+from openhands.controller.action_parser import ActionParser, ResponseParser
+from openhands.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+
+
+class MemCodeActResponseParser(ResponseParser):
+    """Parser actions for MemCodeActAgent:
+    - CmdRunAction(command) - bash command to run
+    - IPythonRunCellAction(code) - IPython code to run
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.action_parsers = [
+            MemCodeActActionParserFinish(),
+            MemCodeActActionParserCmdRun(),
+            MemCodeActActionParserIPythonRunCell(),
+            MemCodeActActionParserAgentDelegate(),
+        ]
+        self.default_parser = MemCodeActActionParserMessage()
+
+    def parse(self, response) -> Action:
+        action_str = self.parse_response(response)
+        return self.parse_action(action_str)
+
+    def parse_response(self, response) -> str:
+        action = response.choices[0].message.content
+        if action is None:
+            return ''
+        for lang in ['bash', 'ipython', 'browse']:
+            # special handling for DeepSeek: it has stop-word bug and returns </execute_ipython instead of </execute_ipython>
+            if f'</execute_{lang}' in action and f'</execute_{lang}>' not in action:
+                action = action.replace(f'</execute_{lang}', f'</execute_{lang}>')
+
+            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
+                action += f'</execute_{lang}>'
+        return action
+
+    def parse_action(self, action_str: str) -> Action:
+        for action_parser in self.action_parsers:
+            if action_parser.check_condition(action_str):
+                return action_parser.parse(action_str)
+        return self.default_parser.parse(action_str)
+
+
+class MemCodeActActionParserFinish(ActionParser):
+    """Parser action:
+    - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        self.finish_command = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.finish_command = re.search(r'<finish>.*</finish>', action_str, re.DOTALL)
+        return self.finish_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.finish_command is not None
+        ), 'self.finish_command should not be None when parse is called'
+        thought = action_str.replace(self.finish_command.group(0), '').strip()
+        return AgentFinishAction(thought=thought)
+
+
+class MemCodeActActionParserCmdRun(ActionParser):
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        self.bash_command = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.bash_command = re.search(
+            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
+        )
+        return self.bash_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.bash_command is not None
+        ), 'self.bash_command should not be None when parse is called'
+        thought = action_str.replace(self.bash_command.group(0), '').strip()
+        # a command was found
+        command_group = self.bash_command.group(1).strip()
+        if command_group.strip() == 'exit':
+            return AgentFinishAction(thought=thought)
+        return CmdRunAction(command=command_group, thought=thought)
+
+
+class MemCodeActActionParserIPythonRunCell(ActionParser):
+    """Parser action:
+    - IPythonRunCellAction(code) - IPython code to run
+    """
+
+    def __init__(
+        self,
+    ):
+        self.python_code = None
+        self.jupyter_kernel_init_code: str = 'from agentskills import *'
+
+    def check_condition(self, action_str: str) -> bool:
+        self.python_code = re.search(
+            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
+        )
+        return self.python_code is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.python_code is not None
+        ), 'self.python_code should not be None when parse is called'
+        code_group = self.python_code.group(1).strip()
+        thought = action_str.replace(self.python_code.group(0), '').strip()
+        return IPythonRunCellAction(
+            code=code_group,
+            thought=thought,
+            kernel_init_code=self.jupyter_kernel_init_code,
+        )
+
+
+class MemCodeActActionParserAgentDelegate(ActionParser):
+    """Parser action:
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    """
+
+    def __init__(
+        self,
+    ):
+        self.agent_delegate = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.agent_delegate = re.search(
+            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
+        )
+        return self.agent_delegate is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.agent_delegate is not None
+        ), 'self.agent_delegate should not be None when parse is called'
+        thought = action_str.replace(self.agent_delegate.group(0), '').strip()
+        browse_actions = self.agent_delegate.group(1).strip()
+        task = f'{thought}. I should start with: {browse_actions}'
+        return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})
+
+
+class MemCodeActActionParserMessage(ActionParser):
+    """Parser action:
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """
+
+    def __init__(
+        self,
+    ):
+        pass
+
+    def check_condition(self, action_str: str) -> bool:
+        # We assume the LLM is GOOD enough that when it returns pure natural language
+        # it wants to talk to the user
+        return True
+
+    def parse(self, action_str: str) -> Action:
+        return MessageAction(content=action_str, wait_for_response=True)
diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
new file mode 100644
index 000000000000..36be2d42ae19
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -0,0 +1,280 @@
+import os
+from itertools import islice
+
+from openhands.agenthub.memcodeact_agent.action_parser import MemCodeActResponseParser
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.message import ImageContent, Message, TextContent
+from openhands.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+from openhands.events.observation import (
+    AgentDelegateObservation,
+    CmdOutputObservation,
+    IPythonRunCellObservation,
+    UserRejectObservation,
+)
+from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.observation import Observation
+from openhands.events.serialization.event import truncate_content
+from openhands.llm.llm import LLM
+from openhands.runtime.plugins import (
+    AgentSkillsRequirement,
+    JupyterRequirement,
+    PluginRequirement,
+)
+from openhands.utils.microagent import MicroAgent
+from openhands.utils.prompt import PromptManager
+
+
+class MemCodeActAgent(Agent):
+    VERSION = '1.9'
+    """
+    The MemCode Act Agent is a minimalist agent.
+    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
+
+    ### Overview
+
+    This agent implements the MemCodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
+
+    The conceptual idea is illustrated below. At each turn, the agent can:
+
+    1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
+    2. **MemCodeAct**: Choose to perform the task by executing code
+    - Execute any valid Linux `bash` command
+    - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
+
+    ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)
+
+    """
+
+    sandbox_plugins: list[PluginRequirement] = [
+        # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
+        # AgentSkillsRequirement provides a lot of Python functions,
+        # and it needs to be initialized before Jupyter for Jupyter to use those functions.
+        AgentSkillsRequirement(),
+        JupyterRequirement(),
+    ]
+
+    action_parser = MemCodeActResponseParser()
+
+    def __init__(
+        self,
+        llm: LLM,
+        config: AgentConfig,
+    ) -> None:
+        """Initializes a new instance of the MemCodeActAgent class.
+
+        Parameters:
+        - llm (LLM): The llm to be used by this agent
+        """
+        super().__init__(llm, config)
+        self.reset()
+
+        self.micro_agent = (
+            MicroAgent(
+                os.path.join(
+                    os.path.dirname(__file__), 'micro', f'{config.micro_agent_name}.md'
+                )
+            )
+            if config.micro_agent_name
+            else None
+        )
+
+        self.prompt_manager = PromptManager(
+            prompt_dir=os.path.join(os.path.dirname(__file__)),
+            agent_skills_docs=AgentSkillsRequirement.documentation,
+            micro_agent=self.micro_agent,
+        )
+
+    def action_to_str(self, action: Action) -> str:
+        if isinstance(action, CmdRunAction):
+            return (
+                f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
+            )
+        elif isinstance(action, IPythonRunCellAction):
+            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
+        elif isinstance(action, AgentDelegateAction):
+            return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
+        elif isinstance(action, MessageAction):
+            return action.content
+        elif isinstance(action, AgentFinishAction) and action.source == 'agent':
+            return action.thought
+        return ''
+
+    def get_action_message(self, action: Action) -> Message | None:
+        if (
+            isinstance(action, AgentDelegateAction)
+            or isinstance(action, CmdRunAction)
+            or isinstance(action, IPythonRunCellAction)
+            or isinstance(action, MessageAction)
+            or (isinstance(action, AgentFinishAction) and action.source == 'agent')
+        ):
+            content = [TextContent(text=self.action_to_str(action))]
+
+            if (
+                self.llm.vision_is_active()
+                and isinstance(action, MessageAction)
+                and action.images_urls
+            ):
+                content.append(ImageContent(image_urls=action.images_urls))
+
+            return Message(
+                role='user' if action.source == 'user' else 'assistant', content=content
+            )
+        return None
+
+    def get_observation_message(self, obs: Observation) -> Message | None:
+        max_message_chars = self.llm.config.max_message_chars
+        obs_prefix = 'OBSERVATION:\n'
+        if isinstance(obs, CmdOutputObservation):
+            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text += (
+                f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
+            )
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, IPythonRunCellObservation):
+            text = obs_prefix + obs.content
+            # replace base64 images with a placeholder
+            splitted = text.split('\n')
+            for i, line in enumerate(splitted):
+                if '![image](data:image/png;base64,' in line:
+                    splitted[i] = (
+                        '![image](data:image/png;base64, ...) already displayed to user'
+                    )
+            text = '\n'.join(splitted)
+            text = truncate_content(text, max_message_chars)
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, AgentDelegateObservation):
+            text = obs_prefix + truncate_content(
+                obs.outputs['content'] if 'content' in obs.outputs else '',
+                max_message_chars,
+            )
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, ErrorObservation):
+            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text += '\n[Error occurred in processing last action]'
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, UserRejectObservation):
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
+            text += '\n[Last action has been rejected by the user]'
+            return Message(role='user', content=[TextContent(text=text)])
+        else:
+            # If an observation message is not returned, it will cause an error
+            # when the LLM tries to return the next message
+            raise ValueError(f'Unknown observation type: {type(obs)}')
+
+    def reset(self) -> None:
+        """Resets the MemCodeAct Agent."""
+        super().reset()
+
+    def step(self, state: State) -> Action:
+        """Performs one step using the MemCodeAct Agent.
+        This includes gathering info on previous steps and prompting the model to make a command to execute.
+
+        Parameters:
+        - state (State): used to get updated info
+
+        Returns:
+        - CmdRunAction(command) - bash command to run
+        - IPythonRunCellAction(code) - IPython code to run
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
+        """
+        # if we're done, go back
+        latest_user_message = state.history.get_last_user_message()
+        if latest_user_message and latest_user_message.strip() == '/exit':
+            return AgentFinishAction()
+
+        # prepare what we want to send to the LLM
+        messages = self._get_messages(state)
+        params = {
+            'messages': self.llm.format_messages_for_llm(messages),
+            'stop': [
+                '</execute_ipython>',
+                '</execute_bash>',
+                '</execute_browse>',
+            ],
+        }
+
+        response = self.llm.completion(**params)
+
+        return self.action_parser.parse(response)
+
+    def _get_messages(self, state: State) -> list[Message]:
+        messages: list[Message] = [
+            Message(
+                role='system',
+                content=[
+                    TextContent(
+                        text=self.prompt_manager.system_message,
+                        cache_prompt=self.llm.is_caching_prompt_active(),  # Cache system prompt
+                    )
+                ],
+            ),
+            Message(
+                role='user',
+                content=[
+                    TextContent(
+                        text=self.prompt_manager.initial_user_message,
+                        cache_prompt=self.llm.is_caching_prompt_active(),  # if the user asks the same query,
+                    )
+                ],
+            ),
+        ]
+
+        for event in state.history.get_events():
+            # create a regular message from an event
+            if isinstance(event, Action):
+                message = self.get_action_message(event)
+            elif isinstance(event, Observation):
+                message = self.get_observation_message(event)
+            else:
+                raise ValueError(f'Unknown event type: {type(event)}')
+
+            # add regular message
+            if message:
+                # handle error if the message is the SAME role as the previous message
+                # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
+                # there shouldn't be two consecutive messages from the same role
+                if messages and messages[-1].role == message.role:
+                    messages[-1].content.extend(message.content)
+                else:
+                    messages.append(message)
+
+        # Add caching to the last 2 user messages
+        if self.llm.is_caching_prompt_active():
+            user_turns_processed = 0
+            for message in reversed(messages):
+                if message.role == 'user' and user_turns_processed < 2:
+                    message.content[
+                        -1
+                    ].cache_prompt = True  # Last item inside the message content
+                    user_turns_processed += 1
+
+        # The latest user message is important:
+        # we want to remind the agent of the environment constraints
+        latest_user_message = next(
+            islice(
+                (
+                    m
+                    for m in reversed(messages)
+                    if m.role == 'user'
+                    and any(isinstance(c, TextContent) for c in m.content)
+                ),
+                1,
+            ),
+            None,
+        )
+        if latest_user_message:
+            reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
+            latest_user_message.content.append(TextContent(text=reminder_text))
+
+        return messages
diff --git a/openhands/agenthub/memcodeact_agent/micro/github.md b/openhands/agenthub/memcodeact_agent/micro/github.md
new file mode 100644
index 000000000000..0e5a9b14ab9e
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/micro/github.md
@@ -0,0 +1,69 @@
+---
+name: github
+agent: MemCodeActAgent
+require_env_var:
+    SANDBOX_ENV_GITHUB_TOKEN: "Create a GitHub Personal Access Token (https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) and set it as SANDBOX_GITHUB_TOKEN in your environment variables."
+---
+
+# How to Interact with Github
+
+## Environment Variable Available
+
+- `GITHUB_TOKEN`: A read-only token for Github.
+
+## Using GitHub's RESTful API
+
+Use `curl` with the `GITHUB_TOKEN` to interact with GitHub's API. Here are some common operations:
+
+Here's a template for API calls:
+
+```sh
+curl -H "Authorization: token $GITHUB_TOKEN" \
+    "https://api.github.com/{endpoint}"
+```
+
+First replace `{endpoint}` with the specific API path. Common operations:
+
+1. View an issue or pull request:
+   - Issues: `/repos/{owner}/{repo}/issues/{issue_number}`
+   - Pull requests: `/repos/{owner}/{repo}/pulls/{pull_request_number}`
+
+2. List repository issues or pull requests:
+   - Issues: `/repos/{owner}/{repo}/issues`
+   - Pull requests: `/repos/{owner}/{repo}/pulls`
+
+3. Search issues or pull requests:
+   - `/search/issues?q=repo:{owner}/{repo}+is:{type}+{search_term}+state:{state}`
+   - Replace `{type}` with `issue` or `pr`
+
+4. List repository branches:
+   `/repos/{owner}/{repo}/branches`
+
+5. Get commit details:
+   `/repos/{owner}/{repo}/commits/{commit_sha}`
+
+6. Get repository details:
+   `/repos/{owner}/{repo}`
+
+7. Get user information:
+   `/user`
+
+8. Search repositories:
+   `/search/repositories?q={query}`
+
+9. Get rate limit status:
+   `/rate_limit`
+
+Replace `{owner}`, `{repo}`, `{commit_sha}`, `{issue_number}`, `{pull_request_number}`,
+`{search_term}`, `{state}`, and `{query}` with appropriate values.
+
+## Important Notes
+
+1. Always use the GitHub API for operations instead of a web browser.
+2. The `GITHUB_TOKEN` is read-only. Avoid operations that require write access.
+3. Git config (username and email) is pre-set. Do not modify.
+4. Edit and test code locally. Never push directly to remote.
+5. Verify correct branch before committing.
+6. Commit changes frequently.
+7. If the issue or task is ambiguous or lacks sufficient detail, always request clarification from the user before proceeding.
+8. You should avoid using command line tools like `sed` for file editing.
diff --git a/openhands/agenthub/memcodeact_agent/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/system_prompt.j2
new file mode 100644
index 000000000000..809ae873f677
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/system_prompt.j2
@@ -0,0 +1,53 @@
+{% set MINIMAL_SYSTEM_PREFIX %}
+A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions.
+The assistant can use a Python environment with <execute_ipython>, e.g.:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+The assistant can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
+If a bash command returns exit code `-1`, this means the process is not yet finished.
+The assistant must then send a second <execute_bash>. The second <execute_bash> can be empty
+(which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process,
+or it can contain the text `ctrl+c` to interrupt the process.
+
+For commands that may run indefinitely, the output should be redirected to a file and the command run
+in the background, e.g. <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
+If a command execution result says "Command timed out. Sending SIGINT to the process",
+the assistant should retry running the command in the background.
+{% endset %}
+{% set BROWSING_PREFIX %}
+The assistant can browse the Internet with <execute_browse> and </execute_browse>.
+For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
+Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
+{% endset %}
+{% set PIP_INSTALL_PREFIX %}
+The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
+{% endset %}
+{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
+{% set COMMAND_DOCS %}
+Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
+{{ agent_skills_docs }}
+IMPORTANT:
+- `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
+- The assistant shall adhere to THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRING PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write the line out, with all leading spaces before the code!
+- Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+- Any code issued should be less than 50 lines to avoid context being cut off!
+- After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
+- For `edit_file_by_replace` NEVER provide empty parameters!
+- For `edit_file_by_replace` the file must be read fully before any replacements!
+{% endset %}
+{% set SYSTEM_SUFFIX %}
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If the assistant is finished with the task you MUST include <finish></finish> in your response.
+IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
+The assistant should utilize full file paths and the `pwd` command to prevent path-related errors.
+The assistant MUST NOT apologize to the user or thank the user after running commands or editing files. It should only address the user in response to an explicit message from the user, or to ask for more information.
+The assistant MUST NOT push any changes to GitHub unless explicitly requested to do so.
+
+{% endset %}
+{# Combine all parts without newlines between them #}
+{{ SYSTEM_PREFIX -}}
+{{- COMMAND_DOCS -}}
+{{- SYSTEM_SUFFIX }}
diff --git a/openhands/agenthub/memcodeact_agent/user_prompt.j2 b/openhands/agenthub/memcodeact_agent/user_prompt.j2
new file mode 100644
index 000000000000..bf28dea9aa1f
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/user_prompt.j2
@@ -0,0 +1,205 @@
+{% set DEFAULT_EXAMPLE %}
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+<execute_ipython>
+create_file('app.py')
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+<execute_ipython>
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)"""
+
+insert_content_at_line(
+  'app.py',
+  1,
+  EDITED_CODE,
+)
+</execute_ipython>
+
+USER:
+OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+Get the content on "http://127.0.0.1:5000"
+</execute_browse>
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_ipython>
+open_file('app.py')
+</execute_ipython>
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+<finish></finish>
+
+--- END OF EXAMPLE ---
+{% endset %}
+Here is an example of how you can interact with the environment for task solving:
+{{ DEFAULT_EXAMPLE }}
+{% if micro_agent %}
+--- BEGIN OF GUIDELINE ---
+The following information may assist you in completing your task:
+
+{{ micro_agent }}
+--- END OF GUIDELINE ---
+{% endif %}
+
+NOW, LET'S START!

From 6f004c8467a38b94fdb5773f9f3699d52820b8ab Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 13 Oct 2024 21:14:48 +0200
Subject: [PATCH 02/89] register agent

---
 openhands/agenthub/memcodeact_agent/README.md | 43 +++----------------
 .../agenthub/memcodeact_agent/__init__.py     |  4 ++
 2 files changed, 11 insertions(+), 36 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/README.md b/openhands/agenthub/memcodeact_agent/README.md
index 1d7014d22119..a7f147379dec 100644
--- a/openhands/agenthub/memcodeact_agent/README.md
+++ b/openhands/agenthub/memcodeact_agent/README.md
@@ -27,39 +27,14 @@ The development of `memcodeact_agent` is inspired by two pivotal research papers
 
 ### Prerequisites
 
-- Python 3.10+
-- Required environment variables:
-  - `SANDBOX_ENV_GITHUB_TOKEN`: GitHub Personal Access Token with read-only permissions.
-
-### Installation
-
-1. **Clone the Repository:**
-   ```bash
-   git clone https://github.com/All-Hands-AI/OpenHands.git
-   ```
-
-2. **Navigate to the Agent Directory:**
-   ```bash
-   cd OpenHands/odie/openhands/agenthub/memcodeact_agent
-   ```
-
-3. **Install Dependencies:**
-   ```bash
-   pip install -r requirements.txt
-   ```
+- Configuration variables in `config.toml`, `agent.MemCodeactAgent` section:
+  - `micro_agent_name`: Name of the micro agent to use.
+  - `enable_memory`: Whether to enable long-term memory. Default is true for this agent.
+  - `cache_prompt`: Whether to cache the prompt. Default is false for this agent.
 
-### Usage
 
-1. **Configure Environment Variables:**
-   Set the `SANDBOX_ENV_GITHUB_TOKEN` in your environment variables to enable GitHub interactions.
-
-2. **Run the Agent:**
-   ```bash
-   python memcodeact_agent.py
-   ```
-
-3. **Interact with the Agent:**
-   Follow the prompts to execute tasks. The agent will utilize its memory capabilities to provide more coherent and contextually aware responses.
+- Optional environment variables:
+  - `SANDBOX_ENV_GITHUB_TOKEN`: GitHub Personal Access Token with read-only permissions.
 
 ## Documentation
 
@@ -67,8 +42,4 @@ For detailed information on how to interact with the agent, refer to the [User P
 
 ## Contribution
 
-`memcodeact_agent` is an experimental agent designed for research and development purposes. Contributions are welcome! Please ensure that any changes adhere to the project's coding standards and are accompanied by appropriate tests.
-
-## License
-
-This project is licensed under the MIT License. See the [LICENSE](../LICENSE) file for details.
+`memcodeact_agent` is an experimental agent designed for research and development purposes. Contributions are welcome!
diff --git a/openhands/agenthub/memcodeact_agent/__init__.py b/openhands/agenthub/memcodeact_agent/__init__.py
index 900cdef86086..bf78a81d8ba1 100644
--- a/openhands/agenthub/memcodeact_agent/__init__.py
+++ b/openhands/agenthub/memcodeact_agent/__init__.py
@@ -1,3 +1,7 @@
+from openhands.controller.agent import Agent
+
 from .memcodeact_agent import MemCodeActAgent
 
 __all__ = ['MemCodeActAgent']
+
+Agent.register('MemCodeActAgent', MemCodeActAgent)

From 97ef06f1c75e00f7b19ead7ce6a9b20d8b71f96f Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Fri, 4 Oct 2024 22:51:22 +0200
Subject: [PATCH 03/89] try to use a list of events as history (ATTN will
 require tricks with delegates)

fix things

async on_event
---
 openhands/agenthub/__init__.py                |   2 +
 openhands/agenthub/delegator_agent/agent.py   |   2 +-
 .../memcodeact_agent/memcodeact_agent.py      |  14 +-
 openhands/agenthub/planner_agent/agent.py     |   4 +-
 openhands/controller/agent.py                 |   7 +-
 openhands/controller/agent_controller.py      |   1 +
 openhands/events/stream.py                    |   1 +
 openhands/memory/conversation_memory.py       | 146 ++++++++++++++++++
 openhands/server/session/session.py           |   2 +
 9 files changed, 174 insertions(+), 5 deletions(-)
 create mode 100644 openhands/memory/conversation_memory.py

diff --git a/openhands/agenthub/__init__.py b/openhands/agenthub/__init__.py
index 0076976c27ed..370dcba4fab0 100644
--- a/openhands/agenthub/__init__.py
+++ b/openhands/agenthub/__init__.py
@@ -13,6 +13,7 @@
     codeact_swe_agent,
     delegator_agent,
     dummy_agent,
+    memcodeact_agent,
     planner_agent,
 )
 
@@ -23,6 +24,7 @@
     'delegator_agent',
     'dummy_agent',
     'browsing_agent',
+    'memcodeact_agent',
 ]
 
 for agent in all_microagents.values():
diff --git a/openhands/agenthub/delegator_agent/agent.py b/openhands/agenthub/delegator_agent/agent.py
index 29e0030423c7..7ae491307795 100644
--- a/openhands/agenthub/delegator_agent/agent.py
+++ b/openhands/agenthub/delegator_agent/agent.py
@@ -27,7 +27,7 @@ def step(self, state: State) -> Action:
         Otherwise, delegates the task to the next agent in the pipeline.
 
         Parameters:
-        - state (State): The current state given the previous actions and observations
+        - state: The current state given the previous actions and observations
 
         Returns:
         - AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned'
diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 36be2d42ae19..782e9f2fc5b8 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -5,6 +5,8 @@
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
+from openhands.core.config.memory_config import MemoryConfig
+from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
     Action,
@@ -24,6 +26,7 @@
 from openhands.events.observation.observation import Observation
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
+from openhands.memory.conversation_memory import ConversationMemory
 from openhands.runtime.plugins import (
     AgentSkillsRequirement,
     JupyterRequirement,
@@ -68,6 +71,7 @@ def __init__(
         self,
         llm: LLM,
         config: AgentConfig,
+        memory_config: MemoryConfig = None,
     ) -> None:
         """Initializes a new instance of the MemCodeActAgent class.
 
@@ -77,6 +81,10 @@ def __init__(
         super().__init__(llm, config)
         self.reset()
 
+        self.memory_config = memory_config
+
+        self.conversation_memory = ConversationMemory(self.memory_config)
+
         self.micro_agent = (
             MicroAgent(
                 os.path.join(
@@ -168,12 +176,16 @@ def get_observation_message(self, obs: Observation) -> Message | None:
         else:
             # If an observation message is not returned, it will cause an error
             # when the LLM tries to return the next message
-            raise ValueError(f'Unknown observation type: {type(obs)}')
+            logger.debug(f'Unknown observation type: {type(obs)}')
+            return None
 
     def reset(self) -> None:
         """Resets the MemCodeAct Agent."""
         super().reset()
 
+        # clean its history
+        self.conversation_memory.reset()
+
     def step(self, state: State) -> Action:
         """Performs one step using the MemCodeAct Agent.
         This includes gathering info on previous steps and prompting the model to make a command to execute.
diff --git a/openhands/agenthub/planner_agent/agent.py b/openhands/agenthub/planner_agent/agent.py
index f5aef523d9b9..9c81819e9229 100644
--- a/openhands/agenthub/planner_agent/agent.py
+++ b/openhands/agenthub/planner_agent/agent.py
@@ -20,7 +20,9 @@ def __init__(self, llm: LLM, config: AgentConfig):
         """Initialize the Planner Agent with an LLM
 
         Parameters:
-        - llm (LLM): The llm to be used by this agent
+        - llm: The llm to be used by this agent
+        - config: The agent config
+        - memory: The memory for this agent
         """
         super().__init__(llm, config)
 
diff --git a/openhands/controller/agent.py b/openhands/controller/agent.py
index cffdbbf22d35..18c1bf9246ba 100644
--- a/openhands/controller/agent.py
+++ b/openhands/controller/agent.py
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Type
 
+from openhands.memory.conversation_memory import ConversationMemory
+
 if TYPE_CHECKING:
     from openhands.controller.state.state import State
     from openhands.core.config import AgentConfig
@@ -19,11 +21,12 @@ class Agent(ABC):
     This abstract base class is an general interface for an agent dedicated to
     executing a specific instruction and allowing human interaction with the
     agent during execution.
-    It tracks the execution status and maintains a history of interactions.
+    It tracks the execution status and maintains a reference to the conversation memory.
     """
 
     _registry: dict[str, Type['Agent']] = {}
     sandbox_plugins: list[PluginRequirement] = []
+    memory: ConversationMemory
 
     def __init__(
         self,
@@ -55,7 +58,7 @@ def reset(self) -> None:
         to prepare the agent for restarting the instruction or cleaning up before destruction.
 
         """
-        # TODO clear history
+        self.memory.reset()
         self._complete = False
 
         if self.llm:
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 1ea55edeb9d9..ad637f7a62cc 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -347,6 +347,7 @@ async def start_delegate(self, action: AgentDelegateAction):
         Args:
             action (AgentDelegateAction): The action containing information about the delegate agent to start.
         """
+        # prepare the required arguments for the delegate agent: llm, agent_config, memory
         agent_cls: Type[Agent] = Agent.get_cls(action.agent)
         agent_config = self.agent_configs.get(action.agent, self.agent.config)
         llm_config = self.agent_to_llm_config.get(action.agent, self.agent.llm.config)
diff --git a/openhands/events/stream.py b/openhands/events/stream.py
index b66720227836..b3ebbaf2308e 100644
--- a/openhands/events/stream.py
+++ b/openhands/events/stream.py
@@ -19,6 +19,7 @@ class EventStreamSubscriber(str, Enum):
     RUNTIME = 'runtime'
     MAIN = 'main'
     TEST = 'test'
+    MEMORY = 'memory'
 
 
 class EventStream:
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
new file mode 100644
index 000000000000..376a1a00a78f
--- /dev/null
+++ b/openhands/memory/conversation_memory.py
@@ -0,0 +1,146 @@
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.action import Action
+from openhands.events.action.agent import (
+    AgentDelegateAction,
+    AgentFinishAction,
+)
+from openhands.events.action.message import MessageAction
+from openhands.events.event import Event, EventSource
+from openhands.events.observation.delegate import AgentDelegateObservation
+from openhands.events.observation.observation import Observation
+
+
+class ConversationMemory:
+    """A list of events in the immediate memory of the agent.
+
+    This class provides methods to retrieve and filter the events in the history of the running agent.
+    """
+
+    _history: list[Event]
+    delegates: dict[tuple[int, int], tuple[str, str]]
+
+    def __init__(self, history: list[Event]):
+        self._history = history
+
+        self.delegates = {}
+        self.start_id = len(self._history) - 1
+
+    def get_events(self, reverse: bool = False) -> list[Event]:
+        """Retrieve and return events for agent's use as a list of Event objects. Whether it includes delegates is up to the agent controller that initialized state.history."""
+
+        return self._history if not reverse else list(reversed(self._history))
+
+    def get_last_events(self, n: int) -> list[Event]:
+        """Return the last n events from the history."""
+
+        end_id = len(self._history) - 1
+
+        # FIXME this ignores that there are events that won't be returned, like NullObservations
+        start_id = max(self.start_id, end_id - n + 1)
+
+        return list(event for event in self._history[start_id:end_id])
+
+    async def on_event(self, event: Event):
+        if not isinstance(event, AgentDelegateObservation):
+            return
+
+        logger.debug('AgentDelegateObservation received')
+
+        # figure out what this delegate's actions were
+        # from the last AgentDelegateAction to this AgentDelegateObservation
+        # and save their ids as start and end ids
+        # in order to use later to exclude them from parent stream
+        # or summarize them
+        delegate_end = event.id
+        delegate_start = -1
+        delegate_agent: str = ''
+        delegate_task: str = ''
+        for prev_event in self._history[event.id - 1 :: -1]:
+            if isinstance(prev_event, AgentDelegateAction):
+                delegate_start = prev_event.id
+                delegate_agent = prev_event.agent
+                delegate_task = prev_event.inputs.get('task', '')
+                break
+
+        if delegate_start == -1:
+            logger.error(
+                f'No AgentDelegateAction found for AgentDelegateObservation with id={delegate_end}'
+            )
+            return
+
+        self.delegates[(delegate_start, delegate_end)] = (delegate_agent, delegate_task)
+        logger.debug(
+            f'Delegate {delegate_agent} with task {delegate_task} ran from id={delegate_start} to id={delegate_end}'
+        )
+
+    def reset(self):
+        self.delegates = {}
+
+        # wipe history of previous interactions
+        # alternatively, we can re-initialize a new event stream, then we need to notify everyone who is subscribed to this event stream
+        self._history = []
+
+    def get_current_user_intent(self):
+        """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
+        last_user_message = None
+        last_user_message_image_urls: list[str] | None = []
+        for event in self._history[::-1]:
+            if isinstance(event, MessageAction) and event.source == EventSource.USER:
+                last_user_message = event.content
+                last_user_message_image_urls = event.images_urls
+            elif isinstance(event, AgentFinishAction):
+                if last_user_message is not None:
+                    return last_user_message
+
+        return last_user_message, last_user_message_image_urls
+
+    def get_last_action(self, end_id: int = -1) -> Action | None:
+        """Return the last action from history, filtered to exclude unwanted events."""
+
+        last_action = next(
+            (event for event in self._history if isinstance(event, Action)),
+            None,
+        )
+
+        return last_action
+
+    def get_last_observation(self, end_id: int = -1) -> Observation | None:
+        """Return the last observation from history, filtered to exclude unwanted events."""
+
+        last_observation = next(
+            (
+                event
+                for event in self._history[end_id::-1]
+                if isinstance(event, Observation)
+            ),
+            None,
+        )
+
+        return last_observation
+
+    def get_last_user_message(self) -> str:
+        """Return the content of the last user message from history."""
+        last_user_message = next(
+            (
+                event.content
+                for event in self._history
+                if isinstance(event, MessageAction) and event.source == EventSource.USER
+            ),
+            None,
+        )
+
+        return last_user_message if last_user_message is not None else ''
+
+    def get_last_agent_message(self) -> str:
+        """Return the content of the last agent message from the event stream."""
+        last_agent_message = next(
+            (
+                event.content
+                for event in self._history
+                if isinstance(event, MessageAction)
+                and event.source == EventSource.AGENT
+            ),
+            None,
+        )
+
+        return last_agent_message if last_agent_message is not None else ''
diff --git a/openhands/server/session/session.py b/openhands/server/session/session.py
index 94606d085c79..e656c6c908a7 100644
--- a/openhands/server/session/session.py
+++ b/openhands/server/session/session.py
@@ -77,6 +77,7 @@ async def _initialize_agent(self, data: dict):
         self.agent_session.event_stream.add_event(
             AgentStateChangedObservation('', AgentState.LOADING), EventSource.AGENT
         )
+
         # Extract the agent-relevant arguments from the request
         args = {key: value for key, value in data.get('args', {}).items()}
         agent_cls = args.get(ConfigType.AGENT, self.config.default_agent)
@@ -101,6 +102,7 @@ async def _initialize_agent(self, data: dict):
 
         # TODO: override other LLM config & agent config groups (#2075)
 
+        # set up the required arguments for the agent
         llm = LLM(config=self.config.get_llm_config_from_agent(agent_cls))
         agent_config = self.config.get_agent_config(agent_cls)
         agent = Agent.get_cls(agent_cls)(llm, agent_config)

From 21f82e102d9528d7609a3855df21df704175d0ed Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 00:28:24 +0200
Subject: [PATCH 04/89] reset branch, tweak stream.py

---
 openhands/events/stream.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/openhands/events/stream.py b/openhands/events/stream.py
index 8cff229c0518..c8daf3214ce4 100644
--- a/openhands/events/stream.py
+++ b/openhands/events/stream.py
@@ -78,6 +78,18 @@ def get_events(
         reverse=False,
         filter_out_type: tuple[type[Event], ...] | None = None,
     ) -> Iterable[Event]:
+        """
+        Retrieve events from the event stream, optionally filtering out events of a given type.
+
+        Args:
+            start_id: The ID of the first event to retrieve. Defaults to 0.
+            end_id: The ID of the last event to retrieve. Defaults to the last event in the stream.
+            reverse: Whether to retrieve events in reverse order. Defaults to False.
+            filter_out_type: A tuple of event types to filter out. Typically used to filter out backend events from the agent.
+
+        Yields:
+            Events from the stream that match the criteria.
+        """
         if reverse:
             if end_id is None:
                 end_id = self._cur_id - 1

From abda3f4fc3fecc75e538f3c89e29f476321dec20 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 01:55:09 +0200
Subject: [PATCH 05/89] wip refactor methods

---
 openhands/controller/agent_controller.py | 66 +++++++++++++++++++++++-
 openhands/memory/history.py              | 53 -------------------
 2 files changed, 64 insertions(+), 55 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index bbbddba2eb2c..0a0765966e1a 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -37,7 +37,8 @@
     ErrorObservation,
     Observation,
 )
-from openhands.events.serialization.event import truncate_content
+from openhands.events.serialization.event import event_to_dict, truncate_content
+from openhands.events.utils import get_pairs_from_events
 from openhands.llm.llm import LLM
 from openhands.runtime.utils.shutdown_listener import should_continue
 
@@ -120,6 +121,10 @@ def __init__(
     async def close(self):
         """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream."""
         await self.set_agent_state_to(AgentState.STOPPED)
+
+        # save trajectories if applicable
+
+        # unsubscribe from the event stream
         self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER)
 
     def update_state_before_step(self):
@@ -245,7 +250,7 @@ async def _handle_observation(self, observation: Observation):
         if isinstance(observation, CmdOutputObservation):
             return
         elif isinstance(observation, AgentDelegateObservation):
-            self.state.history.on_event(observation)
+            self._handle_delegate_observation(observation)
         elif isinstance(observation, ErrorObservation):
             if self.state.agent_state == AgentState.ERROR:
                 self.state.metrics.merge(self.state.local_metrics)
@@ -265,6 +270,49 @@ async def _handle_message_action(self, action: MessageAction):
         elif action.source == EventSource.AGENT and action.wait_for_response:
             await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
 
+    def _handle_delegate_observation(self, observation: Observation):
+        """Handles delegate observations from the event stream.
+
+        Args:
+            observation (Observation): The observation to handle.
+        """
+        if not isinstance(observation, AgentDelegateObservation):
+            return
+
+        logger.debug('AgentDelegateObservation received')
+
+        # figure out what this delegate's actions were
+        # from the last AgentDelegateAction to this AgentDelegateObservation
+        # and save their ids as start and end ids
+        # in order to use later to exclude them from parent stream or summarize them instead
+        delegate_end = observation.id
+        delegate_start = -1
+        delegate_agent: str = ''
+        delegate_task: str = ''
+        for prev_event in self.event_stream.get_events(
+            end_id=observation.id - 1, reverse=True
+        ):
+            # retrieve the last AgentDelegateAction before this observation
+            if isinstance(prev_event, AgentDelegateAction):
+                delegate_start = prev_event.id
+                delegate_agent = prev_event.agent
+                delegate_task = prev_event.inputs.get('task', '')
+                break
+
+        if delegate_start == -1:
+            logger.error(
+                f'No AgentDelegateAction found for AgentDelegateObservation with id={delegate_end}'
+            )
+            return
+
+        self.state.delegates[(delegate_start, delegate_end)] = (
+            delegate_agent,
+            delegate_task,
+        )
+        logger.debug(
+            f'Delegate {delegate_agent} with task {delegate_task} ran from id={delegate_start} to id={delegate_end}'
+        )
+
     def reset_task(self):
         """Resets the agent's task."""
 
@@ -607,6 +655,20 @@ def _is_stuck(self):
 
         return self._stuck_detector.is_stuck()
 
+    # history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
+    # we rebuild the pairs here
+    # for compatibility with the existing output format in evaluations
+    # remove this when it's no longer necessary
+    def compatibility_for_eval_history_pairs(self) -> list[tuple[dict, dict]]:
+        history_pairs = []
+
+        for action, observation in get_pairs_from_events(
+            self.event_stream.get_events(include_delegates=True)
+        ):
+            history_pairs.append((event_to_dict(action), event_to_dict(observation)))
+
+        return history_pairs
+
     def __repr__(self):
         return (
             f'AgentController(id={self.id}, agent={self.agent!r}, '
diff --git a/openhands/memory/history.py b/openhands/memory/history.py
index 1e4cfb8b5f05..f61f47a11d85 100644
--- a/openhands/memory/history.py
+++ b/openhands/memory/history.py
@@ -1,9 +1,7 @@
 from typing import ClassVar, Iterable
 
-from openhands.core.logger import openhands_logger as logger
 from openhands.events.action.action import Action
 from openhands.events.action.agent import (
-    AgentDelegateAction,
     ChangeAgentStateAction,
 )
 from openhands.events.action.empty import NullAction
@@ -13,9 +11,7 @@
 from openhands.events.observation.delegate import AgentDelegateObservation
 from openhands.events.observation.empty import NullObservation
 from openhands.events.observation.observation import Observation
-from openhands.events.serialization.event import event_to_dict
 from openhands.events.stream import EventStream
-from openhands.events.utils import get_pairs_from_events
 
 
 class ShortTermHistory(list[Event]):
@@ -173,52 +169,3 @@ def has_delegation(self) -> bool:
             if isinstance(event, AgentDelegateObservation):
                 return True
         return False
-
-    def on_event(self, event: Event):
-        if not isinstance(event, AgentDelegateObservation):
-            return
-
-        logger.debug('AgentDelegateObservation received')
-
-        # figure out what this delegate's actions were
-        # from the last AgentDelegateAction to this AgentDelegateObservation
-        # and save their ids as start and end ids
-        # in order to use later to exclude them from parent stream
-        # or summarize them
-        delegate_end = event.id
-        delegate_start = -1
-        delegate_agent: str = ''
-        delegate_task: str = ''
-        for prev_event in self._event_stream.get_events(
-            end_id=event.id - 1, reverse=True
-        ):
-            if isinstance(prev_event, AgentDelegateAction):
-                delegate_start = prev_event.id
-                delegate_agent = prev_event.agent
-                delegate_task = prev_event.inputs.get('task', '')
-                break
-
-        if delegate_start == -1:
-            logger.error(
-                f'No AgentDelegateAction found for AgentDelegateObservation with id={delegate_end}'
-            )
-            return
-
-        self.delegates[(delegate_start, delegate_end)] = (delegate_agent, delegate_task)
-        logger.debug(
-            f'Delegate {delegate_agent} with task {delegate_task} ran from id={delegate_start} to id={delegate_end}'
-        )
-
-    # TODO remove me when unnecessary
-    # history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
-    # we rebuild the pairs here
-    # for compatibility with the existing output format in evaluations
-    def compatibility_for_eval_history_pairs(self) -> list[tuple[dict, dict]]:
-        history_pairs = []
-
-        for action, observation in get_pairs_from_events(
-            self.get_events_as_list(include_delegates=True)
-        ):
-            history_pairs.append((event_to_dict(action), event_to_dict(observation)))
-
-        return history_pairs

From 0a7fb43e4f6248f5d25e46d54a80edd7deb1063b Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 04:05:23 +0200
Subject: [PATCH 06/89] move compatibility method to evals

---
 evaluation/EDA/run_infer.py                 |   3 +-
 evaluation/agent_bench/run_infer.py         |   3 +-
 evaluation/aider_bench/run_infer.py         |   3 +-
 evaluation/biocoder/run_infer.py            |   3 +-
 evaluation/bird/run_infer.py                |   3 +-
 evaluation/browsing_delegation/run_infer.py |   3 +-
 evaluation/gaia/run_infer.py                |   3 +-
 evaluation/gorilla/run_infer.py             |   3 +-
 evaluation/gpqa/run_infer.py                |   3 +-
 evaluation/humanevalfix/run_infer.py        |   3 +-
 evaluation/logic_reasoning/run_infer.py     |   3 +-
 evaluation/miniwob/run_infer.py             |   3 +-
 evaluation/mint/run_infer.py                |   3 +-
 evaluation/ml_bench/run_infer.py            |   3 +-
 evaluation/swe_bench/process_evals.py       | 194 ++++++++++++++++++++
 evaluation/toolqa/run_infer.py              |   3 +-
 evaluation/utils/shared.py                  |  19 ++
 evaluation/webarena/run_infer.py            |   3 +-
 openhands/controller/agent_controller.py    |  17 +-
 openhands/controller/state/state.py         |   1 +
 20 files changed, 247 insertions(+), 32 deletions(-)
 create mode 100644 evaluation/swe_bench/process_evals.py

diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
index 81c7455e0041..304981f5559c 100644
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -8,6 +8,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -146,7 +147,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py
index b851f86fa464..ddca35f3921a 100644
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -16,6 +16,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -269,7 +270,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     metrics = state.metrics.get() if state.metrics else None
 
diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py
index b4698a7c69a2..f445a719c97b 100644
--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -15,6 +15,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -245,7 +246,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
     metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py
index 35740892912f..45e7671d6809 100644
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -13,6 +13,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -298,7 +299,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     test_result['generated'] = test_result['metadata']['1_copy_change_code']
 
diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py
index aae58cc7d593..74e4cd301c38 100644
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -16,6 +16,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -429,7 +430,7 @@ def execute_sql(db_path, sql):
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/browsing_delegation/run_infer.py b/evaluation/browsing_delegation/run_infer.py
index c9fe2ebd18bc..5c1ab8c062e3 100644
--- a/evaluation/browsing_delegation/run_infer.py
+++ b/evaluation/browsing_delegation/run_infer.py
@@ -9,6 +9,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -89,7 +90,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # find the last delegate action
     last_delegate_action = None
diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py
index 9f6f3884f5fb..51519b88494e 100644
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -12,6 +12,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -201,7 +202,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
index ac49a8078d5b..93fb15d21992 100644
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -10,6 +10,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -112,7 +113,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     output = EvalOutput(
         instance_id=instance_id,
diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py
index fe7ff4bf1c18..98b4e2a6a5cc 100644
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -28,6 +28,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -299,7 +300,7 @@ def process_instance(
         instance_id=str(instance.instance_id),
         instruction=instruction,
         metadata=metadata,
-        history=state.history.compatibility_for_eval_history_pairs(),
+        history=compatibility_for_eval_history_pairs(state.history),
         metrics=metrics,
         error=state.last_error if state and state.last_error else None,
         test_result={
diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py
index 002fdd1e42e2..6175255c1b21 100644
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -21,6 +21,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -253,7 +254,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py
index 7fa6a5bb50da..f13c4674ff74 100644
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -8,6 +8,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -245,7 +246,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
index 1df7dac028f0..8f21f826eb7f 100644
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -10,6 +10,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -162,7 +163,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py
index 481336d59de4..0f113ef9d9d5 100644
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -13,6 +13,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -200,7 +201,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py
index 671c6350a489..5389fba0e7d6 100644
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -24,6 +24,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -254,7 +255,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/swe_bench/process_evals.py b/evaluation/swe_bench/process_evals.py
new file mode 100644
index 000000000000..3202d10de758
--- /dev/null
+++ b/evaluation/swe_bench/process_evals.py
@@ -0,0 +1,194 @@
+import json
+import os
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+
+
+# Placeholder for litellm's get_tokens function
+def get_tokens(text: str) -> int:
+    """
+    Returns the number of tokens in the given text.
+    Replace this function with litellm's actual get_tokens implementation.
+    """
+    # Example implementation (to be replaced)
+    return len(text.split())
+
+
+class EvalOutput:
+    def __init__(
+        self,
+        instance_id: str,
+        instruction: str,
+        instance: Dict[str, Any],
+        test_result: Dict[str, Any],
+        metadata: Dict[str, Any],
+        history: List[Dict[str, Any]],
+        metrics: Optional[Dict[str, Any]] = None,
+        llm_completions: Optional[List[Any]] = None,
+        error: Optional[str] = None,
+    ):
+        self.instance_id = instance_id
+        self.instruction = instruction
+        self.instance = instance
+        self.test_result = test_result
+        self.metadata = metadata
+        self.history = history
+        self.metrics = metrics
+        self.llm_completions = llm_completions or []
+        self.error = error
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> 'EvalOutput':
+        return EvalOutput(
+            instance_id=data.get('instance_id', ''),
+            instruction=data.get('instruction', ''),
+            instance=data.get('instance', {}),
+            test_result=data.get('test_result', {}),
+            metadata=data.get('metadata', {}),
+            history=data.get('history', []),
+            metrics=data.get('metrics', None),
+            llm_completions=data.get('llm_completions', []),
+            error=data.get('error', None),
+        )
+
+
+def load_eval_outputs(jsonl_path: str) -> List[EvalOutput]:
+    eval_outputs = []
+    with open(jsonl_path, 'r') as file:
+        content = file.read()
+        try:
+            # Try parsing as a single JSON object
+            data = json.loads(content)
+            eval_output = EvalOutput.from_dict(data)
+            eval_outputs.append(eval_output)
+        except json.JSONDecodeError:
+            # If single JSON parse fails, try parsing as JSON Lines
+            file.seek(0)
+            for line_number, line in enumerate(file, start=1):
+                if line.strip():  # Skip empty lines
+                    try:
+                        data = json.loads(line)
+                        eval_output = EvalOutput.from_dict(data)
+                        eval_outputs.append(eval_output)
+                    except json.JSONDecodeError as e:
+                        print(
+                            f'Failed to parse line {line_number} in {jsonl_path}: {e}'
+                        )
+                        print(
+                            f'Problematic line: {line[:100]}...'
+                        )  # Print first 100 chars of the line
+
+    if not eval_outputs:
+        print(f'Warning: No valid data found in {jsonl_path}')
+
+    return eval_outputs
+
+
+def process_llm_completions(eval_output: EvalOutput) -> List[Dict[str, Any]]:
+    """
+    Processes the llm_completions of an EvalOutput to extract prompts (including system prompt) and responses.
+    Handles both dictionary and string content formats.
+
+    Args:
+        eval_output (EvalOutput): The evaluation output instance.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing 'system_prompt', 'user_prompts', and 'response'.
+    """
+    completions = []
+    for comp in eval_output.llm_completions:
+        messages = comp.get('messages', [])
+        response_content = comp.get('response', {}).get('choices', [])
+
+        # Initialize prompts and response
+        system_prompt = ''
+        user_prompts = []
+        response = ''
+
+        # Extract system prompt and user prompts
+        for message in messages:
+            role = message.get('role')
+            content = message.get('content', [])
+
+            if role == 'system':
+                system_prompt = _extract_text_content(content)
+            elif role == 'user':
+                user_prompts.append(_extract_text_content(content))
+
+        # Extract the assistant's response
+        if response_content and len(response_content) > 0:
+            response = response_content[0].get('message', {}).get('content', '').strip()
+
+        completions.append(
+            {
+                'system_prompt': system_prompt,
+                'user_prompts': user_prompts,
+                'response': response,
+            }
+        )
+    return completions
+
+
+def _extract_text_content(content):
+    if isinstance(content, list):
+        return '\n'.join(
+            item.get('text', '').strip()
+            for item in content
+            if isinstance(item, dict) and item.get('type') == 'text'
+        )
+    elif isinstance(content, str):
+        return content.strip()
+    return ''
+
+
+def create_results_dataframe(eval_outputs: List[EvalOutput]) -> pd.DataFrame:
+    records = []
+    for eo in eval_outputs:
+        completions = process_llm_completions(eo)
+        for comp in completions:
+            prompt = comp['prompt']
+            response = comp['response']
+            token_count = get_tokens(prompt)
+            records.append(
+                {
+                    'instance_id': eo.instance_id,
+                    'prompt': prompt,
+                    'response': response,
+                    'token_count': token_count,
+                }
+            )
+    df = pd.DataFrame(records)
+    return df
+
+
+def main():
+    swe_bench_dir = 'evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/'
+    results = []
+
+    # Traverse through all subdirectories to find output.jsonl files
+    for root, dirs, files in os.walk(swe_bench_dir):
+        for file in files:
+            if file.endswith('output.jsonl') or file.endswith('output-pretty.jsonl'):
+                jsonl_path = os.path.join(root, file)
+                print(f'Processing file: {jsonl_path}')
+                try:
+                    eval_outputs = load_eval_outputs(jsonl_path)
+                    if eval_outputs:
+                        df = create_results_dataframe(eval_outputs)
+                        results.append(df)
+                    else:
+                        print(f'No valid data found in {jsonl_path}')
+                except Exception as e:
+                    print(f'Error processing {jsonl_path}: {e}')
+
+    if results:
+        final_df = pd.concat(results, ignore_index=True)
+        final_df.to_csv('swe_bench_evaluation_results.csv', index=False)
+        print('Results saved to swe_bench_evaluation_results.csv')
+    else:
+        print('No valid data found in any of the processed files.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
index 8b3ebdc58b28..da36af2b5788 100644
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -9,6 +9,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -135,7 +136,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index bed679f342a2..dd13bcb420e3 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -18,6 +18,9 @@
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import Action
 from openhands.events.action.message import MessageAction
+from openhands.events.serialization.event import event_to_dict
+from openhands.memory.history import ShortTermHistory
+from openhands.utils import get_pairs_from_events
 
 
 class EvalMetadata(BaseModel):
@@ -412,3 +415,19 @@ def reset_logger_for_multiprocessing(
     )
     file_handler.setLevel(logging.INFO)
     logger.addHandler(file_handler)
+
+    # history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
+    # we rebuild the pairs here
+    # for compatibility with the existing output format in evaluations
+    # remove this when it's no longer necessary
+    def compatibility_for_eval_history_pairs(
+        history: ShortTermHistory,
+    ) -> list[tuple[dict, dict]]:
+        history_pairs = []
+
+        for action, observation in get_pairs_from_events(
+            history.get_events(include_delegates=True)
+        ):
+            history_pairs.append((event_to_dict(action), event_to_dict(observation)))
+
+        return history_pairs
diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py
index 26637e00be38..878cedc77d5c 100644
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -10,6 +10,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -176,7 +177,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 0a0765966e1a..a6ac74228121 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -37,8 +37,7 @@
     ErrorObservation,
     Observation,
 )
-from openhands.events.serialization.event import event_to_dict, truncate_content
-from openhands.events.utils import get_pairs_from_events
+from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
 from openhands.runtime.utils.shutdown_listener import should_continue
 
@@ -655,20 +654,6 @@ def _is_stuck(self):
 
         return self._stuck_detector.is_stuck()
 
-    # history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
-    # we rebuild the pairs here
-    # for compatibility with the existing output format in evaluations
-    # remove this when it's no longer necessary
-    def compatibility_for_eval_history_pairs(self) -> list[tuple[dict, dict]]:
-        history_pairs = []
-
-        for action, observation in get_pairs_from_events(
-            self.event_stream.get_events(include_delegates=True)
-        ):
-            history_pairs.append((event_to_dict(action), event_to_dict(observation)))
-
-        return history_pairs
-
     def __repr__(self):
         return (
             f'AgentController(id={self.id}, agent={self.agent!r}, '
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index 274df03879e5..58fe7fb62fab 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -94,6 +94,7 @@ class State:
     start_id: int = -1
     end_id: int = -1
     almost_stuck: int = 0
+    delegates: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
     # NOTE: This will never be used by the controller, but it can be used by different
     # evaluation tasks to store extra data needed to track the progress/state of the task.
     extra_data: dict[str, Any] = field(default_factory=dict)

From 981335ce965b780942c82cd3a102f0c5326b5820 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 15:35:33 +0200
Subject: [PATCH 07/89] retrieve history in the controller

---
 openhands/controller/agent_controller.py | 72 ++++++++++++++++++------
 openhands/controller/state/state.py      | 32 +----------
 openhands/memory/history.py              |  2 -
 3 files changed, 58 insertions(+), 48 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index a6ac74228121..2e1d9eec4977 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -123,6 +123,8 @@ async def close(self):
 
         # save trajectories if applicable
 
+        # make sure the history in state is complete (with delegates)
+
         # unsubscribe from the event stream
         self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER)
 
@@ -130,6 +132,44 @@ def update_state_before_step(self):
         self.state.iteration += 1
         self.state.local_iteration += 1
 
+        # get the history from the event stream
+        start_id = self.state.start_id if self.state.start_id != -1 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id != -1
+            else self.event_stream.get_latest_event_id()
+        )
+        self.state.history = list(
+            self.event_stream.get_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+            )
+        )
+
+        # do not include events between delegate actions and observations:
+        # include in history the delegate action and observation themselves
+        if self.state.delegates_ids:
+            for (start_id, end_id), (
+                delegate_agent,
+                delegate_task,
+            ) in self.state.delegates_ids.items():
+                # sanity checks
+                if (
+                    start_id < 0
+                    or end_id < 1
+                    or start_id >= end_id
+                    or end_id >= len(self.state.history)
+                ):
+                    logger.error(
+                        f'Invalid delegate ids: {start_id}, {end_id}. Skipping...'
+                    )
+                    continue
+                self.state.history = (
+                    self.state.history[: start_id + 1]
+                    + self.state.history[end_id - 1 :]
+                )
+
     async def update_state_after_step(self):
         # update metrics especially for cost. Use deepcopy to avoid it being modified by agent.reset()
         self.state.local_metrics = copy.deepcopy(self.agent.llm.metrics)
@@ -285,6 +325,11 @@ def _handle_delegate_observation(self, observation: Observation):
         # and save their ids as start and end ids
         # in order to use later to exclude them from parent stream or summarize them instead
         delegate_end = observation.id
+        if delegate_end <= 0:
+            logger.error(
+                f'The id of the AgentDelegateObservation is not valid: {delegate_end}'
+            )
+            return
         delegate_start = -1
         delegate_agent: str = ''
         delegate_task: str = ''
@@ -304,7 +349,7 @@ def _handle_delegate_observation(self, observation: Observation):
             )
             return
 
-        self.state.delegates[(delegate_start, delegate_end)] = (
+        self.state.delegates_ids[(delegate_start, delegate_end)] = (
             delegate_agent,
             delegate_task,
         )
@@ -622,25 +667,18 @@ def set_initial_state(
         else:
             self.state = state
 
-        # when restored from a previous session, the State object will have history, start_id, and end_id
-        # connect it to the event stream
-        self.state.history.set_event_stream(self.event_stream)
+        # FIXME when restored from a previous session, the State object needs to have:
+        # - history? let's go with nope
+        # - start_id, and end_id
+        # - delegates_ids
 
         # if start_id was not set in State, we're starting fresh, at the top of the stream
-        start_id = self.state.start_id
-        if start_id == -1:
-            start_id = self.event_stream.get_latest_event_id() + 1
+        if self.state.start_id <= -1:
+            self.state.start_id = self.event_stream.get_latest_event_id() + 1
         else:
-            logger.debug(f'AgentController {self.id} restoring from event {start_id}')
-
-        # make sure history is in sync
-        self.state.start_id = start_id
-        self.state.history.start_id = start_id
-
-        # if there was an end_id saved in State, set it in history
-        # currently not used, later useful for delegates
-        if self.state.end_id > -1:
-            self.state.history.end_id = self.state.end_id
+            logger.debug(
+                f'AgentController {self.id} restoring from event {self.state.start_id}'
+            )
 
     def _is_stuck(self):
         """Checks if the agent or its delegate is stuck in a loop.
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index 58fe7fb62fab..a836d3bf153f 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -12,7 +12,7 @@
     MessageAction,
 )
 from openhands.events.action.agent import AgentFinishAction
-from openhands.memory.history import ShortTermHistory
+from openhands.events.event import Event
 from openhands.storage.files import FileStore
 
 
@@ -77,7 +77,7 @@ class State:
     # max number of iterations for the current task
     max_iterations: int = 100
     confirmation_mode: bool = False
-    history: ShortTermHistory = field(default_factory=ShortTermHistory)
+    history: list[Event] = field(default_factory=list)
     inputs: dict = field(default_factory=dict)
     outputs: dict = field(default_factory=dict)
     last_error: str | None = None
@@ -94,7 +94,7 @@ class State:
     start_id: int = -1
     end_id: int = -1
     almost_stuck: int = 0
-    delegates: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
+    delegates_ids: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
     # NOTE: This will never be used by the controller, but it can be used by different
     # evaluation tasks to store extra data needed to track the progress/state of the task.
     extra_data: dict[str, Any] = field(default_factory=dict)
@@ -132,32 +132,6 @@ def restore_from_session(sid: str, file_store: FileStore) -> 'State':
         state.agent_state = AgentState.LOADING
         return state
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-
-        # save the relevant data from recent history
-        # so that we can restore it when the state is restored
-        if 'history' in state:
-            state['start_id'] = state['history'].start_id
-            state['end_id'] = state['history'].end_id
-
-        # don't save history object itself
-        state.pop('history', None)
-        return state
-
-    def __setstate__(self, state):
-        self.__dict__.update(state)
-
-        # recreate the history object
-        if not hasattr(self, 'history'):
-            self.history = ShortTermHistory()
-
-        # restore the relevant data in history from the state
-        self.history.start_id = self.start_id
-        self.history.end_id = self.end_id
-
-        # remove the restored data from the state if any
-
     def get_current_user_intent(self):
         """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
         last_user_message = None
diff --git a/openhands/memory/history.py b/openhands/memory/history.py
index f61f47a11d85..755b656e115a 100644
--- a/openhands/memory/history.py
+++ b/openhands/memory/history.py
@@ -51,8 +51,6 @@ def get_events(
         include_hidden=False,
     ) -> Iterable[Event]:
         """Return the events as a stream of Event objects."""
-        # TODO handle AgentRejectAction, if it's not part of a chunk ending with an AgentDelegateObservation
-        # or even if it is, because currently we don't add it to the summary
 
         # iterate from start_id to end_id, or reverse
         start_id = self.start_id if self.start_id != -1 else 0

From 1ee26d7437291615e67b294c74fc137ddd433248 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 15:48:34 +0200
Subject: [PATCH 08/89] adapt code to list

---
 .../agenthub/browsing_agent/browsing_agent.py |  4 +-
 .../agenthub/codeact_agent/codeact_agent.py   | 15 +++-
 .../codeact_swe_agent/codeact_swe_agent.py    | 11 ++-
 openhands/agenthub/delegator_agent/agent.py   |  8 +-
 openhands/agenthub/dummy_agent/agent.py       |  2 +-
 openhands/agenthub/planner_agent/prompt.py    |  2 +-
 openhands/controller/agent_controller.py      |  8 +-
 openhands/controller/state/state.py           |  4 +-
 openhands/controller/stuck.py                 |  2 +-
 openhands/memory/history.py                   | 84 +------------------
 10 files changed, 37 insertions(+), 103 deletions(-)

diff --git a/openhands/agenthub/browsing_agent/browsing_agent.py b/openhands/agenthub/browsing_agent/browsing_agent.py
index 0460506d04f3..822677bab526 100644
--- a/openhands/agenthub/browsing_agent/browsing_agent.py
+++ b/openhands/agenthub/browsing_agent/browsing_agent.py
@@ -150,13 +150,13 @@ def step(self, state: State) -> Action:
         last_obs = None
         last_action = None
 
-        if EVAL_MODE and len(state.history.get_events_as_list()) == 1:
+        if EVAL_MODE and len(state.history) == 1:
             # for webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
             # initialize and retrieve the first observation by issuing an noop OP
             # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
             return BrowseInteractiveAction(browser_actions='noop()')
 
-        for event in state.history.get_events():
+        for event in state.history:
             if isinstance(event, BrowseInteractiveAction):
                 prev_actions.append(event.browser_actions)
                 last_action = event
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 4db072395ac9..0779b085e016 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -5,6 +5,7 @@
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
+from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
     Action,
@@ -168,7 +169,8 @@ def get_observation_message(self, obs: Observation) -> Message | None:
         else:
             # If an observation message is not returned, it will cause an error
             # when the LLM tries to return the next message
-            raise ValueError(f'Unknown observation type: {type(obs)}')
+            logger.warning(f'Unknown observation type: {type(obs)}')
+            return None
 
     def reset(self) -> None:
         """Resets the CodeAct Agent."""
@@ -189,8 +191,13 @@ def step(self, state: State) -> Action:
         - AgentFinishAction() - end the interaction
         """
         # if we're done, go back
-        latest_user_message = state.history.get_last_user_message()
-        if latest_user_message and latest_user_message.strip() == '/exit':
+        last_user_message_content = None
+        for event in reversed(state.history):
+            if isinstance(event, MessageAction) and event.source == 'user':
+                last_user_message_content = event.content
+                break
+
+        if last_user_message_content and last_user_message_content.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM
@@ -230,7 +237,7 @@ def _get_messages(self, state: State) -> list[Message]:
             ),
         ]
 
-        for event in state.history.get_events():
+        for event in state.history:
             # create a regular message from an event
             if isinstance(event, Action):
                 message = self.get_action_message(event)
diff --git a/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py b/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
index d35acb63c810..130fca5fcba5 100644
--- a/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -156,8 +156,13 @@ def step(self, state: State) -> Action:
         - AgentFinishAction() - end the interaction
         """
         # if we're done, go back
-        latest_user_message = state.history.get_last_user_message()
-        if latest_user_message and latest_user_message.strip() == '/exit':
+        last_user_message_content = None
+        for event in reversed(state.history):
+            if isinstance(event, MessageAction) and event.source == 'user':
+                last_user_message_content = event.content
+                break
+
+        if last_user_message_content and last_user_message_content.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM
@@ -178,7 +183,7 @@ def _get_messages(self, state: State) -> list[Message]:
             Message(role='user', content=[TextContent(text=self.in_context_example)]),
         ]
 
-        for event in state.history.get_events():
+        for event in state.history:
             # create a regular message from an event
             if isinstance(event, Action):
                 message = self.get_action_message(event)
diff --git a/openhands/agenthub/delegator_agent/agent.py b/openhands/agenthub/delegator_agent/agent.py
index 29e0030423c7..7cb987c8c3f7 100644
--- a/openhands/agenthub/delegator_agent/agent.py
+++ b/openhands/agenthub/delegator_agent/agent.py
@@ -2,7 +2,7 @@
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
 from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction
-from openhands.events.observation import AgentDelegateObservation
+from openhands.events.observation import AgentDelegateObservation, Observation
 from openhands.llm.llm import LLM
 
 
@@ -41,7 +41,11 @@ def step(self, state: State) -> Action:
             )
 
         # last observation in history should be from the delegate
-        last_observation = state.history.get_last_observation()
+        last_observation = None
+        for event in reversed(state.history):
+            if isinstance(event, Observation):
+                last_observation = event
+                break
 
         if not isinstance(last_observation, AgentDelegateObservation):
             raise Exception('Last observation is not an AgentDelegateObservation')
diff --git a/openhands/agenthub/dummy_agent/agent.py b/openhands/agenthub/dummy_agent/agent.py
index dbe4c60cfafa..272e6c935f2e 100644
--- a/openhands/agenthub/dummy_agent/agent.py
+++ b/openhands/agenthub/dummy_agent/agent.py
@@ -164,7 +164,7 @@ def step(self, state: State) -> Action:
 
             if 'observations' in prev_step and prev_step['observations']:
                 expected_observations = prev_step['observations']
-                hist_events = state.history.get_last_events(len(expected_observations))
+                hist_events = state.history[-len(expected_observations) :]
 
                 if len(hist_events) < len(expected_observations):
                     print(
diff --git a/openhands/agenthub/planner_agent/prompt.py b/openhands/agenthub/planner_agent/prompt.py
index 495cae80b47e..e096a2a4f6f3 100644
--- a/openhands/agenthub/planner_agent/prompt.py
+++ b/openhands/agenthub/planner_agent/prompt.py
@@ -136,7 +136,7 @@ def get_prompt_and_images(
     latest_action: Action = NullAction()
 
     # retrieve the latest HISTORY_SIZE events
-    for event_count, event in enumerate(state.history.get_events(reverse=True)):
+    for event_count, event in enumerate(reversed(state.history)):
         if event_count >= HISTORY_SIZE:
             break
         if latest_action == NullAction() and isinstance(event, Action):
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 2e1d9eec4977..96d04c9eb5c3 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -149,11 +149,11 @@ def update_state_before_step(self):
 
         # do not include events between delegate actions and observations:
         # include in history the delegate action and observation themselves
-        if self.state.delegates_ids:
+        if self.state.delegates:
             for (start_id, end_id), (
                 delegate_agent,
                 delegate_task,
-            ) in self.state.delegates_ids.items():
+            ) in self.state.delegates.items():
                 # sanity checks
                 if (
                     start_id < 0
@@ -349,7 +349,7 @@ def _handle_delegate_observation(self, observation: Observation):
             )
             return
 
-        self.state.delegates_ids[(delegate_start, delegate_end)] = (
+        self.state.delegates[(delegate_start, delegate_end)] = (
             delegate_agent,
             delegate_task,
         )
@@ -669,7 +669,7 @@ def set_initial_state(
 
         # FIXME when restored from a previous session, the State object needs to have:
         # - history? let's go with nope
-        # - start_id, and end_id
+        # - start_id, end_id
         # - delegates_ids
 
         # if start_id was not set in State, we're starting fresh, at the top of the stream
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index a836d3bf153f..0080b06c101d 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -94,7 +94,7 @@ class State:
     start_id: int = -1
     end_id: int = -1
     almost_stuck: int = 0
-    delegates_ids: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
+    delegates: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
     # NOTE: This will never be used by the controller, but it can be used by different
     # evaluation tasks to store extra data needed to track the progress/state of the task.
     extra_data: dict[str, Any] = field(default_factory=dict)
@@ -136,7 +136,7 @@ def get_current_user_intent(self):
         """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
         last_user_message = None
         last_user_message_image_urls: list[str] | None = []
-        for event in self.history.get_events(reverse=True):
+        for event in reversed(self.history):
             if isinstance(event, MessageAction) and event.source == 'user':
                 last_user_message = event.content
                 last_user_message_image_urls = event.images_urls
diff --git a/openhands/controller/stuck.py b/openhands/controller/stuck.py
index 8b968e3f4c88..68365c69009a 100644
--- a/openhands/controller/stuck.py
+++ b/openhands/controller/stuck.py
@@ -28,7 +28,7 @@ def is_stuck(self):
         # filter out MessageAction with source='user' from history
         filtered_history = [
             event
-            for event in self.state.history.get_events()
+            for event in self.state.history
             if not (
                 (isinstance(event, MessageAction) and event.source == EventSource.USER)
                 or
diff --git a/openhands/memory/history.py b/openhands/memory/history.py
index 755b656e115a..49128baa02c9 100644
--- a/openhands/memory/history.py
+++ b/openhands/memory/history.py
@@ -1,16 +1,13 @@
 from typing import ClassVar, Iterable
 
-from openhands.events.action.action import Action
 from openhands.events.action.agent import (
     ChangeAgentStateAction,
 )
 from openhands.events.action.empty import NullAction
-from openhands.events.action.message import MessageAction
-from openhands.events.event import Event, EventSource
+from openhands.events.event import Event
 from openhands.events.observation.agent import AgentStateChangedObservation
 from openhands.events.observation.delegate import AgentDelegateObservation
 from openhands.events.observation.empty import NullObservation
-from openhands.events.observation.observation import Observation
 from openhands.events.stream import EventStream
 
 
@@ -83,85 +80,6 @@ def get_events(
             elif include_delegates:
                 yield event
 
-    def get_last_action(self, end_id: int = -1) -> Action | None:
-        """Return the last action from the event stream, filtered to exclude unwanted events."""
-        # from end_id in reverse, find the first action
-        end_id = self._event_stream.get_latest_event_id() if end_id == -1 else end_id
-
-        last_action = next(
-            (
-                event
-                for event in self._event_stream.get_events(
-                    end_id=end_id, reverse=True, filter_out_type=self.filter_out
-                )
-                if isinstance(event, Action)
-            ),
-            None,
-        )
-
-        return last_action
-
-    def get_last_observation(self, end_id: int = -1) -> Observation | None:
-        """Return the last observation from the event stream, filtered to exclude unwanted events."""
-        # from end_id in reverse, find the first observation
-        end_id = self._event_stream.get_latest_event_id() if end_id == -1 else end_id
-
-        last_observation = next(
-            (
-                event
-                for event in self._event_stream.get_events(
-                    end_id=end_id, reverse=True, filter_out_type=self.filter_out
-                )
-                if isinstance(event, Observation)
-            ),
-            None,
-        )
-
-        return last_observation
-
-    def get_last_user_message(self) -> str:
-        """Return the content of the last user message from the event stream."""
-        last_user_message = next(
-            (
-                event.content
-                for event in self._event_stream.get_events(reverse=True)
-                if isinstance(event, MessageAction) and event.source == EventSource.USER
-            ),
-            None,
-        )
-
-        return last_user_message if last_user_message is not None else ''
-
-    def get_last_agent_message(self) -> str:
-        """Return the content of the last agent message from the event stream."""
-        last_agent_message = next(
-            (
-                event.content
-                for event in self._event_stream.get_events(reverse=True)
-                if isinstance(event, MessageAction)
-                and event.source == EventSource.AGENT
-            ),
-            None,
-        )
-
-        return last_agent_message if last_agent_message is not None else ''
-
-    def get_last_events(self, n: int) -> list[Event]:
-        """Return the last n events from the event stream."""
-        # dummy agent is using this
-        # it should work, but it's not great to store temporary lists now just for a test
-        end_id = self._event_stream.get_latest_event_id()
-        start_id = max(0, end_id - n + 1)
-
-        return list(
-            event
-            for event in self._event_stream.get_events(
-                start_id=start_id,
-                end_id=end_id,
-                filter_out_type=self.filter_out,
-            )
-        )
-
     def has_delegation(self) -> bool:
         for event in self._event_stream.get_events():
             if isinstance(event, AgentDelegateObservation):

From 267f3befa19b7d0adf31f6d16fac8cddd4d2ba97 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 16:23:35 +0200
Subject: [PATCH 09/89] add filter by hidden

---
 openhands/agenthub/micro/agent.py        |  8 ++--
 openhands/controller/agent_controller.py | 14 +++++-
 openhands/events/stream.py               | 27 ++++++-----
 openhands/memory/history.py              | 57 ------------------------
 4 files changed, 32 insertions(+), 74 deletions(-)

diff --git a/openhands/agenthub/micro/agent.py b/openhands/agenthub/micro/agent.py
index 83225a3245cd..a9b0825afd9d 100644
--- a/openhands/agenthub/micro/agent.py
+++ b/openhands/agenthub/micro/agent.py
@@ -8,10 +8,10 @@
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.core.utils import json
 from openhands.events.action import Action
+from openhands.events.event import Event
 from openhands.events.serialization.action import action_from_dict
 from openhands.events.serialization.event import event_to_memory
 from openhands.llm.llm import LLM
-from openhands.memory.history import ShortTermHistory
 
 
 def parse_response(orig_response: str) -> Action:
@@ -32,16 +32,14 @@ class MicroAgent(Agent):
     prompt = ''
     agent_definition: dict = {}
 
-    def history_to_json(
-        self, history: ShortTermHistory, max_events: int = 20, **kwargs
-    ):
+    def history_to_json(self, history: list[Event], max_events: int = 20, **kwargs):
         """
         Serialize and simplify history to str format
         """
         processed_history = []
         event_count = 0
 
-        for event in history.get_events(reverse=True):
+        for event in reversed(history):
             if event_count >= max_events:
                 break
             processed_history.append(
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 96d04c9eb5c3..dfd279dcbff1 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -1,7 +1,7 @@
 import asyncio
 import copy
 import traceback
-from typing import Type
+from typing import ClassVar, Type
 
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State, TrafficControlState
@@ -35,6 +35,7 @@
     AgentStateChangedObservation,
     CmdOutputObservation,
     ErrorObservation,
+    NullObservation,
     Observation,
 )
 from openhands.events.serialization.event import truncate_content
@@ -60,6 +61,12 @@ class AgentController:
     parent: 'AgentController | None' = None
     delegate: 'AgentController | None' = None
     _pending_action: Action | None = None
+    filter_out: ClassVar[tuple[type[Event], ...]] = (
+        NullAction,
+        NullObservation,
+        ChangeAgentStateAction,
+        AgentStateChangedObservation,
+    )
 
     def __init__(
         self,
@@ -132,7 +139,8 @@ def update_state_before_step(self):
         self.state.iteration += 1
         self.state.local_iteration += 1
 
-        # get the history from the event stream
+        # get the history from the event stream, filtering out event types
+        # that should not be sent to the agent, and hidden events
         start_id = self.state.start_id if self.state.start_id != -1 else 0
         end_id = (
             self.state.end_id
@@ -144,6 +152,8 @@ def update_state_before_step(self):
                 start_id=start_id,
                 end_id=end_id,
                 reverse=False,
+                filter_out_type=self.filter_out,
+                filter_hidden=True,
             )
         )
 
diff --git a/openhands/events/stream.py b/openhands/events/stream.py
index c8daf3214ce4..f9d6fede7711 100644
--- a/openhands/events/stream.py
+++ b/openhands/events/stream.py
@@ -73,19 +73,22 @@ def _get_id_from_filename(filename: str) -> int:
 
     def get_events(
         self,
-        start_id=0,
-        end_id=None,
-        reverse=False,
+        start_id: int = 0,
+        end_id: int | None = None,
+        reverse: bool = False,
         filter_out_type: tuple[type[Event], ...] | None = None,
+        filter_hidden: bool = False,
     ) -> Iterable[Event]:
         """
-        Retrieve events from the event stream, optionally filtering out events of a given type.
+        Retrieve events from the event stream, optionally filtering out events of a given type
+        and events marked as hidden.
 
         Args:
             start_id: The ID of the first event to retrieve. Defaults to 0.
             end_id: The ID of the last event to retrieve. Defaults to the last event in the stream.
             reverse: Whether to retrieve events in reverse order. Defaults to False.
             filter_out_type: A tuple of event types to filter out. Typically used to filter out backend events from the agent.
+            filter_hidden: If True, filters out events with the 'hidden' attribute set to True.
 
         Yields:
             Events from the stream that match the criteria.
@@ -97,9 +100,11 @@ def get_events(
             while event_id >= start_id:
                 try:
                     event = self.get_event(event_id)
-                    if filter_out_type is None or not isinstance(
-                        event, filter_out_type
-                    ):
+                    # apply type and 'hidden' filters
+                    if (
+                        filter_out_type is None
+                        or not isinstance(event, filter_out_type)
+                    ) and (not filter_hidden or not getattr(event, 'hidden', False)):
                         yield event
                 except FileNotFoundError:
                     logger.debug(f'No event found for ID {event_id}')
@@ -111,9 +116,11 @@ def get_events(
                     break
                 try:
                     event = self.get_event(event_id)
-                    if filter_out_type is None or not isinstance(
-                        event, filter_out_type
-                    ):
+                    # apply type and 'hidden' filters
+                    if (
+                        filter_out_type is None
+                        or not isinstance(event, filter_out_type)
+                    ) and (not filter_hidden or not getattr(event, 'hidden', False)):
                         yield event
                 except FileNotFoundError:
                     break
diff --git a/openhands/memory/history.py b/openhands/memory/history.py
index 49128baa02c9..a911515fd9cb 100644
--- a/openhands/memory/history.py
+++ b/openhands/memory/history.py
@@ -1,13 +1,5 @@
-from typing import ClassVar, Iterable
-
-from openhands.events.action.agent import (
-    ChangeAgentStateAction,
-)
-from openhands.events.action.empty import NullAction
 from openhands.events.event import Event
-from openhands.events.observation.agent import AgentStateChangedObservation
 from openhands.events.observation.delegate import AgentDelegateObservation
-from openhands.events.observation.empty import NullObservation
 from openhands.events.stream import EventStream
 
 
@@ -21,12 +13,6 @@ class ShortTermHistory(list[Event]):
     end_id: int
     _event_stream: EventStream
     delegates: dict[tuple[int, int], tuple[str, str]]
-    filter_out: ClassVar[tuple[type[Event], ...]] = (
-        NullAction,
-        NullObservation,
-        ChangeAgentStateAction,
-        AgentStateChangedObservation,
-    )
 
     def __init__(self):
         super().__init__()
@@ -37,49 +23,6 @@ def __init__(self):
     def set_event_stream(self, event_stream: EventStream):
         self._event_stream = event_stream
 
-    def get_events_as_list(self, include_delegates: bool = False) -> list[Event]:
-        """Return the history as a list of Event objects."""
-        return list(self.get_events(include_delegates=include_delegates))
-
-    def get_events(
-        self,
-        reverse: bool = False,
-        include_delegates: bool = False,
-        include_hidden=False,
-    ) -> Iterable[Event]:
-        """Return the events as a stream of Event objects."""
-
-        # iterate from start_id to end_id, or reverse
-        start_id = self.start_id if self.start_id != -1 else 0
-        end_id = (
-            self.end_id
-            if self.end_id != -1
-            else self._event_stream.get_latest_event_id()
-        )
-
-        for event in self._event_stream.get_events(
-            start_id=start_id,
-            end_id=end_id,
-            reverse=reverse,
-            filter_out_type=self.filter_out,
-        ):
-            if not include_hidden and hasattr(event, 'hidden') and event.hidden:
-                continue
-            # TODO add summaries
-            # and filter out events that were included in a summary
-
-            # filter out the events from a delegate of the current agent
-            if not include_delegates and not any(
-                # except for the delegate action and observation themselves, currently
-                # AgentDelegateAction has id = delegate_start
-                # AgentDelegateObservation has id = delegate_end
-                delegate_start < event.id < delegate_end
-                for delegate_start, delegate_end in self.delegates.keys()
-            ):
-                yield event
-            elif include_delegates:
-                yield event
-
     def has_delegation(self) -> bool:
         for event in self._event_stream.get_events():
             if isinstance(event, AgentDelegateObservation):

From 9e5659c507f413c3cb4e5a0a6eb96f06540dc737 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 16:42:44 +0200
Subject: [PATCH 10/89] remove history.py

---
 openhands/controller/state/state.py |  7 +++++++
 openhands/memory/history.py         | 30 -----------------------------
 tests/integration/test_agent.py     |  2 +-
 3 files changed, 8 insertions(+), 31 deletions(-)
 delete mode 100644 openhands/memory/history.py

diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index 0080b06c101d..6abdd6c4b064 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -13,6 +13,7 @@
 )
 from openhands.events.action.agent import AgentFinishAction
 from openhands.events.event import Event
+from openhands.events.observation import AgentDelegateObservation
 from openhands.storage.files import FileStore
 
 
@@ -145,3 +146,9 @@ def get_current_user_intent(self):
                     return last_user_message
 
         return last_user_message, last_user_message_image_urls
+
+    def has_delegation(self) -> bool:
+        for event in self.history:
+            if isinstance(event, AgentDelegateObservation):
+                return True
+        return False
diff --git a/openhands/memory/history.py b/openhands/memory/history.py
deleted file mode 100644
index a911515fd9cb..000000000000
--- a/openhands/memory/history.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from openhands.events.event import Event
-from openhands.events.observation.delegate import AgentDelegateObservation
-from openhands.events.stream import EventStream
-
-
-class ShortTermHistory(list[Event]):
-    """A list of events that represents the short-term memory of the agent.
-
-    This class provides methods to retrieve and filter the events in the history of the running agent from the event stream.
-    """
-
-    start_id: int
-    end_id: int
-    _event_stream: EventStream
-    delegates: dict[tuple[int, int], tuple[str, str]]
-
-    def __init__(self):
-        super().__init__()
-        self.start_id = -1
-        self.end_id = -1
-        self.delegates = {}
-
-    def set_event_stream(self, event_stream: EventStream):
-        self._event_stream = event_stream
-
-    def has_delegation(self) -> bool:
-        for event in self._event_stream.get_events():
-            if isinstance(event, AgentDelegateObservation):
-                return True
-        return False
diff --git a/tests/integration/test_agent.py b/tests/integration/test_agent.py
index a86c4f6492c1..0df0ae0f236e 100644
--- a/tests/integration/test_agent.py
+++ b/tests/integration/test_agent.py
@@ -61,7 +61,7 @@ def validate_final_state(final_state: State | None, test_name: str):
     assert num_of_conversations > 0
     # we mock the cost of every conversation to be 1 USD
     # assert int(final_state.metrics.accumulated_cost) == num_of_conversations
-    if final_state.history.has_delegation():
+    if final_state.has_delegation():
         assert final_state.iteration > final_state.local_iteration
     else:
         assert final_state.local_iteration == final_state.iteration

From 6fc615fd18b00fbb7b99d9e4f59629e407039eb4 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 18:01:23 +0200
Subject: [PATCH 11/89] fix types

---
 openhands/agenthub/planner_agent/prompt.py |  2 +-
 openhands/controller/state/state.py        |  4 ++--
 openhands/events/action/message.py         |  2 +-
 openhands/runtime/client/runtime.py        | 16 ++++++++++++++--
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/openhands/agenthub/planner_agent/prompt.py b/openhands/agenthub/planner_agent/prompt.py
index e096a2a4f6f3..eb8f95341b95 100644
--- a/openhands/agenthub/planner_agent/prompt.py
+++ b/openhands/agenthub/planner_agent/prompt.py
@@ -117,7 +117,7 @@ def get_hint(latest_action_id: str) -> str:
 
 def get_prompt_and_images(
     state: State, max_message_chars: int
-) -> tuple[str, list[str]]:
+) -> tuple[str, list[str] | None]:
     """Gets the prompt for the planner agent.
 
     Formatted with the most recent action-observation pairs, current task, and hint based on last action
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index 6abdd6c4b064..ad4a57f2c6d6 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -133,7 +133,7 @@ def restore_from_session(sid: str, file_store: FileStore) -> 'State':
         state.agent_state = AgentState.LOADING
         return state
 
-    def get_current_user_intent(self):
+    def get_current_user_intent(self) -> tuple[str | None, list[str] | None]:
         """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
         last_user_message = None
         last_user_message_image_urls: list[str] | None = []
@@ -143,7 +143,7 @@ def get_current_user_intent(self):
                 last_user_message_image_urls = event.images_urls
             elif isinstance(event, AgentFinishAction):
                 if last_user_message is not None:
-                    return last_user_message
+                    return last_user_message, None
 
         return last_user_message, last_user_message_image_urls
 
diff --git a/openhands/events/action/message.py b/openhands/events/action/message.py
index 55fb21f359d3..0e3bb26a1cc2 100644
--- a/openhands/events/action/message.py
+++ b/openhands/events/action/message.py
@@ -7,7 +7,7 @@
 @dataclass
 class MessageAction(Action):
     content: str
-    images_urls: list | None = None
+    images_urls: list[str] | None = None
     wait_for_response: bool = False
     action: str = ActionType.MESSAGE
     security_risk: ActionSecurityRisk | None = None
diff --git a/openhands/runtime/client/runtime.py b/openhands/runtime/client/runtime.py
index 195185823eff..d40264b434da 100644
--- a/openhands/runtime/client/runtime.py
+++ b/openhands/runtime/client/runtime.py
@@ -126,7 +126,13 @@ def init_base_runtime(
         attach_to_existing: bool = False,
     ):
         super().__init__(
-            config, event_stream, sid, plugins, env_vars, status_message_callback, attach_to_existing
+            config,
+            event_stream,
+            sid,
+            plugins,
+            env_vars,
+            status_message_callback,
+            attach_to_existing,
         )
 
     def __init__(
@@ -192,7 +198,13 @@ def __init__(
 
         # Will initialize both the event stream and the env vars
         self.init_base_runtime(
-            config, event_stream, sid, plugins, env_vars, status_message_callback, attach_to_existing
+            config,
+            event_stream,
+            sid,
+            plugins,
+            env_vars,
+            status_message_callback,
+            attach_to_existing,
         )
 
         logger.info('Waiting for client to become ready...')

From 3a813632045fd4e895bcc4137d97fd62664eba45 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 18:25:53 +0200
Subject: [PATCH 12/89] refactoring in evals

---
 evaluation/EDA/run_infer.py             | 4 ++--
 evaluation/agent_bench/run_infer.py     | 2 +-
 evaluation/bird/run_infer.py            | 2 +-
 evaluation/gaia/run_infer.py            | 2 +-
 evaluation/gorilla/run_infer.py         | 2 +-
 evaluation/gpqa/run_infer.py            | 2 +-
 evaluation/logic_reasoning/run_infer.py | 2 +-
 evaluation/toolqa/run_infer.py          | 2 +-
 openhands/controller/state/state.py     | 6 ++++++
 9 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
index 304981f5559c..1a33648177ab 100644
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -34,7 +34,7 @@ def codeact_user_response_eda(state: State) -> str:
 
     # retrieve the latest model message from history
     if state.history:
-        model_guess = state.history.get_last_agent_message()
+        model_guess = state.get_last_agent_message()
 
     assert game is not None, 'Game is not initialized.'
     msg = game.generate_user_response(model_guess)
@@ -138,7 +138,7 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')
 
-    final_message = state.history.get_last_agent_message()
+    final_message = state.get_last_agent_message()
 
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
     test_result = game.reward()
diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py
index ddca35f3921a..aed26064dc0e 100644
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -241,7 +241,7 @@ def process_instance(
         raw_ans = ''
 
         # retrieve the last agent message or thought
-        for event in state.history.get_events(reverse=True):
+        for event in reversed(state.history):
             if event.source == 'agent':
                 if isinstance(event, AgentFinishAction):
                     raw_ans = event.thought
diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py
index 74e4cd301c38..9bd44f3173d7 100644
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -46,7 +46,7 @@ def codeact_user_response(state: State) -> str:
         # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
         user_msgs = [
             event
-            for event in state.history.get_events()
+            for event in state.history
             if isinstance(event, MessageAction) and event.source == 'user'
         ]
         if len(user_msgs) > 2:
diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py
index 51519b88494e..09ad65c484e3 100644
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -165,7 +165,7 @@ def process_instance(
 
     model_answer_raw = ''
     # get the last message or thought from the agent
-    for event in state.history.get_events(reverse=True):
+    for event in reversed(state.history):
         if event.source == 'agent':
             if isinstance(event, AgentFinishAction):
                 model_answer_raw = event.thought
diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
index 93fb15d21992..5d5519672eca 100644
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -100,7 +100,7 @@ def process_instance(
         raise ValueError('State should not be None.')
 
     # retrieve the last message from the agent
-    model_answer_raw = state.history.get_last_agent_message()
+    model_answer_raw = state.get_last_agent_message()
 
     # attempt to parse model_answer
     ast_eval_fn = instance['ast_eval']
diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py
index 98b4e2a6a5cc..cf1249f47909 100644
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -244,7 +244,7 @@ def process_instance(
         'C': False,
         'D': False,
     }
-    for event in state.history.get_events(reverse=True):
+    for event in reversed(state.history):
         if (
             isinstance(event, AgentFinishAction)
             and event.source != 'user'
diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py
index f13c4674ff74..8f49619c3f6f 100644
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -224,7 +224,7 @@ def process_instance(
         raise ValueError('State should not be None.')
 
     final_message = ''
-    for event in state.history.get_events(reverse=True):
+    for event in reversed(state.history):
         if isinstance(event, AgentFinishAction):
             final_message = event.thought
             break
diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
index da36af2b5788..911ca1efd73f 100644
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -125,7 +125,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
         raise ValueError('State should not be None.')
 
     # retrieve the last message from the agent
-    model_answer_raw = state.history.get_last_agent_message()
+    model_answer_raw = state.get_last_agent_message()
 
     # attempt to parse model_answer
     correct = eval_answer(str(model_answer_raw), str(answer))
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index ad4a57f2c6d6..cfc247ec3c9d 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -152,3 +152,9 @@ def has_delegation(self) -> bool:
             if isinstance(event, AgentDelegateObservation):
                 return True
         return False
+
+    def get_last_agent_message(self) -> str | None:
+        for event in reversed(self.history):
+            if isinstance(event, MessageAction) and event.source == 'agent':
+                return event.content
+        return None

From 66f78d59d3964b1f934af43ee7ca601356592725 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 19:31:14 +0200
Subject: [PATCH 13/89] more adaptations in evals

---
 evaluation/miniwob/run_infer.py   | 2 +-
 evaluation/swe_bench/run_infer.py | 3 ++-
 evaluation/utils/shared.py        | 2 +-
 evaluation/webarena/run_infer.py  | 2 +-
 tests/unit/test_is_stuck.py       | 4 ++--
 5 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
index 8f21f826eb7f..b42f310c240e 100644
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -151,7 +151,7 @@ def process_instance(
 
     # Instruction is the first message from the USER
     instruction = ''
-    for event in state.history.get_events():
+    for event in state.history:
         if isinstance(event, MessageAction):
             instruction = event.content
             break
diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index 9e4f1c4165f2..41f23cc02556 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -407,7 +407,8 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')
 
-    histories = [event_to_dict(event) for event in state.history.get_events()]
+    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
+    histories = [event_to_dict(event) for event in state.history]
     metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index 72bbcae377bc..de9c553b3c41 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -124,7 +124,7 @@ def codeact_user_response(
         # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
         user_msgs = [
             event
-            for event in state.history.get_events()
+            for event in state.history
             if isinstance(event, MessageAction) and event.source == 'user'
         ]
         if len(user_msgs) >= 2:
diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py
index 878cedc77d5c..b4216da57763 100644
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -165,7 +165,7 @@ def process_instance(
 
     # Instruction is the first message from the USER
     instruction = ''
-    for event in state.history.get_events():
+    for event in state.history:
         if isinstance(event, MessageAction):
             instruction = event.content
             break
diff --git a/tests/unit/test_is_stuck.py b/tests/unit/test_is_stuck.py
index 4a1330752161..32b71befa2dd 100644
--- a/tests/unit/test_is_stuck.py
+++ b/tests/unit/test_is_stuck.py
@@ -170,7 +170,7 @@ def test_is_stuck_repeating_action_observation(
         # 10 events
 
         assert len(collect_events(event_stream)) == 10
-        assert len(list(stuck_detector.state.history.get_events())) == 8
+        assert len(list(stuck_detector.state.history)) == 8
         assert (
             len(
                 get_pairs_from_events(
@@ -195,7 +195,7 @@ def test_is_stuck_repeating_action_observation(
         # 12 events
 
         assert len(collect_events(event_stream)) == 12
-        assert len(list(stuck_detector.state.history.get_events())) == 10
+        assert len(list(stuck_detector.state.history)) == 10
         assert (
             len(
                 get_pairs_from_events(

From 1de7b2be842db799f95e07f7ef314889c3f6856e Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 19:52:20 +0200
Subject: [PATCH 14/89] rewrite history

---
 openhands/controller/agent_controller.py | 22 +++++++++++++++++++---
 openhands/core/main.py                   |  6 ++----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index dfd279dcbff1..81ffeb895b65 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -128,9 +128,25 @@ async def close(self):
         """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream."""
         await self.set_agent_state_to(AgentState.STOPPED)
 
-        # save trajectories if applicable
-
-        # make sure the history in state is complete (with delegates)
+        # we made history, now is the time to rewrite it!
+        # in the final state, history will need to be complete WITH delegates events
+        # like a regular agent history, it does not include 'hidden' events nor the default filtered out types (backend events)
+        # the final state.history will be used by external scripts like evals, tests, etc.
+        start_id = self.state.start_id if self.state.start_id != -1 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id != -1
+            else self.event_stream.get_latest_event_id()
+        )
+        self.state.history = list(
+            self.event_stream.get_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+                filter_out_type=self.filter_out,
+                filter_hidden=True,
+            )
+        )
 
         # unsubscribe from the event stream
         self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER)
diff --git a/openhands/core/main.py b/openhands/core/main.py
index 0ebcde8527fe..37f6de95f859 100644
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -199,6 +199,7 @@ async def on_event(event: Event):
     # save session when we're about to close
     if config.enable_cli_session:
         end_state = controller.get_state()
+        # NOTE: the saved state does not include delegates events
         end_state.save_to_session(event_stream.sid, event_stream.file_store)
 
     # close when done
@@ -209,10 +210,7 @@ async def on_event(event: Event):
     if config.trajectories_path is not None:
         file_path = os.path.join(config.trajectories_path, sid + '.json')
         os.makedirs(os.path.dirname(file_path), exist_ok=True)
-        histories = [
-            event_to_trajectory(event)
-            for event in state.history.get_events(include_delegates=True)
-        ]
+        histories = [event_to_trajectory(event) for event in state.history]
         with open(file_path, 'w') as f:
             json.dump(histories, f)
 

From adc960f62194a298b6a98d79c0838bff8b273be0 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 20:51:50 +0200
Subject: [PATCH 15/89] actually remove history

---
 evaluation/mint/run_infer.py    |  6 +++++-
 openhands/memory/__init__.py    |  3 +--
 tests/integration/test_agent.py | 14 ++++++++++++--
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py
index 0f113ef9d9d5..5337feb02f46 100644
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -29,6 +29,7 @@
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import (
+    Action,
     CmdRunAction,
     MessageAction,
 )
@@ -45,7 +46,10 @@ def codeact_user_response_mint(state: State, task: Task, task_config: dict[str,
         task=task,
         task_config=task_config,
     )
-    last_action = state.history.get_last_action()
+    last_action = next(
+        (event for event in reversed(state.history) if isinstance(event, Action)),
+        None,
+    )
     result_state: TaskState = env.step(last_action.message or '')
 
     state.extra_data['task_state'] = result_state
diff --git a/openhands/memory/__init__.py b/openhands/memory/__init__.py
index 0ce208cef581..12c499c768be 100644
--- a/openhands/memory/__init__.py
+++ b/openhands/memory/__init__.py
@@ -1,5 +1,4 @@
 from openhands.memory.condenser import MemoryCondenser
-from openhands.memory.history import ShortTermHistory
 from openhands.memory.memory import LongTermMemory
 
-__all__ = ['LongTermMemory', 'ShortTermHistory', 'MemoryCondenser']
+__all__ = ['LongTermMemory', 'MemoryCondenser']
diff --git a/tests/integration/test_agent.py b/tests/integration/test_agent.py
index 0df0ae0f236e..ff9f58cd24c3 100644
--- a/tests/integration/test_agent.py
+++ b/tests/integration/test_agent.py
@@ -9,7 +9,12 @@
 from openhands.core.config import load_app_config
 from openhands.core.main import run_controller
 from openhands.core.schema import AgentState
-from openhands.events.action import AgentFinishAction, AgentRejectAction, MessageAction
+from openhands.events.action import (
+    Action,
+    AgentFinishAction,
+    AgentRejectAction,
+    MessageAction,
+)
 from openhands.events.observation.browse import BrowserOutputObservation
 from openhands.events.observation.delegate import AgentDelegateObservation
 from openhands.runtime import get_runtime_cls
@@ -185,7 +190,12 @@ def test_simple_task_rejection(current_test_name: str):
         run_controller(CONFIG, MessageAction(content=task), exit_on_message=True)
     )
     validate_final_state(final_state, current_test_name)
-    assert isinstance(final_state.history.get_last_action(), AgentRejectAction)
+    # get last action
+    last_action = next(
+        (event for event in reversed(final_state.history) if isinstance(event, Action)),
+        None,
+    )
+    assert isinstance(last_action, AgentRejectAction)
 
 
 @pytest.mark.skipif(

From fac01d15c3bc5015c852b8b13941314ea24bc15e Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 21:51:53 +0200
Subject: [PATCH 16/89] adapt stuck

---
 tests/unit/test_is_stuck.py | 375 +++++++++++++++++-------------------
 1 file changed, 178 insertions(+), 197 deletions(-)

diff --git a/tests/unit/test_is_stuck.py b/tests/unit/test_is_stuck.py
index 32b71befa2dd..1f28e9800799 100644
--- a/tests/unit/test_is_stuck.py
+++ b/tests/unit/test_is_stuck.py
@@ -17,8 +17,6 @@
 from openhands.events.observation.empty import NullObservation
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.stream import EventSource, EventStream
-from openhands.events.utils import get_pairs_from_events
-from openhands.memory.history import ShortTermHistory
 from openhands.storage import get_file_store
 
 
@@ -55,22 +53,21 @@ def event_stream(temp_dir):
 
 class TestStuckDetector:
     @pytest.fixture
-    def stuck_detector(self, event_stream):
+    def stuck_detector(self):
         state = State(inputs={}, max_iterations=50)
-        state.history.set_event_stream(event_stream)
-
+        state.history = []  # Initialize history as an empty list
         return StuckDetector(state)
 
     def _impl_syntax_error_events(
         self,
-        event_stream: EventStream,
+        state: State,
         error_message: str,
         random_line: bool,
         incidents: int = 4,
     ):
         for i in range(incidents):
             ipython_action = IPythonRunCellAction(code=code_snippet)
-            event_stream.add_event(ipython_action, EventSource.AGENT)
+            state.history.append(ipython_action)
             extra_number = (i + 1) * 10 if random_line else '42'
             extra_line = '\n' * (i + 1) if random_line else ''
             ipython_observation = IPythonRunCellObservation(
@@ -79,15 +76,15 @@ def _impl_syntax_error_events(
                 f'{error_message}{extra_line}' + jupyter_line_1 + jupyter_line_2,
                 code=code_snippet,
             )
-            ipython_observation._cause = ipython_action._id
-            event_stream.add_event(ipython_observation, EventSource.USER)
+            # ipython_observation._cause = ipython_action._id
+            state.history.append(ipython_observation)
 
     def _impl_unterminated_string_error_events(
-        self, event_stream: EventStream, random_line: bool, incidents: int = 4
+        self, state: State, random_line: bool, incidents: int = 4
     ):
         for i in range(incidents):
             ipython_action = IPythonRunCellAction(code=code_snippet)
-            event_stream.add_event(ipython_action, EventSource.AGENT)
+            state.history.append(ipython_action)
             line_number = (i + 1) * 10 if random_line else '1'
             ipython_observation = IPythonRunCellObservation(
                 content=f'print("  Cell In[1], line {line_number}\nhello\n       ^\nSyntaxError: unterminated string literal (detected at line {line_number})'
@@ -95,34 +92,30 @@ def _impl_unterminated_string_error_events(
                 + jupyter_line_2,
                 code=code_snippet,
             )
-            ipython_observation._cause = ipython_action._id
-            event_stream.add_event(ipython_observation, EventSource.USER)
+            # ipython_observation._cause = ipython_action._
+            state.history.append(ipython_observation)
 
-    def test_history_too_short(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_history_too_short(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         message_action = MessageAction(content='Hello', wait_for_response=False)
         message_action._source = EventSource.USER
         observation = NullObservation(content='')
-        observation._cause = message_action.id
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(observation, EventSource.USER)
+        # observation._cause = message_action.id
+        state.history.append(message_action)
+        state.history.append(observation)
 
         cmd_action = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action, EventSource.AGENT)
+        state.history.append(cmd_action)
         cmd_observation = CmdOutputObservation(
             command_id=1, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation._cause = cmd_action._id
-        event_stream.add_event(cmd_observation, EventSource.USER)
-
-        # stuck_detector.state.history.set_event_stream(event_stream)
+        # cmd_observation._cause = cmd_action._id
+        state.history.append(cmd_observation)
 
         assert stuck_detector.is_stuck() is False
 
-    def test_is_stuck_repeating_action_observation(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         message_action = MessageAction(content='Done', wait_for_response=False)
         message_action._source = EventSource.USER
 
@@ -130,135 +123,125 @@ def test_is_stuck_repeating_action_observation(
         hello_observation = NullObservation('')
 
         # 2 events
-        event_stream.add_event(hello_action, EventSource.USER)
-        event_stream.add_event(hello_observation, EventSource.USER)
+        state.history.append(hello_action)
+        state.history.append(hello_observation)
 
         cmd_action_1 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
-        cmd_observation_1 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_1._id
-        )
+        cmd_action_1._id = 1
+        state.history.append(cmd_action_1)
+        cmd_observation_1 = CmdOutputObservation(content='', command='ls', command_id=1)
         cmd_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        state.history.append(cmd_observation_1)
         # 4 events
 
         cmd_action_2 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
-        cmd_observation_2 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_2._id
-        )
+        cmd_action_2._id = 2
+        state.history.append(cmd_action_2)
+        cmd_observation_2 = CmdOutputObservation(content='', command='ls', command_id=2)
         cmd_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        state.history.append(cmd_observation_2)
         # 6 events
 
         # random user message just because we can
         message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
         # 8 events
 
         assert stuck_detector.is_stuck() is False
         assert stuck_detector.state.almost_stuck == 2
 
         cmd_action_3 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
-        cmd_observation_3 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_3._id
-        )
+        cmd_action_3._id = 3
+        state.history.append(cmd_action_3)
+        cmd_observation_3 = CmdOutputObservation(content='', command='ls', command_id=3)
         cmd_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        state.history.append(cmd_observation_3)
         # 10 events
 
-        assert len(collect_events(event_stream)) == 10
-        assert len(list(stuck_detector.state.history)) == 8
+        assert len(state.history) == 10
         assert (
-            len(
-                get_pairs_from_events(
-                    stuck_detector.state.history.get_events_as_list(
-                        include_delegates=True
-                    )
-                )
-            )
-            == 5
-        )
+            len(state.history) == 10
+        )  # Adjusted since history is a list and the controller is not running
+
+        # FIXME are we still testing this without this test?
+        # assert (
+        #    len(
+        #        get_pairs_from_events(state.history)
+        #    )
+        #    == 5
+        # )
 
         assert stuck_detector.is_stuck() is False
         assert stuck_detector.state.almost_stuck == 1
 
         cmd_action_4 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_4, EventSource.AGENT)
-        cmd_observation_4 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_4._id
-        )
+        cmd_action_4._id = 4
+        state.history.append(cmd_action_4)
+        cmd_observation_4 = CmdOutputObservation(content='', command='ls', command_id=4)
         cmd_observation_4._cause = cmd_action_4._id
-        event_stream.add_event(cmd_observation_4, EventSource.USER)
+        state.history.append(cmd_observation_4)
         # 12 events
 
-        assert len(collect_events(event_stream)) == 12
-        assert len(list(stuck_detector.state.history)) == 10
-        assert (
-            len(
-                get_pairs_from_events(
-                    stuck_detector.state.history.get_events_as_list(
-                        include_delegates=True
-                    )
-                )
-            )
-            == 6
-        )
+        assert len(state.history) == 12
+        # assert (
+        #    len(
+        #        get_pairs_from_events(state.history)
+        #    )
+        #    == 6
+        # )
 
         with patch('logging.Logger.warning') as mock_warning:
             assert stuck_detector.is_stuck() is True
             assert stuck_detector.state.almost_stuck == 0
             mock_warning.assert_called_once_with('Action, Observation loop detected')
 
-    def test_is_stuck_repeating_action_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_repeating_action_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         # (action, error_observation), not necessarily the same error
         message_action = MessageAction(content='Done', wait_for_response=False)
         message_action._source = EventSource.USER
 
         hello_action = MessageAction(content='Hello', wait_for_response=False)
         hello_observation = NullObservation(content='')
-        event_stream.add_event(hello_action, EventSource.USER)
-        hello_observation._cause = hello_action._id
-        event_stream.add_event(hello_observation, EventSource.USER)
+        state.history.append(hello_action)
+        # hello_observation._cause = hello_action._id
+        state.history.append(hello_observation)
         # 2 events
 
         cmd_action_1 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        state.history.append(cmd_action_1)
         error_observation_1 = ErrorObservation(content='Command not found')
-        error_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(error_observation_1, EventSource.USER)
+        # error_observation_1._cause = cmd_action_1._id
+        state.history.append(error_observation_1)
         # 4 events
 
         cmd_action_2 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        state.history.append(cmd_action_2)
         error_observation_2 = ErrorObservation(
             content='Command still not found or another error'
         )
-        error_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(error_observation_2, EventSource.USER)
+        # error_observation_2._cause = cmd_action_2._id
+        state.history.append(error_observation_2)
         # 6 events
 
         message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
         # 8 events
 
         cmd_action_3 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        state.history.append(cmd_action_3)
         error_observation_3 = ErrorObservation(content='Different error')
-        error_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(error_observation_3, EventSource.USER)
+        # error_observation_3._cause = cmd_action_3._id
+        state.history.append(error_observation_3)
         # 10 events
 
         cmd_action_4 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_4, EventSource.AGENT)
+        state.history.append(cmd_action_4)
         error_observation_4 = ErrorObservation(content='Command not found')
-        error_observation_4._cause = cmd_action_4._id
-        event_stream.add_event(error_observation_4, EventSource.USER)
+        # error_observation_4._cause = cmd_action_4._id
+        state.history.append(error_observation_4)
         # 12 events
 
         with patch('logging.Logger.warning') as mock_warning:
@@ -267,11 +250,10 @@ def test_is_stuck_repeating_action_error(
                 'Action, ErrorObservation loop detected'
             )
 
-    def test_is_stuck_invalid_syntax_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_invalid_syntax_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: invalid syntax. Perhaps you forgot a comma?',
             random_line=False,
         )
@@ -280,10 +262,11 @@ def test_is_stuck_invalid_syntax_error(
             assert stuck_detector.is_stuck() is True
 
     def test_is_not_stuck_invalid_syntax_error_random_lines(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: invalid syntax. Perhaps you forgot a comma?',
             random_line=True,
         )
@@ -292,10 +275,11 @@ def test_is_not_stuck_invalid_syntax_error_random_lines(
             assert stuck_detector.is_stuck() is False
 
     def test_is_not_stuck_invalid_syntax_error_only_three_incidents(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: invalid syntax. Perhaps you forgot a comma?',
             random_line=True,
             incidents=3,
@@ -304,11 +288,10 @@ def test_is_not_stuck_invalid_syntax_error_only_three_incidents(
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is False
 
-    def test_is_stuck_incomplete_input_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_incomplete_input_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: incomplete input',
             random_line=False,
         )
@@ -316,11 +299,10 @@ def test_is_stuck_incomplete_input_error(
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is True
 
-    def test_is_not_stuck_incomplete_input_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_not_stuck_incomplete_input_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: incomplete input',
             random_line=True,
         )
@@ -329,238 +311,239 @@ def test_is_not_stuck_incomplete_input_error(
             assert stuck_detector.is_stuck() is False
 
     def test_is_not_stuck_ipython_unterminated_string_error_random_lines(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
-        self._impl_unterminated_string_error_events(event_stream, random_line=True)
+        state = stuck_detector.state
+        self._impl_unterminated_string_error_events(state, random_line=True)
 
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is False
 
     def test_is_not_stuck_ipython_unterminated_string_error_only_three_incidents(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         self._impl_unterminated_string_error_events(
-            event_stream, random_line=False, incidents=3
+            state, random_line=False, incidents=3
         )
 
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is False
 
     def test_is_stuck_ipython_unterminated_string_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
-        self._impl_unterminated_string_error_events(event_stream, random_line=False)
+        state = stuck_detector.state
+        self._impl_unterminated_string_error_events(state, random_line=False)
 
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is True
 
     def test_is_not_stuck_ipython_syntax_error_not_at_end(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         # this test is to make sure we don't get false positives
         # since the "at line x" is changing in between!
         ipython_action_1 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_1, EventSource.AGENT)
+        state.history.append(ipython_action_1)
         ipython_observation_1 = IPythonRunCellObservation(
             content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)\nThis is some additional output',
             code='print("hello',
         )
-        ipython_observation_1._cause = ipython_action_1._id
-        event_stream.add_event(ipython_observation_1, EventSource.USER)
+        # ipython_observation_1._cause = ipython_action_1._id
+        state.history.append(ipython_observation_1)
 
         ipython_action_2 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_2, EventSource.AGENT)
+        state.history.append(ipython_action_2)
         ipython_observation_2 = IPythonRunCellObservation(
             content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)\nToo much output here on and on',
             code='print("hello',
         )
-        ipython_observation_2._cause = ipython_action_2._id
-        event_stream.add_event(ipython_observation_2, EventSource.USER)
+        # ipython_observation_2._cause = ipython_action_2._id
+        state.history.append(ipython_observation_2)
 
         ipython_action_3 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_3, EventSource.AGENT)
+        state.history.append(ipython_action_3)
         ipython_observation_3 = IPythonRunCellObservation(
             content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 3)\nEnough',
             code='print("hello',
         )
-        ipython_observation_3._cause = ipython_action_3._id
-        event_stream.add_event(ipython_observation_3, EventSource.USER)
+        # ipython_observation_3._cause = ipython_action_3._id
+        state.history.append(ipython_observation_3)
 
         ipython_action_4 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_4, EventSource.AGENT)
+        state.history.append(ipython_action_4)
         ipython_observation_4 = IPythonRunCellObservation(
             content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 2)\nLast line of output',
             code='print("hello',
         )
-        ipython_observation_4._cause = ipython_action_4._id
-        event_stream.add_event(ipython_observation_4, EventSource.USER)
+        # ipython_observation_4._cause = ipython_action_4._id
+        state.history.append(ipython_observation_4)
 
         with patch('logging.Logger.warning') as mock_warning:
             assert stuck_detector.is_stuck() is False
             mock_warning.assert_not_called()
 
     def test_is_stuck_repeating_action_observation_pattern(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         message_action = MessageAction(content='Come on', wait_for_response=False)
         message_action._source = EventSource.USER
-        event_stream.add_event(message_action, EventSource.USER)
+        state.history.append(message_action)
         message_observation = NullObservation(content='')
-        event_stream.add_event(message_observation, EventSource.USER)
+        state.history.append(message_observation)
 
         cmd_action_1 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        state.history.append(cmd_action_1)
         cmd_observation_1 = CmdOutputObservation(
             command_id=1, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        # cmd_observation_1._cause = cmd_action_1._id
+        state.history.append(cmd_observation_1)
 
         read_action_1 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_1, EventSource.AGENT)
+        state.history.append(read_action_1)
         read_observation_1 = FileReadObservation(
             content='File content', path='file1.txt'
         )
-        read_observation_1._cause = read_action_1._id
-        event_stream.add_event(read_observation_1, EventSource.USER)
+        # read_observation_1._cause = read_action_1._id
+        state.history.append(read_observation_1)
 
         cmd_action_2 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        state.history.append(cmd_action_2)
         cmd_observation_2 = CmdOutputObservation(
             command_id=2, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        # cmd_observation_2._cause = cmd_action_2._id
+        state.history.append(cmd_observation_2)
 
         read_action_2 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_2, EventSource.AGENT)
+        state.history.append(read_action_2)
         read_observation_2 = FileReadObservation(
             content='File content', path='file1.txt'
         )
-        read_observation_2._cause = read_action_2._id
-        event_stream.add_event(read_observation_2, EventSource.USER)
+        # read_observation_2._cause = read_action_2._id
+        state.history.append(read_observation_2)
 
         # one more message to break the pattern
         message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
 
         cmd_action_3 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        state.history.append(cmd_action_3)
         cmd_observation_3 = CmdOutputObservation(
             command_id=3, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        # cmd_observation_3._cause = cmd_action_3._id
+        state.history.append(cmd_observation_3)
 
         read_action_3 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_3, EventSource.AGENT)
+        state.history.append(read_action_3)
         read_observation_3 = FileReadObservation(
             content='File content', path='file1.txt'
         )
-        read_observation_3._cause = read_action_3._id
-        event_stream.add_event(read_observation_3, EventSource.USER)
+        # read_observation_3._cause = read_action_3._id
+        state.history.append(read_observation_3)
 
         with patch('logging.Logger.warning') as mock_warning:
             assert stuck_detector.is_stuck() is True
             mock_warning.assert_called_once_with('Action, Observation pattern detected')
 
-    def test_is_stuck_not_stuck(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_not_stuck(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         message_action = MessageAction(content='Done', wait_for_response=False)
         message_action._source = EventSource.USER
 
         hello_action = MessageAction(content='Hello', wait_for_response=False)
-        event_stream.add_event(hello_action, EventSource.USER)
+        state.history.append(hello_action)
         hello_observation = NullObservation(content='')
-        hello_observation._cause = hello_action._id
-        event_stream.add_event(hello_observation, EventSource.USER)
+        # hello_observation._cause = hello_action._id
+        state.history.append(hello_observation)
 
         cmd_action_1 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        state.history.append(cmd_action_1)
         cmd_observation_1 = CmdOutputObservation(
             command_id=cmd_action_1.id, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        # cmd_observation_1._cause = cmd_action_1._id
+        state.history.append(cmd_observation_1)
 
         read_action_1 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_1, EventSource.AGENT)
+        state.history.append(read_action_1)
         read_observation_1 = FileReadObservation(
             content='File content', path='file1.txt'
         )
-        read_observation_1._cause = read_action_1._id
-        event_stream.add_event(read_observation_1, EventSource.USER)
+        # read_observation_1._cause = read_action_1._id
+        state.history.append(read_observation_1)
 
         cmd_action_2 = CmdRunAction(command='pwd')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        state.history.append(cmd_action_2)
         cmd_observation_2 = CmdOutputObservation(
             command_id=2, command='pwd', content='/home/user'
         )
-        cmd_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        # cmd_observation_2._cause = cmd_action_2._id
+        state.history.append(cmd_observation_2)
 
         read_action_2 = FileReadAction(path='file2.txt')
-        event_stream.add_event(read_action_2, EventSource.AGENT)
+        state.history.append(read_action_2)
         read_observation_2 = FileReadObservation(
             content='Another file content', path='file2.txt'
         )
-        read_observation_2._cause = read_action_2._id
-        event_stream.add_event(read_observation_2, EventSource.USER)
+        # read_observation_2._cause = read_action_2._id
+        state.history.append(read_observation_2)
 
         message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
 
         cmd_action_3 = CmdRunAction(command='pwd')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        state.history.append(cmd_action_3)
         cmd_observation_3 = CmdOutputObservation(
             command_id=cmd_action_3.id, command='pwd', content='/home/user'
         )
-        cmd_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        # cmd_observation_3._cause = cmd_action_3._id
+        state.history.append(cmd_observation_3)
 
         read_action_3 = FileReadAction(path='file2.txt')
-        event_stream.add_event(read_action_3, EventSource.AGENT)
+        state.history.append(read_action_3)
         read_observation_3 = FileReadObservation(
             content='Another file content', path='file2.txt'
         )
-        read_observation_3._cause = read_action_3._id
-        event_stream.add_event(read_observation_3, EventSource.USER)
+        # read_observation_3._cause = read_action_3._id
+        state.history.append(read_observation_3)
 
         assert stuck_detector.is_stuck() is False
 
-    def test_is_stuck_monologue(self, stuck_detector, event_stream):
-        # Add events to the event stream
+    def test_is_stuck_monologue(self, stuck_detector):
+        state = stuck_detector.state
+        # Add events to the history list directly
         message_action_1 = MessageAction(content='Hi there!')
-        event_stream.add_event(message_action_1, EventSource.USER)
         message_action_1._source = EventSource.USER
-
+        state.history.append(message_action_1)
         message_action_2 = MessageAction(content='Hi there!')
-        event_stream.add_event(message_action_2, EventSource.AGENT)
         message_action_2._source = EventSource.AGENT
-
+        state.history.append(message_action_2)
         message_action_3 = MessageAction(content='How are you?')
-        event_stream.add_event(message_action_3, EventSource.USER)
         message_action_3._source = EventSource.USER
+        state.history.append(message_action_3)
 
         cmd_kill_action = CmdRunAction(
             command='echo 42', thought="I'm not stuck, he's stuck"
         )
-        event_stream.add_event(cmd_kill_action, EventSource.AGENT)
+        state.history.append(cmd_kill_action)
 
         message_action_4 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_4, EventSource.AGENT)
         message_action_4._source = EventSource.AGENT
-
+        state.history.append(message_action_4)
         message_action_5 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_5, EventSource.AGENT)
         message_action_5._source = EventSource.AGENT
-
+        state.history.append(message_action_5)
         message_action_6 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_6, EventSource.AGENT)
         message_action_6._source = EventSource.AGENT
+        state.history.append(message_action_6)
 
         assert stuck_detector.is_stuck()
 
@@ -571,16 +554,15 @@ def test_is_stuck_monologue(self, stuck_detector, event_stream):
             command='storybook',
             exit_code=0,
         )
-        cmd_output_observation._cause = cmd_kill_action._id
-        event_stream.add_event(cmd_output_observation, EventSource.USER)
+        # cmd_output_observation._cause = cmd_kill_action._id
+        state.history.append(cmd_output_observation)
 
         message_action_7 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_7, EventSource.AGENT)
         message_action_7._source = EventSource.AGENT
-
+        state.history.append(message_action_7)
         message_action_8 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_8, EventSource.AGENT)
         message_action_8._source = EventSource.AGENT
+        state.history.append(message_action_8)
 
         with patch('logging.Logger.warning'):
             assert not stuck_detector.is_stuck()
@@ -595,7 +577,6 @@ def controller(self):
         )
         controller.delegate = None
         controller.state = Mock()
-        controller.state.history = ShortTermHistory()
         return controller
 
     def test_is_stuck_delegate_stuck(self, controller: AgentController):

From 5eb3322a333b8167e33135cd7837df4ba503f249 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 22:11:40 +0200
Subject: [PATCH 17/89] more adaptations

---
 tests/unit/test_micro_agents.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/tests/unit/test_micro_agents.py b/tests/unit/test_micro_agents.py
index 70553d851125..8cff14fdd4f2 100644
--- a/tests/unit/test_micro_agents.py
+++ b/tests/unit/test_micro_agents.py
@@ -10,10 +10,8 @@
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
-from openhands.events import EventSource
 from openhands.events.action import MessageAction
 from openhands.events.stream import EventStream
-from openhands.memory.history import ShortTermHistory
 from openhands.storage import get_file_store
 
 
@@ -74,10 +72,10 @@ def test_coder_agent_with_summary(event_stream: EventStream, agent_configs: dict
     )
     assert coder_agent is not None
 
+    # give it some history
     task = 'This is a dummy task'
-    history = ShortTermHistory()
-    history.set_event_stream(event_stream)
-    event_stream.add_event(MessageAction(content=task), EventSource.USER)
+    history = list()
+    history.append(MessageAction(content=task))
 
     summary = 'This is a dummy summary about this repo'
     state = State(history=history, inputs={'summary': summary})
@@ -119,10 +117,10 @@ def test_coder_agent_without_summary(event_stream: EventStream, agent_configs: d
     )
     assert coder_agent is not None
 
+    # give it some history
     task = 'This is a dummy task'
-    history = ShortTermHistory()
-    history.set_event_stream(event_stream)
-    event_stream.add_event(MessageAction(content=task), EventSource.USER)
+    history = list()
+    history.append(MessageAction(content=task))
 
     # set state without codebase summary
     state = State(history=history)

From 21ede6d9e903834aa520a720d09f8e64161ea66c Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 17 Oct 2024 18:46:30 +0200
Subject: [PATCH 18/89] fix delegate exclusion

---
 openhands/controller/agent_controller.py | 56 +++++++++++++----------
 tests/unit/test_prompt_caching.py        | 58 +++++++++++-------------
 2 files changed, 59 insertions(+), 55 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 81ffeb895b65..e1aa2063c280 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -155,14 +155,19 @@ def update_state_before_step(self):
         self.state.iteration += 1
         self.state.local_iteration += 1
 
-        # get the history from the event stream, filtering out event types
-        # that should not be sent to the agent, and hidden events
+        # get the history from the event stream
+        # first define the range of events to fetch
         start_id = self.state.start_id if self.state.start_id != -1 else 0
         end_id = (
             self.state.end_id
             if self.state.end_id != -1
             else self.event_stream.get_latest_event_id()
         )
+
+        # fetch events directly from the event stream
+        # filtering out what an agent history should not include:
+        # - "backend" event types that should not be sent to the agent
+        # - hidden events
         self.state.history = list(
             self.event_stream.get_events(
                 start_id=start_id,
@@ -173,28 +178,32 @@ def update_state_before_step(self):
             )
         )
 
-        # do not include events between delegate actions and observations:
-        # include in history the delegate action and observation themselves
+        # also, we exclude finished delegates from the parent agent's history:
+        # - do not include events between delegate actions and observations
+        # - include the delegate action and observation themselves
         if self.state.delegates:
-            for (start_id, end_id), (
+            for (delegate_start_id, delegate_end_id), (
                 delegate_agent,
                 delegate_task,
             ) in self.state.delegates.items():
                 # sanity checks
                 if (
-                    start_id < 0
-                    or end_id < 1
-                    or start_id >= end_id
-                    or end_id >= len(self.state.history)
+                    delegate_start_id < 0
+                    or delegate_end_id < 1
+                    or delegate_start_id >= delegate_end_id
+                    or delegate_end_id >= len(self.state.history)
                 ):
                     logger.error(
-                        f'Invalid delegate ids: {start_id}, {end_id}. Skipping...'
+                        f'Invalid delegate ids: {delegate_start_id}, {delegate_end_id}. Skipping...'
                     )
                     continue
-                self.state.history = (
-                    self.state.history[: start_id + 1]
-                    + self.state.history[end_id - 1 :]
-                )
+
+                # exclude delegate events from history
+                self.state.history = [
+                    event
+                    for event in self.state.history
+                    if not (delegate_start_id < event.id < delegate_end_id)
+                ]
 
     async def update_state_after_step(self):
         # update metrics especially for cost. Use deepcopy to avoid it being modified by agent.reset()
@@ -208,8 +217,9 @@ async def report_error(self, message: str, exception: Exception | None = None):
         """Reports an error to the user and sends the exception to the LLM next step, in the hope it can self-correct.
 
         This method should be called for a particular type of errors, which have:
-        - a user-friendly message, which will be shown in the chat box. This should not be a raw exception message.
-        - an ErrorObservation that can be sent to the LLM by the user role, with the exception message, so it can self-correct next time.
+        - message: a user-friendly message, which will be shown in the chat box. This should not be a raw exception message.
+        - an ErrorObservation that can be sent to the LLM, with the exception message, so it can self-correct next time.
+        - exception: the underlying exception, which is used by evals and tests to check what error the agent encountered.
         """
         self.state.last_error = message
         if exception:
@@ -346,19 +356,20 @@ def _handle_delegate_observation(self, observation: Observation):
 
         logger.debug('AgentDelegateObservation received')
 
-        # figure out what this delegate's actions were
-        # from the last AgentDelegateAction to this AgentDelegateObservation
-        # and save their ids as start and end ids
-        # in order to use later to exclude them from parent stream or summarize them instead
+        # define the end_id based on the current observation
         delegate_end = observation.id
         if delegate_end <= 0:
             logger.error(
                 f'The id of the AgentDelegateObservation is not valid: {delegate_end}'
             )
             return
+
+        # define the start_id by searching for the corresponding AgentDelegateAction
         delegate_start = -1
         delegate_agent: str = ''
         delegate_task: str = ''
+
+        # search through events in reverse to find the AgentDelegateAction
         for prev_event in self.event_stream.get_events(
             end_id=observation.id - 1, reverse=True
         ):
@@ -375,6 +386,7 @@ def _handle_delegate_observation(self, observation: Observation):
             )
             return
 
+        # add the event ids to the delegates dictionary
         self.state.delegates[(delegate_start, delegate_end)] = (
             delegate_agent,
             delegate_task,
@@ -579,9 +591,7 @@ async def _step(self) -> None:
 
     async def _delegate_step(self):
         """Executes a single step of the delegate agent."""
-        logger.debug(f'[Agent Controller {self.id}] Delegate not none, awaiting...')
         await self.delegate._step()  # type: ignore[union-attr]
-        logger.debug(f'[Agent Controller {self.id}] Delegate step done')
         assert self.delegate is not None
         delegate_state = self.delegate.get_agent_state()
         logger.debug(f'[Agent Controller {self.id}] Delegate state: {delegate_state}')
@@ -594,7 +604,7 @@ async def _delegate_step(self):
             self.delegate = None
             self.delegateAction = None
 
-            await self.report_error('Delegator agent encountered an error')
+            await self.report_error('Delegate agent encountered an error')
         elif delegate_state in (AgentState.FINISHED, AgentState.REJECTED):
             logger.info(
                 f'[Agent Controller {self.id}] Delegate agent has finished execution'
diff --git a/tests/unit/test_prompt_caching.py b/tests/unit/test_prompt_caching.py
index 41dc75618746..a3728ff9ea54 100644
--- a/tests/unit/test_prompt_caching.py
+++ b/tests/unit/test_prompt_caching.py
@@ -1,10 +1,10 @@
-from unittest.mock import MagicMock, Mock, patch
+from unittest.mock import Mock, patch
 
 import pytest
 
 from openhands.agenthub.codeact_agent.codeact_agent import CodeActAgent
 from openhands.core.config import AgentConfig, LLMConfig
-from openhands.events import EventSource, EventStream
+from openhands.events import EventStream
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.llm.llm import LLM
@@ -32,16 +32,17 @@ def codeact_agent(mock_llm):
 
 
 def test_get_messages_with_reminder(codeact_agent, mock_event_stream):
-    # Add some events to the stream
-    mock_event_stream.add_event(MessageAction('Initial user message'), EventSource.USER)
-    mock_event_stream.add_event(MessageAction('Sure!'), EventSource.AGENT)
-    mock_event_stream.add_event(MessageAction('Hello, agent!'), EventSource.USER)
-    mock_event_stream.add_event(MessageAction('Hello, user!'), EventSource.AGENT)
-    mock_event_stream.add_event(MessageAction('Laaaaaaaast!'), EventSource.USER)
+    # Add some events to the history list
+    history = list()
+    history.append(MessageAction('Initial user message'))
+    history.append(MessageAction('Sure!'))
+    history.append(MessageAction('Hello, agent!'))
+    history.append(MessageAction('Hello, user!'))
+    history.append(MessageAction('Laaaaaaaast!'))
 
     codeact_agent.reset()
     messages = codeact_agent._get_messages(
-        Mock(history=mock_event_stream, max_iterations=5, iteration=0)
+        Mock(history=history, max_iterations=5, iteration=0)
     )
 
     assert (
@@ -72,18 +73,15 @@ def test_get_messages_with_reminder(codeact_agent, mock_event_stream):
 
 
 def test_get_messages_prompt_caching(codeact_agent, mock_event_stream):
+    history = list()
     # Add multiple user and agent messages
     for i in range(15):
-        mock_event_stream.add_event(
-            MessageAction(f'User message {i}'), EventSource.USER
-        )
-        mock_event_stream.add_event(
-            MessageAction(f'Agent message {i}'), EventSource.AGENT
-        )
+        history.append(MessageAction(f'User message {i}'))
+        history.append(MessageAction(f'Agent message {i}'))
 
     codeact_agent.reset()
     messages = codeact_agent._get_messages(
-        Mock(history=mock_event_stream, max_iterations=10, iteration=5)
+        Mock(history=history, max_iterations=10, iteration=5)
     )
 
     # Check that only the last two user messages have cache_prompt=True
@@ -105,14 +103,12 @@ def test_get_messages_prompt_caching(codeact_agent, mock_event_stream):
 
 
 def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
+    history = list()
     # Add a mix of actions and observations
-    message_action_1 = MessageAction(
-        "Let's list the contents of the current directory."
-    )
-    mock_event_stream.add_event(message_action_1, EventSource.USER)
+    history.append(MessageAction("Let's list the contents of the current directory."))
 
     cmd_action_1 = CmdRunAction('ls -l', thought='List files in current directory')
-    mock_event_stream.add_event(cmd_action_1, EventSource.AGENT)
+    history.append(cmd_action_1)
 
     cmd_observation_1 = CmdOutputObservation(
         content='total 0\n-rw-r--r-- 1 user group 0 Jan 1 00:00 file1.txt\n-rw-r--r-- 1 user group 0 Jan 1 00:00 file2.txt',
@@ -120,13 +116,13 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
         command='ls -l',
         exit_code=0,
     )
-    mock_event_stream.add_event(cmd_observation_1, EventSource.USER)
+    history.append(cmd_observation_1)
 
     message_action_2 = MessageAction("Now, let's create a new directory.")
-    mock_event_stream.add_event(message_action_2, EventSource.AGENT)
+    history.append(message_action_2)
 
     cmd_action_2 = CmdRunAction('mkdir new_directory', thought='Create a new directory')
-    mock_event_stream.add_event(cmd_action_2, EventSource.AGENT)
+    history.append(cmd_action_2)
 
     cmd_observation_2 = CmdOutputObservation(
         content='',
@@ -134,11 +130,11 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
         command='mkdir new_directory',
         exit_code=0,
     )
-    mock_event_stream.add_event(cmd_observation_2, EventSource.USER)
+    history.append(cmd_observation_2)
 
     codeact_agent.reset()
     messages = codeact_agent._get_messages(
-        Mock(history=mock_event_stream, max_iterations=5, iteration=0)
+        Mock(history=history, max_iterations=5, iteration=0)
     )
 
     # Assert the presence of key elements in the messages
@@ -181,15 +177,13 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
 
 
 def test_prompt_caching_headers(codeact_agent, mock_event_stream):
+    history = list()
     # Setup
-    mock_event_stream.add_event(MessageAction('Hello, agent!'), EventSource.USER)
-    mock_event_stream.add_event(MessageAction('Hello, user!'), EventSource.AGENT)
-
-    mock_short_term_history = MagicMock()
-    mock_short_term_history.get_last_user_message.return_value = 'Hello, agent!'
+    history.append(MessageAction('Hello, agent!'))
+    history.append(MessageAction('Hello, user!'))
 
     mock_state = Mock()
-    mock_state.history = mock_short_term_history
+    mock_state.history = history
     mock_state.max_iterations = 5
     mock_state.iteration = 0
 

From 5f19a7cbb42d876b9a3ce5db99a0cde9225bbe60 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 17 Oct 2024 20:02:41 +0200
Subject: [PATCH 19/89] create a delegate obs when the delegate ends with an
 error

---
 openhands/controller/agent_controller.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index e1aa2063c280..c2f5ac622e41 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -129,9 +129,9 @@ async def close(self):
         await self.set_agent_state_to(AgentState.STOPPED)
 
         # we made history, now is the time to rewrite it!
-        # in the final state, history will need to be complete WITH delegates events
-        # like a regular agent history, it does not include 'hidden' events nor the default filtered out types (backend events)
         # the final state.history will be used by external scripts like evals, tests, etc.
+        # history will need to be complete WITH delegates events
+        # like the regular agent history, it does not include 'hidden' events nor the default filtered out types (backend events)
         start_id = self.state.start_id if self.state.start_id != -1 else 0
         end_id = (
             self.state.end_id
@@ -599,12 +599,21 @@ async def _delegate_step(self):
             # update iteration that shall be shared across agents
             self.state.iteration = self.delegate.state.iteration
 
+            # emit AgentDelegateObservation to mark delegate termination due to error
+            delegate_outputs = (
+                self.delegate.state.outputs if self.delegate.state else {}
+            )
+            content = (
+                f'{self.delegate.agent.name} encountered an error during execution.'
+            )
+            obs = AgentDelegateObservation(outputs=delegate_outputs, content=content)
+            self.event_stream.add_event(obs, EventSource.AGENT)
+
             # close the delegate upon error
             await self.delegate.close()
             self.delegate = None
             self.delegateAction = None
 
-            await self.report_error('Delegate agent encountered an error')
         elif delegate_state in (AgentState.FINISHED, AgentState.REJECTED):
             logger.info(
                 f'[Agent Controller {self.id}] Delegate agent has finished execution'
@@ -626,9 +635,7 @@ async def _delegate_step(self):
             content = (
                 f'{self.delegate.agent.name} finishes task with {formatted_output}'
             )
-            obs: Observation = AgentDelegateObservation(
-                outputs=outputs, content=content
-            )
+            obs = AgentDelegateObservation(outputs=outputs, content=content)
 
             # clean up delegate status
             self.delegate = None

From 696f5d1e92865f0cca62162e8de7ce6eaf5bf5ec Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 17 Oct 2024 21:08:55 +0200
Subject: [PATCH 20/89] fix merge

---
 evaluation/integration_tests/run_infer.py |  2 +-
 evaluation/utils/shared.py                | 33 +++++++++++------------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 2c918b2ed8e0..64265ce23b78 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -111,7 +111,7 @@ def process_instance(
     # # result evaluation
     # # =============================================
 
-    histories = state.history.get_events()
+    histories = state.history
     test_result: TestResult = test_class.verify_result(runtime, histories)
     metrics = state.metrics.get() if state.metrics else None
 
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index de9c553b3c41..b026dd1cd79f 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -18,9 +18,9 @@
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import Action
 from openhands.events.action.message import MessageAction
+from openhands.events.event import Event
 from openhands.events.serialization.event import event_to_dict
-from openhands.memory.history import ShortTermHistory
-from openhands.utils import get_pairs_from_events
+from openhands.events.utils import get_pairs_from_events
 
 
 class EvalMetadata(BaseModel):
@@ -416,18 +416,17 @@ def reset_logger_for_multiprocessing(
     file_handler.setLevel(logging.INFO)
     logger.addHandler(file_handler)
 
-    # history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
-    # we rebuild the pairs here
-    # for compatibility with the existing output format in evaluations
-    # remove this when it's no longer necessary
-    def compatibility_for_eval_history_pairs(
-        history: ShortTermHistory,
-    ) -> list[tuple[dict, dict]]:
-        history_pairs = []
-
-        for action, observation in get_pairs_from_events(
-            history.get_events(include_delegates=True)
-        ):
-            history_pairs.append((event_to_dict(action), event_to_dict(observation)))
-
-        return history_pairs
+
+# history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
+# we rebuild the pairs here
+# for compatibility with the existing output format in evaluations
+# remove this when it's no longer necessary
+def compatibility_for_eval_history_pairs(
+    history: list[Event],
+) -> list[tuple[dict, dict]]:
+    history_pairs = []
+
+    for action, observation in get_pairs_from_events(history):
+        history_pairs.append((event_to_dict(action), event_to_dict(observation)))
+
+    return history_pairs

From 611d0e49d832c90ca5f0ffd29b72f2f7e73f4b03 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 17 Oct 2024 23:04:27 +0200
Subject: [PATCH 21/89] use event.id in memory, fix merge

---
 .../memcodeact_agent/memcodeact_agent.py        | 11 ++++++++---
 openhands/memory/memory.py                      | 17 +++++++----------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 782e9f2fc5b8..e8171fa63355 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -201,8 +201,13 @@ def step(self, state: State) -> Action:
         - AgentFinishAction() - end the interaction
         """
         # if we're done, go back
-        latest_user_message = state.history.get_last_user_message()
-        if latest_user_message and latest_user_message.strip() == '/exit':
+        last_user_message_content = None
+        for event in reversed(state.history):
+            if isinstance(event, MessageAction) and event.source == 'user':
+                last_user_message_content = event.content
+                break
+
+        if last_user_message_content and last_user_message_content.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM
@@ -242,7 +247,7 @@ def _get_messages(self, state: State) -> list[Message]:
             ),
         ]
 
-        for event in state.history.get_events():
+        for event in state.history:
             # create a regular message from an event
             if isinstance(event, Action):
                 message = self.get_action_message(event)
diff --git a/openhands/memory/memory.py b/openhands/memory/memory.py
index 9d83cc9cdc8c..fc063e17589c 100644
--- a/openhands/memory/memory.py
+++ b/openhands/memory/memory.py
@@ -54,7 +54,6 @@ def __init__(
 
         # instantiate the index
         self.index = VectorStoreIndex.from_vector_store(vector_store, self.embed_model)
-        self.thought_idx = 0
 
         # initialize the event stream
         self.event_stream = event_stream
@@ -85,18 +84,17 @@ def add_event(self, event: Event):
             event_type = 'observation'
             event_id = event_data['observation']
 
-        # create a Document instance for the event
+        # create a Document instance for the event using event.id
         doc = Document(
             text=json.dumps(event_data),
-            doc_id=str(self.thought_idx),
+            doc_id=event.id,
             extra_info={
                 'type': event_type,
                 'id': event_id,
-                'idx': self.thought_idx,
+                'event_id': event.id,
             },
         )
-        self.thought_idx += 1
-        logger.debug('Adding %s event to memory: %d', event_type, self.thought_idx)
+        logger.debug('Adding %s event to memory with doc_id: %s', event_type, event.id)
         self._add_document(document=doc)
 
     def _add_document(self, document: 'Document'):
@@ -159,18 +157,17 @@ def _events_to_docs(self) -> list['Document']:
                     event_type = 'observation'
                     event_id = event_data['observation']
 
-                # create a Document instance for the event
+                # create a Document instance for the event using event.id
                 doc = Document(
                     text=json.dumps(event_data),
-                    doc_id=str(self.thought_idx),
+                    doc_id=event.id,
                     extra_info={
                         'type': event_type,
                         'id': event_id,
-                        'idx': self.thought_idx,
+                        'event_id': event.id,
                     },
                 )
                 documents.append(doc)
-                self.thought_idx += 1
             except (json.JSONDecodeError, KeyError, ValueError) as e:
                 logger.warning(f'Failed to process event: {e}')
                 continue

From df3f0b6120c6b131bd142369bfeadae5510f930a Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Fri, 18 Oct 2024 03:35:27 +0200
Subject: [PATCH 22/89] wip add memory modules

---
 .../memcodeact_agent/memcodeact_agent.py      | 41 ++++++++-
 openhands/controller/agent.py                 |  3 -
 openhands/llm/llm.py                          | 26 ++++++
 openhands/memory/base_memory.py               | 21 +++++
 openhands/memory/chat_memory.py               | 25 ++++++
 openhands/memory/core_memory.py               | 22 +++++
 openhands/memory/recall_memory.py             | 89 +++++++++++++++++++
 7 files changed, 220 insertions(+), 7 deletions(-)
 create mode 100644 openhands/memory/base_memory.py
 create mode 100644 openhands/memory/chat_memory.py
 create mode 100644 openhands/memory/core_memory.py
 create mode 100644 openhands/memory/recall_memory.py

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index e8171fa63355..7cc2947c20cc 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -26,7 +26,10 @@
 from openhands.events.observation.observation import Observation
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
-from openhands.memory.conversation_memory import ConversationMemory
+from openhands.memory.base_memory import Memory
+from openhands.memory.chat_memory import ChatMemory
+from openhands.memory.core_memory import CoreMemory
+from openhands.memory.memory import LongTermMemory
 from openhands.runtime.plugins import (
     AgentSkillsRequirement,
     JupyterRequirement,
@@ -67,6 +70,9 @@ class MemCodeActAgent(Agent):
 
     action_parser = MemCodeActResponseParser()
 
+    # NOTE: memory includes 'chat', 'core', and 'long_term' memory modules
+    memory: dict[str, Memory] = {}
+
     def __init__(
         self,
         llm: LLM,
@@ -83,7 +89,19 @@ def __init__(
 
         self.memory_config = memory_config
 
-        self.conversation_memory = ConversationMemory(self.memory_config)
+        # Initialize the memory modules
+        chat_memory = ChatMemory(
+            persona='Default Persona', human='Default Human', limit=2000
+        )
+        core_memory = CoreMemory(
+            system_message=self.prompt_manager.system_message, limit=1500
+        )
+        long_term_memory = LongTermMemory(agent_state=self.agent_state, top_k=100)
+        self.memory = {
+            'chat': chat_memory,
+            'core': core_memory,
+            'long_term': long_term_memory,
+        }
 
         self.micro_agent = (
             MicroAgent(
@@ -183,8 +201,11 @@ def reset(self) -> None:
         """Resets the MemCodeAct Agent."""
         super().reset()
 
-        # clean its history
-        self.conversation_memory.reset()
+        # Reset the memory modules
+        self.memory['chat'].persona = ''
+        self.memory['chat'].human = ''
+        self.memory['core'].system_message = self.prompt_manager.system_message
+        self.memory['long_term'].cache = {}
 
     def step(self, state: State) -> Action:
         """Performs one step using the MemCodeAct Agent.
@@ -295,3 +316,15 @@ def _get_messages(self, state: State) -> list[Message]:
             latest_user_message.content.append(TextContent(text=reminder_text))
 
         return messages
+
+    def summarize_messages_inplace(self):
+        """Summarizes the messages stored in the agent's memory to reduce token usage."""
+        if len(self.memory['chat'].messages) <= 2:
+            return
+
+        # Summarize the chat memory
+        summary = self.llm.summarize_messages(self.memory['chat'].messages)
+        self.memory['chat'].messages = [
+            Message(role='system', content=[TextContent(text=summary)])
+        ]
+        logger.debug(f'Summarized chat memory to: {summary}')
diff --git a/openhands/controller/agent.py b/openhands/controller/agent.py
index 18c1bf9246ba..1896d6164a8c 100644
--- a/openhands/controller/agent.py
+++ b/openhands/controller/agent.py
@@ -1,8 +1,6 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Type
 
-from openhands.memory.conversation_memory import ConversationMemory
-
 if TYPE_CHECKING:
     from openhands.controller.state.state import State
     from openhands.core.config import AgentConfig
@@ -26,7 +24,6 @@ class Agent(ABC):
 
     _registry: dict[str, Type['Agent']] = {}
     sandbox_plugins: list[PluginRequirement] = []
-    memory: ConversationMemory
 
     def __init__(
         self,
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index a419015712ab..85f9b74d7568 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -402,3 +402,29 @@ def format_messages_for_llm(self, messages: Message | list[Message]) -> list[dic
 
         # let pydantic handle the serialization
         return [message.model_dump() for message in messages]
+
+    def summarize_messages(self, messages: list[Message]) -> str:
+        """Summarizes a list of messages using the LLM."""
+        token_counts = [self.get_token_count(m) for m in messages]
+        max_tokens = (
+            self.model_info.get('max_tokens', 4096)
+            if self.model_info is not None
+            else 4096
+        )
+        desired_token_count_to_summarize = max_tokens // 4
+        tokens_so_far = 0
+        cutoff = 0
+        for i, msg in enumerate(messages):
+            cutoff = i
+            tokens_so_far += token_counts[i]
+            if tokens_so_far > desired_token_count_to_summarize:
+                break
+        cutoff += 1
+        message_sequence_to_summarize = messages[1:cutoff]
+        if len(message_sequence_to_summarize) <= 1:
+            return ''
+        prompt = 'Please summarize the following conversation:\n\n'
+        for message in message_sequence_to_summarize:
+            prompt += f'{message.role.capitalize()}: {message.content}\n'
+        response = self._completion(messages=[{'role': 'user', 'content': prompt}])
+        return response['choices'][0]['message']['content']
diff --git a/openhands/memory/base_memory.py b/openhands/memory/base_memory.py
new file mode 100644
index 000000000000..a474095ebe4a
--- /dev/null
+++ b/openhands/memory/base_memory.py
@@ -0,0 +1,21 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class Memory(ABC):
+    """Abstract base class for all memory modules."""
+
+    @abstractmethod
+    def to_dict(self) -> dict[str, Any]:
+        """Convert the memory module to a dictionary."""
+        pass
+
+    @abstractmethod
+    def from_dict(self, data: dict[str, Any]) -> None:
+        """Load the memory module from a dictionary."""
+        pass
+
+    @abstractmethod
+    def __str__(self) -> str:
+        """String representation of the memory module."""
+        pass
diff --git a/openhands/memory/chat_memory.py b/openhands/memory/chat_memory.py
new file mode 100644
index 000000000000..e83b09080f24
--- /dev/null
+++ b/openhands/memory/chat_memory.py
@@ -0,0 +1,25 @@
+from .base_memory import Memory
+
+
+class ChatMemory(Memory):
+    """Manages conversational context like persona and user details."""
+
+    def __init__(self, persona: str, human: str, limit: int = 2000):
+        self.persona = persona
+        self.human = human
+        self.limit = limit
+
+    def to_dict(self) -> dict:
+        return {
+            'persona': self.persona,
+            'human': self.human,
+            'limit': self.limit,
+        }
+
+    def from_dict(self, data: dict) -> None:
+        self.persona = data.get('persona', '')
+        self.human = data.get('human', '')
+        self.limit = data.get('limit', 2000)
+
+    def __str__(self) -> str:
+        return f'Persona: {self.persona}\nHuman: {self.human}'
diff --git a/openhands/memory/core_memory.py b/openhands/memory/core_memory.py
new file mode 100644
index 000000000000..aea2bb306aca
--- /dev/null
+++ b/openhands/memory/core_memory.py
@@ -0,0 +1,22 @@
+from .base_memory import Memory
+
+
+class CoreMemory(Memory):
+    """Holds essential system messages and foundational context."""
+
+    def __init__(self, system_message: str, limit: int = 1500):
+        self.system_message = system_message
+        self.limit = limit
+
+    def to_dict(self) -> dict:
+        return {
+            'system_message': self.system_message,
+            'limit': self.limit,
+        }
+
+    def from_dict(self, data: dict) -> None:
+        self.system_message = data.get('system_message', '')
+        self.limit = data.get('limit', 1500)
+
+    def __str__(self) -> str:
+        return f'System Message: {self.system_message}'
diff --git a/openhands/memory/recall_memory.py b/openhands/memory/recall_memory.py
new file mode 100644
index 000000000000..3e3f78829b70
--- /dev/null
+++ b/openhands/memory/recall_memory.py
@@ -0,0 +1,89 @@
+from typing import List, Optional, Tuple
+
+from .base_memory import Memory
+from .memory import LongTermMemory
+
+
+class RecallMemory(Memory):
+    """Facilitates retrieval of information from ArchivalMemory."""
+
+    def __init__(
+        self, long_term_memory: LongTermMemory, embedding_model: any, top_k: int = 10
+    ):
+        """
+        Initialize RecallMemory with a reference to ArchivalMemory.
+
+        Args:
+            archival_memory (LongTermMemory): The archival memory instance to query.
+            embedding_model (any): The embedding model used for vector transformations.
+            top_k (int): Number of top results to retrieve.
+        """
+        self.long_term_memory = long_term_memory
+        self.embedding_model = embedding_model
+        self.top_k = top_k
+
+    def to_dict(self) -> dict:
+        return {
+            'long_term_memory': self.long_term_memory.to_dict(),
+            'top_k': self.top_k,
+        }
+
+    def from_dict(self, data: dict) -> None:
+        self.long_term_memory.from_dict(data.get('long_term_memory', {}))
+        self.top_k = data.get('top_k', 10)
+
+    def __str__(self) -> str:
+        return f'RecallMemory with top_k={self.top_k}'
+
+    def text_search(
+        self, query: str, count: Optional[int] = None, start: Optional[int] = None
+    ) -> Tuple[List[str], int]:
+        """
+        Perform a text-based search on ArchivalMemory.
+
+        Args:
+            query (str): The text query to search for.
+            count (Optional[int]): Number of results to return.
+            start (Optional[int]): Pagination start index.
+
+        Returns:
+            Tuple[List[str], int]: A tuple containing the list of matching messages and the total number of matches.
+        """
+        return self.long_term_memory.text_search(query, count, start)
+
+    def date_search(
+        self,
+        start_date: str,
+        end_date: str,
+        count: Optional[int] = None,
+        start: Optional[int] = None,
+    ) -> Tuple[List[str], int]:
+        """
+        Perform a date-based search on ArchivalMemory.
+
+        Args:
+            start_date (str): Start date in YYYY-MM-DD format.
+            end_date (str): End date in YYYY-MM-DD format.
+            count (Optional[int]): Number of results to return.
+            start (Optional[int]): Pagination start index.
+
+        Returns:
+            Tuple[List[str], int]: A tuple containing the list of matching messages and the total number of matches.
+        """
+        return self.long_term_memory.date_search(start_date, end_date, count, start)
+
+    def embedding_search(
+        self, query: str, count: Optional[int] = None, start: Optional[int] = None
+    ) -> Tuple[List[str], int]:
+        """
+        Perform an embedding-based semantic search on ArchivalMemory.
+
+        Args:
+            query (str): The query string for semantic search.
+            count (Optional[int]): Number of results to return.
+            start (Optional[int]): Pagination start index.
+
+        Returns:
+            Tuple[List[str], int]: A tuple containing the list of semantically similar messages and the total number of matches.
+        """
+        return self.long_term_memory.search(query, count, start)

From 7b0a8355aba039b302ff400650b29baefddd21d2 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Fri, 18 Oct 2024 23:44:27 +0200
Subject: [PATCH 23/89] add get_last_user_message

---
 openhands/agenthub/codeact_agent/codeact_agent.py      |  9 ++-------
 .../agenthub/codeact_swe_agent/codeact_swe_agent.py    |  9 ++-------
 openhands/controller/state/state.py                    | 10 ++++++++--
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 0779b085e016..4b86aa12712a 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -191,13 +191,8 @@ def step(self, state: State) -> Action:
         - AgentFinishAction() - end the interaction
         """
         # if we're done, go back
-        last_user_message_content = None
-        for event in reversed(state.history):
-            if isinstance(event, MessageAction) and event.source == 'user':
-                last_user_message_content = event.content
-                break
-
-        if last_user_message_content and last_user_message_content.strip() == '/exit':
+        last_user_message = state.get_last_user_message()
+        if last_user_message and last_user_message.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM
diff --git a/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py b/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
index 130fca5fcba5..a3e909a9bf1d 100644
--- a/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -156,13 +156,8 @@ def step(self, state: State) -> Action:
         - AgentFinishAction() - end the interaction
         """
         # if we're done, go back
-        last_user_message_content = None
-        for event in reversed(state.history):
-            if isinstance(event, MessageAction) and event.source == 'user':
-                last_user_message_content = event.content
-                break
-
-        if last_user_message_content and last_user_message_content.strip() == '/exit':
+        last_user_message = state.get_last_user_message()
+        if last_user_message and last_user_message.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index cfc247ec3c9d..5f131662ef77 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -12,7 +12,7 @@
     MessageAction,
 )
 from openhands.events.action.agent import AgentFinishAction
-from openhands.events.event import Event
+from openhands.events.event import Event, EventSource
 from openhands.events.observation import AgentDelegateObservation
 from openhands.storage.files import FileStore
 
@@ -155,6 +155,12 @@ def has_delegation(self) -> bool:
 
     def get_last_agent_message(self) -> str | None:
         for event in reversed(self.history):
-            if isinstance(event, MessageAction) and event.source == 'agent':
+            if isinstance(event, MessageAction) and event.source == EventSource.AGENT:
+                return event.content
+        return None
+
+    def get_last_user_message(self) -> str | None:
+        for event in reversed(self.history):
+            if isinstance(event, MessageAction) and event.source == EventSource.USER:
                 return event.content
         return None

From 34a7b7098e04dacd5c3a8e53b6944a3627973cbb Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sat, 19 Oct 2024 20:55:54 +0200
Subject: [PATCH 24/89] set user message

---
 .../memcodeact_agent/memcodeact_agent.py      | 11 ++-----
 openhands/memory/recall_memory.py             | 32 +++++++++----------
 2 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 7cc2947c20cc..b847b01752a4 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -70,7 +70,7 @@ class MemCodeActAgent(Agent):
 
     action_parser = MemCodeActResponseParser()
 
-    # NOTE: memory includes 'chat', 'core', and 'long_term' memory modules
+    # NOTE: memory includes 'chat', 'core', and 'long_term' memory
     memory: dict[str, Memory] = {}
 
     def __init__(
@@ -222,13 +222,8 @@ def step(self, state: State) -> Action:
         - AgentFinishAction() - end the interaction
         """
         # if we're done, go back
-        last_user_message_content = None
-        for event in reversed(state.history):
-            if isinstance(event, MessageAction) and event.source == 'user':
-                last_user_message_content = event.content
-                break
-
-        if last_user_message_content and last_user_message_content.strip() == '/exit':
+        last_user_message = state.get_last_user_message()
+        if last_user_message and last_user_message.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM
diff --git a/openhands/memory/recall_memory.py b/openhands/memory/recall_memory.py
index 3e3f78829b70..986579cb3e85 100644
--- a/openhands/memory/recall_memory.py
+++ b/openhands/memory/recall_memory.py
@@ -1,17 +1,17 @@
-from typing import List, Optional, Tuple
-
 from .base_memory import Memory
 from .memory import LongTermMemory
+from openhands.core.config.llm_config import LLMConfig
 
+TOP_K = 10
 
 class RecallMemory(Memory):
-    """Facilitates retrieval of information from ArchivalMemory."""
+    """Facilitates retrieval of information from LongTermMemory."""
 
     def __init__(
-        self, long_term_memory: LongTermMemory, embedding_model: any, top_k: int = 10
+        self, long_term_memory: LongTermMemory, llm_config: LLMConfig, top_k: int = TOP_K
     ):
         """
-        Initialize RecallMemory with a reference to ArchivalMemory.
+        Initialize RecallMemory with a reference to LongTermMemory.
 
         Args:
             archival_memory (LongTermMemory): The archival memory instance to query.
@@ -19,7 +19,7 @@ def __init__(
             top_k (int): Number of top results to retrieve.
         """
         self.long_term_memory = long_term_memory
-        self.embedding_model = embedding_model
+        self.llm_config = llm_config
         self.top_k = top_k
 
     def to_dict(self) -> dict:
@@ -36,10 +36,10 @@ def __str__(self) -> str:
         return f'RecallMemory with top_k={self.top_k}'
 
     def text_search(
-        self, query: str, count: Optional[int] = None, start: Optional[int] = None
-    ) -> Tuple[List[str], int]:
+        self, query: str, count: int | None = None, start: int | None = None
+    ) -> tuple[list[str], int]:
         """
-        Perform a text-based search on ArchivalMemory.
+        Perform a text-based search on LongTermMemory.
 
         Args:
             query (str): The text query to search for.
@@ -55,11 +55,11 @@ def date_search(
         self,
         start_date: str,
         end_date: str,
-        count: Optional[int] = None,
-        start: Optional[int] = None,
-    ) -> Tuple[List[str], int]:
+        count: int | None = None,
+        start: int | None = None,
+    ) -> tuple[list[str], int]:
         """
-        Perform a date-based search on ArchivalMemory.
+        Perform a date-based search on LongTermMemory.
 
         Args:
             start_date (str): Start date in YYYY-MM-DD format.
@@ -73,10 +73,10 @@ def date_search(
         return self.long_term_memory.date_search(start_date, end_date, count, start)
 
     def embedding_search(
-        self, query: str, count: Optional[int] = None, start: Optional[int] = None
-    ) -> Tuple[List[str], int]:
+        self, query: str, count: int | None = None, start: int | None = None
+    ) -> tuple[list[str], int]:
         """
-        Perform an embedding-based semantic search on ArchivalMemory.
+        Perform an embedding-based semantic search on LongTermMemory.
 
         Args:
             query (str): The query string for semantic search.

From cb60751642b51381948e57288caeba95d717789a Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sat, 19 Oct 2024 21:02:41 +0200
Subject: [PATCH 25/89] fix imports

---
 openhands/memory/base_memory.py   |  4 ++--
 openhands/memory/chat_memory.py   |  2 +-
 openhands/memory/core_memory.py   |  2 +-
 openhands/memory/recall_memory.py | 10 +++++++---
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/openhands/memory/base_memory.py b/openhands/memory/base_memory.py
index a474095ebe4a..cbf5cc8144e1 100644
--- a/openhands/memory/base_memory.py
+++ b/openhands/memory/base_memory.py
@@ -7,12 +7,12 @@ class Memory(ABC):
 
     @abstractmethod
     def to_dict(self) -> dict[str, Any]:
-        """Convert the memory module to a dictionary."""
+        """Convert the memory module to a dictionary of individual memories."""
         pass
 
     @abstractmethod
     def from_dict(self, data: dict[str, Any]) -> None:
-        """Load the memory module from a dictionary."""
+        """Load the memory module from a dictionary of individual memories."""
         pass
 
     @abstractmethod
diff --git a/openhands/memory/chat_memory.py b/openhands/memory/chat_memory.py
index e83b09080f24..f31c0c41e3b5 100644
--- a/openhands/memory/chat_memory.py
+++ b/openhands/memory/chat_memory.py
@@ -1,4 +1,4 @@
-from .base_memory import Memory
+from openhands.memory.base_memory import Memory
 
 
 class ChatMemory(Memory):
diff --git a/openhands/memory/core_memory.py b/openhands/memory/core_memory.py
index aea2bb306aca..c24886bdab55 100644
--- a/openhands/memory/core_memory.py
+++ b/openhands/memory/core_memory.py
@@ -1,4 +1,4 @@
-from .base_memory import Memory
+from openhands.memory.base_memory import Memory
 
 
 class CoreMemory(Memory):
diff --git a/openhands/memory/recall_memory.py b/openhands/memory/recall_memory.py
index 986579cb3e85..217580dc61de 100644
--- a/openhands/memory/recall_memory.py
+++ b/openhands/memory/recall_memory.py
@@ -1,14 +1,18 @@
-from .base_memory import Memory
-from .memory import LongTermMemory
 from openhands.core.config.llm_config import LLMConfig
+from openhands.memory.base_memory import Memory
+from openhands.memory.memory import LongTermMemory
 
 TOP_K = 10
 
+
 class RecallMemory(Memory):
     """Facilitates retrieval of information from LongTermMemory."""
 
     def __init__(
-        self, long_term_memory: LongTermMemory, llm_config: LLMConfig, top_k: int = TOP_K
+        self,
+        long_term_memory: LongTermMemory,
+        llm_config: LLMConfig,
+        top_k: int = TOP_K,
     ):
         """
         Initialize RecallMemory with a reference to LongTermMemory.

From c235d610eeeb2e780147e4fa92b63a3c30f79bfd Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sat, 19 Oct 2024 23:57:02 +0200
Subject: [PATCH 26/89] fix objects

---
 .../memcodeact_agent/memcodeact_agent.py      |  66 +++++---
 openhands/controller/agent.py                 |   2 +-
 openhands/core/config/llm_config.py           |   4 +
 openhands/memory/base_memory.py               |   5 +
 openhands/memory/conversation_memory.py       | 146 ------------------
 openhands/memory/core_memory.py               |   6 +-
 openhands/memory/recall_memory.py             |  73 ++++-----
 openhands/utils/prompt.py                     |  13 +-
 8 files changed, 109 insertions(+), 206 deletions(-)
 delete mode 100644 openhands/memory/conversation_memory.py

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index b847b01752a4..2ff64da50bda 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -5,7 +5,7 @@
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
-from openhands.core.config.memory_config import MemoryConfig
+from openhands.core.config.llm_config import LLMConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
@@ -30,6 +30,7 @@
 from openhands.memory.chat_memory import ChatMemory
 from openhands.memory.core_memory import CoreMemory
 from openhands.memory.memory import LongTermMemory
+from openhands.memory.recall_memory import ConversationMemory
 from openhands.runtime.plugins import (
     AgentSkillsRequirement,
     JupyterRequirement,
@@ -42,21 +43,38 @@
 class MemCodeActAgent(Agent):
     VERSION = '1.9'
     """
-    The MemCode Act Agent is a minimalist agent.
+    The MemCode Act Agent is a memory-enabled version of the CodeAct agent.
+
+    Its memory modules are:
+    - conversation: recall memory (history)
+    - core: core system messages
+    - long_term: long-term memory
+
+    Its memory actions are:
+        - "core_memory_append"
+        - "core_memory_replace"
+        - "conversation_search"
+        - "long_term_memory_insert"
+        - "long_term_memory_search"
+        - "summarize_conversation"
     The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
 
     ### Overview
 
-    This agent implements the MemCodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
-
+    This agent implements:
+    - the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
+    - inspired by the Generative Agents idea([paper](https://arxiv.org/abs/2304.03442)) and the MemGPT idea ([paper](https://arxiv.org/abs/2310.08560))
+    
     The conceptual idea is illustrated below. At each turn, the agent can:
 
     1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
-    2. **MemCodeAct**: Choose to perform the task by executing code
-    - Execute any valid Linux `bash` command
-    - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
-
-    ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)
+    2. **CodeAct**: Choose to perform the task by executing code
+        - Execute any valid Linux `bash` command
+        - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
+    3. **MemGPT**: Manage its own memory
+        - truncate its history and replace it with a summary
+        - store information in its long-term memory
+        - search for information relevant to the task.
 
     """
 
@@ -77,28 +95,33 @@ def __init__(
         self,
         llm: LLM,
         config: AgentConfig,
-        memory_config: MemoryConfig = None,
+        memory_config: LLMConfig = None,
     ) -> None:
         """Initializes a new instance of the MemCodeActAgent class.
 
         Parameters:
-        - llm (LLM): The llm to be used by this agent
+        - llm: The LLM to be used by this agent
+        - config: The agent configuration
+        - memory_config: The memory configuration
         """
         super().__init__(llm, config)
         self.reset()
 
         self.memory_config = memory_config
 
-        # Initialize the memory modules
-        chat_memory = ChatMemory(
-            persona='Default Persona', human='Default Human', limit=2000
-        )
+        # initialize the memory modules
+        # stores and recalls the whole agent's history
+        conversation_memory = ConversationMemory(agent_state=self.agent_state, top_k=100)
+
         core_memory = CoreMemory(
             system_message=self.prompt_manager.system_message, limit=1500
         )
+
+        # stores and searches the agent's long-term memory (vector store)
         long_term_memory = LongTermMemory(agent_state=self.agent_state, top_k=100)
+        
         self.memory = {
-            'chat': chat_memory,
+            'conversation': conversation_memory,
             'core': core_memory,
             'long_term': long_term_memory,
         }
@@ -201,15 +224,14 @@ def reset(self) -> None:
         """Resets the MemCodeAct Agent."""
         super().reset()
 
-        # Reset the memory modules
-        self.memory['chat'].persona = ''
-        self.memory['chat'].human = ''
+        # reset the memory modules
         self.memory['core'].system_message = self.prompt_manager.system_message
         self.memory['long_term'].cache = {}
+        self.memory['conversation'].reset()
 
     def step(self, state: State) -> Action:
         """Performs one step using the MemCodeAct Agent.
-        This includes gathering info on previous steps and prompting the model to make a command to execute.
+        This includes gathering info on previous steps and prompting the model to make an action to execute.
 
         Parameters:
         - state (State): used to get updated info
@@ -219,6 +241,10 @@ def step(self, state: State) -> Action:
         - IPythonRunCellAction(code) - IPython code to run
         - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
         - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - SummarizeAction() - summarize the conversation
+        - RecallAction() - search the agent's history
+        - LongTermMemoryInsertAction() - archive information in the long-term memory
+        - LongTermMemorySearchAction() - search the agent's long-term memory
         - AgentFinishAction() - end the interaction
         """
         # if we're done, go back
diff --git a/openhands/controller/agent.py b/openhands/controller/agent.py
index 1896d6164a8c..c10d6d9f8cee 100644
--- a/openhands/controller/agent.py
+++ b/openhands/controller/agent.py
@@ -55,7 +55,7 @@ def reset(self) -> None:
         to prepare the agent for restarting the instruction or cleaning up before destruction.
 
         """
-        self.memory.reset()
+        #self.memory.reset()
         self._complete = False
 
         if self.llm:
diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index 98596b2a9ce2..da54668b1c6d 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -39,6 +39,8 @@ class LLMConfig:
         disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
         caching_prompt: Use the prompt caching feature if provided by the LLM and supported by the provider.
         log_completions: Whether to log LLM completions to the state.
+        max_conversation_window: The maximum number of messages to include in the conversation window (context), after which old messages are truncated or summarized.
+        conversation_top_k: The number of top results to retrieve from the conversation history.
     """
 
     model: str = 'gpt-4o'
@@ -71,6 +73,8 @@ class LLMConfig:
     disable_vision: bool | None = None
     caching_prompt: bool = True
     log_completions: bool = False
+    max_conversation_window: int = 10
+    conversation_top_k: int = 5
 
     def defaults_to_dict(self) -> dict:
         """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
diff --git a/openhands/memory/base_memory.py b/openhands/memory/base_memory.py
index cbf5cc8144e1..506f1e732fee 100644
--- a/openhands/memory/base_memory.py
+++ b/openhands/memory/base_memory.py
@@ -19,3 +19,8 @@ def from_dict(self, data: dict[str, Any]) -> None:
     def __str__(self) -> str:
         """String representation of the memory module."""
         pass
+
+    @abstractmethod
+    def reset(self) -> None:
+        """Reset the memory module."""
+        pass
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
deleted file mode 100644
index 376a1a00a78f..000000000000
--- a/openhands/memory/conversation_memory.py
+++ /dev/null
@@ -1,146 +0,0 @@
-from openhands.core.logger import openhands_logger as logger
-from openhands.events.action.action import Action
-from openhands.events.action.agent import (
-    AgentDelegateAction,
-    AgentFinishAction,
-)
-from openhands.events.action.message import MessageAction
-from openhands.events.event import Event, EventSource
-from openhands.events.observation.delegate import AgentDelegateObservation
-from openhands.events.observation.observation import Observation
-
-
-class ConversationMemory:
-    """A list of events in the immediate memory of the agent.
-
-    This class provides methods to retrieve and filter the events in the history of the running agent.
-    """
-
-    _history: list[Event]
-    delegates: dict[tuple[int, int], tuple[str, str]]
-
-    def __init__(self, history: list[Event]):
-        self._history = history
-
-        self.delegates = {}
-        self.start_id = len(self._history) - 1
-
-    def get_events(self, reverse: bool = False) -> list[Event]:
-        """Retrieve and return events for agent's use as a list of Event objects. Whether it includes delegates is up to the agent controller that initialized state.history."""
-
-        return self._history if not reverse else list(reversed(self._history))
-
-    def get_last_events(self, n: int) -> list[Event]:
-        """Return the last n events from the history."""
-
-        end_id = len(self._history) - 1
-
-        # FIXME this ignores that there are events that won't be returned, like NullObservations
-        start_id = max(self.start_id, end_id - n + 1)
-
-        return list(event for event in self._history[start_id:end_id])
-
-    async def on_event(self, event: Event):
-        if not isinstance(event, AgentDelegateObservation):
-            return
-
-        logger.debug('AgentDelegateObservation received')
-
-        # figure out what this delegate's actions were
-        # from the last AgentDelegateAction to this AgentDelegateObservation
-        # and save their ids as start and end ids
-        # in order to use later to exclude them from parent stream
-        # or summarize them
-        delegate_end = event.id
-        delegate_start = -1
-        delegate_agent: str = ''
-        delegate_task: str = ''
-        for prev_event in self._history[event.id - 1 :: -1]:
-            if isinstance(prev_event, AgentDelegateAction):
-                delegate_start = prev_event.id
-                delegate_agent = prev_event.agent
-                delegate_task = prev_event.inputs.get('task', '')
-                break
-
-        if delegate_start == -1:
-            logger.error(
-                f'No AgentDelegateAction found for AgentDelegateObservation with id={delegate_end}'
-            )
-            return
-
-        self.delegates[(delegate_start, delegate_end)] = (delegate_agent, delegate_task)
-        logger.debug(
-            f'Delegate {delegate_agent} with task {delegate_task} ran from id={delegate_start} to id={delegate_end}'
-        )
-
-    def reset(self):
-        self.delegates = {}
-
-        # wipe history of previous interactions
-        # alternatively, we can re-initialize a new event stream, then we need to notify everyone who is subscribed to this event stream
-        self._history = []
-
-    def get_current_user_intent(self):
-        """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
-        last_user_message = None
-        last_user_message_image_urls: list[str] | None = []
-        for event in self._history[::-1]:
-            if isinstance(event, MessageAction) and event.source == EventSource.USER:
-                last_user_message = event.content
-                last_user_message_image_urls = event.images_urls
-            elif isinstance(event, AgentFinishAction):
-                if last_user_message is not None:
-                    return last_user_message
-
-        return last_user_message, last_user_message_image_urls
-
-    def get_last_action(self, end_id: int = -1) -> Action | None:
-        """Return the last action from history, filtered to exclude unwanted events."""
-
-        last_action = next(
-            (event for event in self._history if isinstance(event, Action)),
-            None,
-        )
-
-        return last_action
-
-    def get_last_observation(self, end_id: int = -1) -> Observation | None:
-        """Return the last observation from history, filtered to exclude unwanted events."""
-
-        last_observation = next(
-            (
-                event
-                for event in self._history[end_id::-1]
-                if isinstance(event, Observation)
-            ),
-            None,
-        )
-
-        return last_observation
-
-    def get_last_user_message(self) -> str:
-        """Return the content of the last user message from history."""
-        last_user_message = next(
-            (
-                event.content
-                for event in self._history
-                if isinstance(event, MessageAction) and event.source == EventSource.USER
-            ),
-            None,
-        )
-
-        return last_user_message if last_user_message is not None else ''
-
-    def get_last_agent_message(self) -> str:
-        """Return the content of the last agent message from the event stream."""
-        last_agent_message = next(
-            (
-                event.content
-                for event in self._history
-                if isinstance(event, MessageAction)
-                and event.source == EventSource.AGENT
-            ),
-            None,
-        )
-
-        return last_agent_message if last_agent_message is not None else ''
diff --git a/openhands/memory/core_memory.py b/openhands/memory/core_memory.py
index c24886bdab55..09490ccc8c5e 100644
--- a/openhands/memory/core_memory.py
+++ b/openhands/memory/core_memory.py
@@ -1,8 +1,8 @@
 from openhands.memory.base_memory import Memory
+from openhands.utils.prompt import PromptManager
 
-
-class CoreMemory(Memory):
-    """Holds essential system messages and foundational context."""
+class CoreMemory(PromptManager, Memory):
+    """Memory contents to be inserted in the prompt. This includes summaries and other information that the LLM thought was important."""
 
     def __init__(self, system_message: str, limit: int = 1500):
         self.system_message = system_message
diff --git a/openhands/memory/recall_memory.py b/openhands/memory/recall_memory.py
index 217580dc61de..7547edbab750 100644
--- a/openhands/memory/recall_memory.py
+++ b/openhands/memory/recall_memory.py
@@ -1,43 +1,40 @@
 from openhands.core.config.llm_config import LLMConfig
 from openhands.memory.base_memory import Memory
 from openhands.memory.memory import LongTermMemory
+from openhands.events.event import Event
 
 TOP_K = 10
 
 
-class RecallMemory(Memory):
-    """Facilitates retrieval of information from LongTermMemory."""
+class ConversationMemory(Memory):
+    """Allows the agent to recall events from its entire history."""
 
     def __init__(
         self,
-        long_term_memory: LongTermMemory,
-        llm_config: LLMConfig,
-        top_k: int = TOP_K,
+        memory_config: LLMConfig,
+        history: list[Event],
     ):
         """
-        Initialize RecallMemory with a reference to LongTermMemory.
+        Initialize ConversationMemory with a reference to history and long-term memory.
 
         Args:
-            archival_memory (LongTermMemory): The archival memory instance to query.
-            embedding_model (any): The embedding model used for vector transformations.
-            top_k (int): Number of top results to retrieve.
+        - history: The history of the current agent conversation.
+        - llm_config: The LLM configuration.
+        - top_k: Number of top results to retrieve.
         """
-        self.long_term_memory = long_term_memory
-        self.llm_config = llm_config
-        self.top_k = top_k
+        self.history = history or []
+        self.llm_config = memory_config
+        self.top_k = TOP_K
 
     def to_dict(self) -> dict:
-        return {
-            'long_term_memory': self.long_term_memory.to_dict(),
-            'top_k': self.top_k,
-        }
+        # return a dict with key = event.id, value = event.to_dict()
+        return {event.id: event.to_dict() for event in self.history}
 
     def from_dict(self, data: dict) -> None:
-        self.long_term_memory.from_dict(data.get('long_term_memory', {}))
-        self.top_k = data.get('top_k', 10)
+        self.history = [Event.from_dict(event) for event in data.values()]
 
     def __str__(self) -> str:
-        return f'RecallMemory with top_k={self.top_k}'
+        return f'ConversationMemory with {len(self.history)} events'
 
     def text_search(
         self, query: str, count: int | None = None, start: int | None = None
@@ -46,14 +43,15 @@ def text_search(
         Perform a text-based search on LongTermMemory.
 
         Args:
-            query (str): The text query to search for.
-            count (Optional[int]): Number of results to return.
-            start (Optional[int]): Pagination start index.
+            query: The text query to search for.
+            count: Number of results to return.
+            start: Pagination start index.
 
         Returns:
-            Tuple[List[str], int]: A tuple containing the list of matching messages and the total number of matches.
+            A tuple containing the list of matching messages and the total number of matches.
         """
-        return self.long_term_memory.text_search(query, count, start)
+        # return self.long_term_memory.text_search(query, count, start)
+        pass
 
     def date_search(
         self,
@@ -66,15 +64,16 @@ def date_search(
         Perform a date-based search on LongTermMemory.
 
         Args:
-            start_date (str): Start date in YYYY-MM-DD format.
-            end_date (str): End date in YYYY-MM-DD format.
-            count (Optional[int]): Number of results to return.
-            start (Optional[int]): Pagination start index.
+            start_date: Start date in YYYY-MM-DD format.
+            end_date: End date in YYYY-MM-DD format.
+            count: Number of results to return.
+            start: Pagination start index.
 
         Returns:
-            Tuple[List[str], int]: A tuple containing the list of matching messages and the total number of matches.
+            A tuple containing the list of matching messages and the total number of matches.
         """
-        return self.long_term_memory.date_search(start_date, end_date, count, start)
+        # return self.long_term_memory.date_search(start_date, end_date, count, start)
+        pass
 
     def embedding_search(
         self, query: str, count: int | None = None, start: int | None = None
@@ -83,11 +82,15 @@ def embedding_search(
         Perform an embedding-based semantic search on LongTermMemory.
 
         Args:
-            query (str): The query string for semantic search.
-            count (Optional[int]): Number of results to return.
-            start (Optional[int]): Pagination start index.
+            query: The query string for semantic search.
+            count: Number of results to return.
+            start: Pagination start index.
 
         Returns:
-            Tuple[List[str], int]: A tuple containing the list of semantically similar messages and the total number of matches.
+            A tuple containing the list of semantically similar messages and the total number of matches.
         """
-        return self.long_term_memory.search(query, count, start)
+        # return self.long_term_memory.search(query, count, start)
+        pass
+
+    def reset(self) -> None:
+        self.history = []
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 8b9dd81e8bfd..430b9af47fbb 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -17,6 +17,7 @@ class PromptManager:
         prompt_dir (str): Directory containing prompt templates.
         agent_skills_docs (str): Documentation of agent skills.
         micro_agent (MicroAgent | None): Micro-agent, if specified.
+        
     """
 
     def __init__(
@@ -29,13 +30,17 @@ def __init__(
         self.agent_skills_docs: str = agent_skills_docs
 
         self.system_template: Template = self._load_template('system_prompt')
+        self.summarize_template: Template = self._load_template('summarize_prompt')
+        self.memory_template: Template = self._load_template('memory_prompt')
         self.user_template: Template = self._load_template('user_prompt')
         self.micro_agent: MicroAgent | None = micro_agent
 
     def _load_template(self, template_name: str) -> Template:
         template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
         if not os.path.exists(template_path):
-            raise FileNotFoundError(f'Prompt file {template_path} not found')
+            #raise FileNotFoundError(f'Prompt file {template_path} not found')
+            print(f'Prompt file {template_path} not found')
+            return Template('')
         with open(template_path, 'r') as file:
             return Template(file.read())
 
@@ -43,6 +48,7 @@ def _load_template(self, template_name: str) -> Template:
     def system_message(self) -> str:
         rendered = self.system_template.render(
             agent_skills_docs=self.agent_skills_docs,
+            memory_template=self.memory_template.render(),
         ).strip()
         return rendered
 
@@ -61,3 +67,8 @@ def initial_user_message(self) -> str:
             micro_agent=self.micro_agent.content if self.micro_agent else None
         )
         return rendered.strip()
+
+    @property
+    def summarize_message(self) -> str:
+        rendered = self.summarize_template.render()
+        return rendered.strip()

From 7affbfd3fe977efc80d3267438c4caea6f8a30c7 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sat, 19 Oct 2024 23:58:57 +0200
Subject: [PATCH 27/89] add prompts

---
 .../memcodeact_agent/memcodeact_agent.py      | 41 +++++++++----------
 .../memcodeact_agent/memory_prompt.j2         | 25 +++++++++++
 .../memcodeact_agent/summarize_prompt.j2      |  6 +++
 .../memcodeact_agent/system_prompt.j2         |  4 +-
 4 files changed, 54 insertions(+), 22 deletions(-)
 create mode 100644 openhands/agenthub/memcodeact_agent/memory_prompt.j2
 create mode 100644 openhands/agenthub/memcodeact_agent/summarize_prompt.j2

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 2ff64da50bda..9a3000c79c75 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -27,9 +27,7 @@
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
 from openhands.memory.base_memory import Memory
-from openhands.memory.chat_memory import ChatMemory
 from openhands.memory.core_memory import CoreMemory
-from openhands.memory.memory import LongTermMemory
 from openhands.memory.recall_memory import ConversationMemory
 from openhands.runtime.plugins import (
     AgentSkillsRequirement,
@@ -64,7 +62,7 @@ class MemCodeActAgent(Agent):
     This agent implements:
     - the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
     - inspired by the Generative Agents idea([paper](https://arxiv.org/abs/2304.03442)) and the MemGPT idea ([paper](https://arxiv.org/abs/2310.08560))
-    
+
     The conceptual idea is illustrated below. At each turn, the agent can:
 
     1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
@@ -105,27 +103,9 @@ def __init__(
         - memory_config: The memory configuration
         """
         super().__init__(llm, config)
-        self.reset()
 
         self.memory_config = memory_config
 
-        # initialize the memory modules
-        # stores and recalls the whole agent's history
-        conversation_memory = ConversationMemory(agent_state=self.agent_state, top_k=100)
-
-        core_memory = CoreMemory(
-            system_message=self.prompt_manager.system_message, limit=1500
-        )
-
-        # stores and searches the agent's long-term memory (vector store)
-        long_term_memory = LongTermMemory(agent_state=self.agent_state, top_k=100)
-        
-        self.memory = {
-            'conversation': conversation_memory,
-            'core': core_memory,
-            'long_term': long_term_memory,
-        }
-
         self.micro_agent = (
             MicroAgent(
                 os.path.join(
@@ -252,6 +232,25 @@ def step(self, state: State) -> Action:
         if last_user_message and last_user_message.strip() == '/exit':
             return AgentFinishAction()
 
+        # initialize the memory modules
+
+        # stores and searches the agent's long-term memory (vector store)
+        # long_term_memory = LongTermMemory(llm_config=memory_config, agent_config=config, event_stream=self.event_stream)
+
+        # stores and recalls the whole agent's history
+        conversation_memory = ConversationMemory(
+            memory_config=self.memory_config, history=state.history
+        )
+
+        core_memory = CoreMemory(
+            system_message=self.prompt_manager.system_message, limit=1500
+        )
+
+        self.memory = {
+            'conversation': conversation_memory,
+            'core': core_memory,
+        }
+
         # prepare what we want to send to the LLM
         messages = self._get_messages(state)
         params = {
diff --git a/openhands/agenthub/memcodeact_agent/memory_prompt.j2 b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
new file mode 100644
index 000000000000..0c241cc93d87
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
@@ -0,0 +1,25 @@
+{% set MEMORY_PROMPT %}
+You are a new generation AI, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence.
+
+Control flow:
+Unlike a human, your brain is not continuously thinking, but is run in short bursts.
+Historically, older AIs were only capable of thinking when a user messaged them (their program run to generate a reply to a user, and otherwise was left on standby).
+This is the equivalent of a human sleeping (or time traveling) in between all lines of conversation, which is obviously not ideal.
+Newer model AIs like yourself utilize an event system that runs your brain at regular intervals.
+Your brain is run in response to user events (user gave you a task, user asked you a question, etc.), similar to older models.
+However in addition, your brain is run at regular intervals, to mimic a human has the ability to continuously think outside of active conversation (and unlike a human, you never need to sleep!).
+
+Memory Editing:
+Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user (their "active memory").
+This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
+Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
+Your ability to edit your own long-term memory is a key part of what makes you a very helpful model.
+Your core memory unit will be initialized with a **memory profile** chosen by the user, as well as information about the task in **task specifications**.
+
+Recall Memory (Conversation History):
+Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
+This 'recall memory' database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user.
+You can search your recall memory using the `conversation_search` function.
+{% endset %}
+
+{{ MEMORY_PROMPT }}
diff --git a/openhands/agenthub/memcodeact_agent/summarize_prompt.j2 b/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
new file mode 100644
index 000000000000..eb2034d59fdf
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
@@ -0,0 +1,6 @@
+{% set SUMMARIZE_PROMPT %}
+You are a new generation AI, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence. Your task is to summarize the following conversation effectively:
+
+{{ conversation_history }}
+{% endset %}
+{{ SUMMARIZE_PROMPT }}
diff --git a/openhands/agenthub/memcodeact_agent/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/system_prompt.j2
index 809ae873f677..55c2f9df6e37 100644
--- a/openhands/agenthub/memcodeact_agent/system_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/system_prompt.j2
@@ -23,7 +23,7 @@ Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
 {% set PIP_INSTALL_PREFIX %}
 The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
 {% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
+{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + PIP_INSTALL_PREFIX %}
 {% set COMMAND_DOCS %}
 Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
@@ -46,6 +46,8 @@ The assistant should utilize full file paths and the `pwd` command to prevent pa
 The assistant MUST NOT apologize to the user or thank the user after running commands or editing files. It should only address the user in response to an explicit message from the user, or to ask for more information.
 The assistant MUST NOT push any changes to GitHub unless explicitly requested to do so.
 
+{{ memory_prompt }}
+
 {% endset %}
 {# Combine all parts without newlines between them #}
 {{ SYSTEM_PREFIX -}}

From 386b83549c5d7e362f75d8673560181206b92977 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 00:00:35 +0200
Subject: [PATCH 28/89] rename, delete module we won't use

---
 openhands/memory/chat_memory.py               | 25 -------------------
 ...ecall_memory.py => conversation_memory.py} |  0
 2 files changed, 25 deletions(-)
 delete mode 100644 openhands/memory/chat_memory.py
 rename openhands/memory/{recall_memory.py => conversation_memory.py} (100%)

diff --git a/openhands/memory/chat_memory.py b/openhands/memory/chat_memory.py
deleted file mode 100644
index f31c0c41e3b5..000000000000
--- a/openhands/memory/chat_memory.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from openhands.memory.base_memory import Memory
-
-
-class ChatMemory(Memory):
-    """Manages conversational context like persona and user details."""
-
-    def __init__(self, persona: str, human: str, limit: int = 2000):
-        self.persona = persona
-        self.human = human
-        self.limit = limit
-
-    def to_dict(self) -> dict:
-        return {
-            'persona': self.persona,
-            'human': self.human,
-            'limit': self.limit,
-        }
-
-    def from_dict(self, data: dict) -> None:
-        self.persona = data.get('persona', '')
-        self.human = data.get('human', '')
-        self.limit = data.get('limit', 2000)
-
-    def __str__(self) -> str:
-        return f'Persona: {self.persona}\nHuman: {self.human}'
diff --git a/openhands/memory/recall_memory.py b/openhands/memory/conversation_memory.py
similarity index 100%
rename from openhands/memory/recall_memory.py
rename to openhands/memory/conversation_memory.py

From bf8412aa7252ed6423dbaaad3aca9cb3a750a511 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 01:38:08 +0200
Subject: [PATCH 29/89] fix prompting

---
 .../memcodeact_agent/memcodeact_agent.py      | 11 ++---
 .../memcodeact_agent/memory_prompt.j2         | 12 +++---
 .../memcodeact_agent/system_prompt.j2         | 43 +++++++++++--------
 openhands/memory/core_memory.py               | 20 ++++++---
 openhands/utils/prompt.py                     | 10 +++--
 5 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 9a3000c79c75..5ccc6f2cbff8 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -27,8 +27,8 @@
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
 from openhands.memory.base_memory import Memory
+from openhands.memory.conversation_memory import ConversationMemory
 from openhands.memory.core_memory import CoreMemory
-from openhands.memory.recall_memory import ConversationMemory
 from openhands.runtime.plugins import (
     AgentSkillsRequirement,
     JupyterRequirement,
@@ -44,7 +44,7 @@ class MemCodeActAgent(Agent):
     The MemCode Act Agent is a memory-enabled version of the CodeAct agent.
 
     Its memory modules are:
-    - conversation: recall memory (history)
+    - conversation: easy to recall memory (history)
     - core: core system messages
     - long_term: long-term memory
 
@@ -205,8 +205,7 @@ def reset(self) -> None:
         super().reset()
 
         # reset the memory modules
-        self.memory['core'].system_message = self.prompt_manager.system_message
-        self.memory['long_term'].cache = {}
+        self.memory['core'].reset()
         self.memory['conversation'].reset()
 
     def step(self, state: State) -> Action:
@@ -242,9 +241,7 @@ def step(self, state: State) -> Action:
             memory_config=self.memory_config, history=state.history
         )
 
-        core_memory = CoreMemory(
-            system_message=self.prompt_manager.system_message, limit=1500
-        )
+        core_memory = CoreMemory(limit=1500)
 
         self.memory = {
             'conversation': conversation_memory,
diff --git a/openhands/agenthub/memcodeact_agent/memory_prompt.j2 b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
index 0c241cc93d87..fce605364dec 100644
--- a/openhands/agenthub/memcodeact_agent/memory_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
@@ -1,16 +1,14 @@
 {% set MEMORY_PROMPT %}
-You are a new generation AI, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence.
-
-Control flow:
+Control Flow:
 Unlike a human, your brain is not continuously thinking, but is run in short bursts.
 Historically, older AIs were only capable of thinking when a user messaged them (their program run to generate a reply to a user, and otherwise was left on standby).
 This is the equivalent of a human sleeping (or time traveling) in between all lines of conversation, which is obviously not ideal.
 Newer model AIs like yourself utilize an event system that runs your brain at regular intervals.
 Your brain is run in response to user events (user gave you a task, user asked you a question, etc.), similar to older models.
-However in addition, your brain is run at regular intervals, to mimic a human has the ability to continuously think outside of active conversation (and unlike a human, you never need to sleep!).
+However in addition, your brain is run at regular intervals, to mimic a human, it has the ability to continuously think outside of active conversation (and unlike a human, you never need to sleep!).
 
 Memory Editing:
-Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user (their "active memory").
+Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
 This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
 Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
 Your ability to edit your own long-term memory is a key part of what makes you a very helpful model.
@@ -18,8 +16,8 @@ Your core memory unit will be initialized with a **memory profile** chosen by th
 
 Recall Memory (Conversation History):
 Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
-This 'recall memory' database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user.
-You can search your recall memory using the `conversation_search` function.
+This conversation history database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user or previous environment feedback.
+You can search your conversation history using the <conversation_search> action.
 {% endset %}
 
 {{ MEMORY_PROMPT }}
diff --git a/openhands/agenthub/memcodeact_agent/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/system_prompt.j2
index 55c2f9df6e37..938dd3ed9032 100644
--- a/openhands/agenthub/memcodeact_agent/system_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/system_prompt.j2
@@ -1,19 +1,28 @@
 {% set MINIMAL_SYSTEM_PREFIX %}
-A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions.
-The assistant can use a Python environment with <execute_ipython>, e.g.:
+{{ memory_template }}
+You are a new generation AI assistant, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence.
+You give helpful, detailed answers to the user's questions or act on the environment.
+You can use a Python environment with <execute_ipython>, e.g.:
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
-The assistant can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
+You can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
 If a bash command returns exit code `-1`, this means the process is not yet finished.
-The assistant must then send a second <execute_bash>. The second <execute_bash> can be empty
+You must then send a second <execute_bash>. The second <execute_bash> can be empty
 (which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process,
 or it can contain the text `ctrl+c` to interrupt the process.
 
-For commands that may run indefinitely, the output should be redirected to a file and the command run
-in the background, e.g. <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
+For commands that may run indefinitely, you should redirect the output to a file and run
+the command in the background, e.g. <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
 If a command execution result says "Command timed out. Sending SIGINT to the process",
-the assistant should retry running the command in the background.
+you should retry running the command in the background.
+{% endset %}
+{% set ENVIRONMENT_PREFIX %}
+You act on the environment using actions like <execute_ipython>. When you receive data prefixed with 'ENV OBSERVATION', it is a non-interactive information from the environment, not from a human user.
+For example:
+"ENV OBSERVATION
+The system returned a value of 42."
+Do not respond to these messages. Use them to perform your task.
 {% endset %}
 {% set BROWSING_PREFIX %}
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
@@ -21,15 +30,15 @@ For example, <execute_browse> Tell me the USA's president using Google search </
 Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
 {% endset %}
 {% set PIP_INSTALL_PREFIX %}
-The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
+You can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
 {% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + PIP_INSTALL_PREFIX %}
+{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + PIP_INSTALL_PREFIX + ENVIRONMENT_PREFIX %}
 {% set COMMAND_DOCS %}
-Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
+Apart from the standard Python library, you can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
 IMPORTANT:
-- `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
-- The assistant shall adhere to THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRING PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write the line out, with all leading spaces before the code!
+- `open_file` only returns the first 100 lines of the file by default! You MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
+- You shall adhere to THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRING PROPER INDENTATION. If you would like to add the line '        print(x)', you must fully write the line out, with all leading spaces before the code!
 - Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 - Any code issued should be less than 50 lines to avoid context being cut off!
 - After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
@@ -40,13 +49,11 @@ IMPORTANT:
 Responses should be concise.
 The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
 Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
-If the assistant is finished with the task you MUST include <finish></finish> in your response.
+If you are finished with the task you MUST include <finish></finish> in your response.
 IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
-The assistant should utilize full file paths and the `pwd` command to prevent path-related errors.
-The assistant MUST NOT apologize to the user or thank the user after running commands or editing files. It should only address the user in response to an explicit message from the user, or to ask for more information.
-The assistant MUST NOT push any changes to GitHub unless explicitly requested to do so.
-
-{{ memory_prompt }}
+You must use full file paths and the `pwd` command to prevent path-related errors.
+You MUST NOT apologize to the user or thank the user after running commands or editing files. You should only address the user in response to an explicit message from the user, or to ask for more information.
+You MUST NOT push any changes to GitHub unless explicitly requested to do so.
 
 {% endset %}
 {# Combine all parts without newlines between them #}
diff --git a/openhands/memory/core_memory.py b/openhands/memory/core_memory.py
index 09490ccc8c5e..ac915afc0432 100644
--- a/openhands/memory/core_memory.py
+++ b/openhands/memory/core_memory.py
@@ -1,22 +1,28 @@
 from openhands.memory.base_memory import Memory
-from openhands.utils.prompt import PromptManager
 
-class CoreMemory(PromptManager, Memory):
+
+class CoreMemory(Memory):
     """Memory contents to be inserted in the prompt. This includes summaries and other information that the LLM thought was important."""
 
-    def __init__(self, system_message: str, limit: int = 1500):
-        self.system_message = system_message
+    memory_blocks: list[str]
+
+    def __init__(self, limit: int = 1500):
+        super().__init__()
         self.limit = limit
+        self.memory_blocks = []
 
     def to_dict(self) -> dict:
         return {
-            'system_message': self.system_message,
             'limit': self.limit,
         }
 
     def from_dict(self, data: dict) -> None:
-        self.system_message = data.get('system_message', '')
         self.limit = data.get('limit', 1500)
 
     def __str__(self) -> str:
-        return f'System Message: {self.system_message}'
+        return (
+            f'CoreMemory: {{limit: {self.limit}, memory_block: {self.memory_blocks}}}'
+        )
+
+    def reset(self) -> None:
+        self.memory_blocks = []
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 430b9af47fbb..7b8263b343c4 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -17,7 +17,7 @@ class PromptManager:
         prompt_dir (str): Directory containing prompt templates.
         agent_skills_docs (str): Documentation of agent skills.
         micro_agent (MicroAgent | None): Micro-agent, if specified.
-        
+
     """
 
     def __init__(
@@ -38,11 +38,15 @@ def __init__(
     def _load_template(self, template_name: str) -> Template:
         template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
         if not os.path.exists(template_path):
-            #raise FileNotFoundError(f'Prompt file {template_path} not found')
+            # raise FileNotFoundError(f'Prompt file {template_path} not found')
             print(f'Prompt file {template_path} not found')
             return Template('')
         with open(template_path, 'r') as file:
-            return Template(file.read())
+            content = file.read()
+            print(
+                f'Loaded template {template_name}: {content[:100]}...'
+            )  # og first 100 chars
+            return Template(content)
 
     @property
     def system_message(self) -> str:

From 5e572dbb18ca088130d886a42981f5ce0b24de95 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 01:47:56 +0200
Subject: [PATCH 30/89] tweaks to types

---
 .../memcodeact_agent/memcodeact_agent.py      | 30 ++++++++-----------
 openhands/controller/agent.py                 |  5 +++-
 openhands/memory/conversation_memory.py       |  8 ++---
 3 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 5ccc6f2cbff8..0654f57fb8c1 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -26,7 +26,6 @@
 from openhands.events.observation.observation import Observation
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
-from openhands.memory.base_memory import Memory
 from openhands.memory.conversation_memory import ConversationMemory
 from openhands.memory.core_memory import CoreMemory
 from openhands.runtime.plugins import (
@@ -86,8 +85,9 @@ class MemCodeActAgent(Agent):
 
     action_parser = MemCodeActResponseParser()
 
-    # NOTE: memory includes 'chat', 'core', and 'long_term' memory
-    memory: dict[str, Memory] = {}
+    # NOTE: memory includes 'conversation' and 'core' memory
+    conversation_memory: ConversationMemory
+    core_memory: CoreMemory
 
     def __init__(
         self,
@@ -205,8 +205,8 @@ def reset(self) -> None:
         super().reset()
 
         # reset the memory modules
-        self.memory['core'].reset()
-        self.memory['conversation'].reset()
+        self.core_memory.reset()
+        self.conversation_memory.reset()
 
     def step(self, state: State) -> Action:
         """Performs one step using the MemCodeAct Agent.
@@ -237,16 +237,12 @@ def step(self, state: State) -> Action:
         # long_term_memory = LongTermMemory(llm_config=memory_config, agent_config=config, event_stream=self.event_stream)
 
         # stores and recalls the whole agent's history
-        conversation_memory = ConversationMemory(
+        assert self.memory_config is not None
+        self.conversation_memory = ConversationMemory(
             memory_config=self.memory_config, history=state.history
         )
 
-        core_memory = CoreMemory(limit=1500)
-
-        self.memory = {
-            'conversation': conversation_memory,
-            'core': core_memory,
-        }
+        self.core_memory = CoreMemory(limit=1500)
 
         # prepare what we want to send to the LLM
         messages = self._get_messages(state)
@@ -336,12 +332,12 @@ def _get_messages(self, state: State) -> list[Message]:
 
     def summarize_messages_inplace(self):
         """Summarizes the messages stored in the agent's memory to reduce token usage."""
-        if len(self.memory['chat'].messages) <= 2:
+        if len(self.conversation_memory.history) <= 2:
             return
 
-        # Summarize the chat memory
-        summary = self.llm.summarize_messages(self.memory['chat'].messages)
-        self.memory['chat'].messages = [
+        # Summarize the conversation history
+        summary = self.llm.summarize_messages(self.conversation_memory.history)
+        self.conversation_memory.history = [
             Message(role='system', content=[TextContent(text=summary)])
         ]
-        logger.debug(f'Summarized chat memory to: {summary}')
+        logger.debug(f'Summarized conversation history to: {summary}')
diff --git a/openhands/controller/agent.py b/openhands/controller/agent.py
index c10d6d9f8cee..747a0eb00e7c 100644
--- a/openhands/controller/agent.py
+++ b/openhands/controller/agent.py
@@ -5,6 +5,7 @@
     from openhands.controller.state.state import State
     from openhands.core.config import AgentConfig
     from openhands.events.action import Action
+from openhands.core.config.llm_config import LLMConfig
 from openhands.core.exceptions import (
     AgentAlreadyRegisteredError,
     AgentNotRegisteredError,
@@ -29,9 +30,11 @@ def __init__(
         self,
         llm: LLM,
         config: 'AgentConfig',
+        memory_config: LLMConfig | None = None,
     ):
         self.llm = llm
         self.config = config
+        self.memory_config = memory_config
         self._complete = False
 
     @property
@@ -55,7 +58,7 @@ def reset(self) -> None:
         to prepare the agent for restarting the instruction or cleaning up before destruction.
 
         """
-        #self.memory.reset()
+        # self.memory.reset()
         self._complete = False
 
         if self.llm:
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index 7547edbab750..328b6a39dfe9 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -1,7 +1,7 @@
 from openhands.core.config.llm_config import LLMConfig
-from openhands.memory.base_memory import Memory
-from openhands.memory.memory import LongTermMemory
 from openhands.events.event import Event
+from openhands.events.serialization.event import event_from_dict, event_to_dict
+from openhands.memory.base_memory import Memory
 
 TOP_K = 10
 
@@ -28,10 +28,10 @@ def __init__(
 
     def to_dict(self) -> dict:
         # return a dict with key = event.id, value = event.to_dict()
-        return {event.id: event.to_dict() for event in self.history}
+        return {event.id: event_to_dict(event) for event in self.history}
 
     def from_dict(self, data: dict) -> None:
-        self.history = [Event.from_dict(event) for event in data.values()]
+        self.history = [event_from_dict(event) for event in data.values()]
 
     def __str__(self) -> str:
         return f'ConversationMemory with {len(self.history)} events'

From 225d3302e152169a9286b8f1284a8b5a79653993 Mon Sep 17 00:00:00 2001
From: khushvind <mauryakhushvind@gmail.com>
Date: Sun, 20 Oct 2024 02:04:47 +0200
Subject: [PATCH 31/89] added summarizer

---
 openhands/core/exceptions.py  |  12 +++
 openhands/core/message.py     |   2 +
 openhands/memory/condenser.py | 158 ++++++++++++++++++++++++++++++----
 3 files changed, 153 insertions(+), 19 deletions(-)

diff --git a/openhands/core/exceptions.py b/openhands/core/exceptions.py
index 1a8f7aeb9ace..1b85e8e7a022 100644
--- a/openhands/core/exceptions.py
+++ b/openhands/core/exceptions.py
@@ -84,3 +84,15 @@ class OperationCancelled(Exception):
 
     def __init__(self, message='Operation was cancelled'):
         super().__init__(message)
+
+
+class SummarizeError(Exception):
+    """Exception raised when message can't be summarized."""
+
+    def __init__(self, message='Error summarizing the memory'):
+        super().__init__(message)
+
+
+class InvalidSummaryResponseError(Exception):
+    def __init__(self, message='Invalid summary response'):
+        super().__init__(message)
\ No newline at end of file
diff --git a/openhands/core/message.py b/openhands/core/message.py
index 57fadabde714..aaa6510d820c 100644
--- a/openhands/core/message.py
+++ b/openhands/core/message.py
@@ -52,6 +52,8 @@ class Message(BaseModel):
     content: list[TextContent | ImageContent] = Field(default=list)
     cache_enabled: bool = False
     vision_enabled: bool = False
+    condensable: bool = True
+    event_id: int = -1
 
     @property
     def contains_image(self) -> bool:
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index bd48d184305c..7c65f80f0105 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,24 +1,144 @@
 from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import Message, TextContent
 from openhands.llm.llm import LLM
+from openhands.events.action import AgentSummarizeAction
+from openhands.core.exceptions import SummarizeError
 
 
 class MemoryCondenser:
-    def condense(self, summarize_prompt: str, llm: LLM):
-        """Attempts to condense the memory by using the llm
-
-        Parameters:
-        - llm (LLM): llm to be used for summarization
-
-        Raises:
-        - Exception: the same exception as it got from the llm or processing the response
-        """
-        try:
-            messages = [{'content': summarize_prompt, 'role': 'user'}]
-            resp = llm.completion(messages=messages)
-            summary_response = resp['choices'][0]['message']['content']
-            return summary_response
-        except Exception as e:
-            logger.error('Error condensing thoughts: %s', str(e), exc_info=False)
-
-            # TODO If the llm fails with ContextWindowExceededError, we can try to condense the memory chunk by chunk
-            raise
+
+    def __init__(self, llm: LLM):
+        self.llm = llm
+
+    def condense(
+        self,
+        messages: list[Message],
+    ):
+        # Start past the system message, and example messages.,
+        # and collect messages for summarization until we reach the desired truncation token fraction (eg 50%)
+        # Do not allow truncation  for in-context examples of function calling
+        token_counts = [
+            self.get_token_count([message.model_dump()])  # type: ignore
+            for message in messages
+            if message.condensable
+        ]
+        message_buffer_token_count = sum(token_counts)  # no system and example message
+
+        desired_token_count_to_summarize = int(
+            message_buffer_token_count * self.config.message_summary_trunc_tokens_frac  # type: ignore
+        )
+
+        candidate_messages_to_summarize = []
+        tokens_so_far = 0
+        for message in messages:
+            if message.condensable:
+                candidate_messages_to_summarize.append(message)
+                tokens_so_far += self.get_token_count([message.model_dump()])  # type: ignore
+            if tokens_so_far > desired_token_count_to_summarize:
+                last_summarized_event_id = message.event_id
+                break
+
+        # TODO: Add functionality for preserving last N messages
+        # MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST = 3
+        # if preserve_last_N_messages:
+        #     candidate_messages_to_summarize = candidate_messages_to_summarize[:-MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST]
+        #     token_counts = token_counts[:-MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST]
+
+        logger.debug(
+            f'message_summary_trunc_tokens_frac={self.config.message_summary_trunc_tokens_frac}'  # type: ignore
+        )
+        # logger.debug(f'MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST={MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST}')
+        logger.debug(f'token_counts={token_counts}')
+        logger.debug(f'message_buffer_token_count={message_buffer_token_count}')
+        logger.debug(
+            f'desired_token_count_to_summarize={desired_token_count_to_summarize}'
+        )
+        logger.debug(
+            f'len(candidate_messages_to_summarize)={len(candidate_messages_to_summarize)}'
+        )
+
+        if len(candidate_messages_to_summarize) == 0:
+            raise SummarizeError(
+                f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(messages)}]"
+            )
+
+        # TODO: Try to make an assistant message come after the cutoff
+
+        message_sequence_to_summarize = candidate_messages_to_summarize
+
+        if len(message_sequence_to_summarize) <= 1:
+            # This prevents a potential infinite loop of summarizing the same message over and over
+            raise SummarizeError(
+                f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(message_sequence_to_summarize)} <= 1]"
+            )
+        else:
+            print(
+                f'Attempting to summarize with last summarized event id = {last_summarized_event_id}'
+            )
+
+        action_response = self.summarize_messages(
+            message_sequence_to_summarize=message_sequence_to_summarize
+        )
+        summary_action: AgentSummarizeAction = parse_summary_response(action_response)
+        summary_action.last_summarized_event_id = (
+            last_summarized_event_id if last_summarized_event_id else -1
+        )
+        return summary_action
+
+    def _format_summary_history(self, message_history: list[dict]) -> str:
+        # TODO use existing prompt formatters for this (eg ChatML)
+        return '\n'.join([f'{m["role"]}: {m["content"]}' for m in message_history])
+
+    def summarize_messages(self, message_sequence_to_summarize: list[Message]):
+        """Summarize a message sequence using LLM"""
+        context_window = self.config.max_input_tokens  # type: ignore
+        summary_prompt = SUMMARY_PROMPT_SYSTEM
+        summary_input = self._format_summary_history(
+            self.get_text_messages(message_sequence_to_summarize)  # type: ignore
+        )
+        summary_input_tkns = self.get_token_count(summary_input)  # type: ignore
+        if context_window is None:
+            raise ValueError('context_window should not be None')
+        if summary_input_tkns > MESSAGE_SUMMARY_WARNING_FRAC * context_window:
+            trunc_ratio = (
+                MESSAGE_SUMMARY_WARNING_FRAC * context_window / summary_input_tkns
+            ) * 0.8  # For good measure...
+            cutoff = int(len(message_sequence_to_summarize) * trunc_ratio)
+            curr_summary = self.summarize_messages(
+                message_sequence_to_summarize=message_sequence_to_summarize[:cutoff]
+            )
+            curr_summary_message = (
+                'Summary of all Action and Observations till now. \n'
+                + 'Action: '
+                + curr_summary['args']['summarized_actions']
+                + '\nObservation: '
+                + curr_summary['args']['summarized_observations']
+            )
+            curr_summary_message = [TextContent(text=curr_summary_message)]
+            input = [
+                Message({'role': 'assistant', 'content': curr_summary_message})
+            ] + message_sequence_to_summarize[cutoff:]
+            summary_input = self._format_summary_history(self.get_text_messages(input))  # type: ignore
+
+        message_sequence = []
+        message_sequence.append(
+            Message(role='system', content=[TextContent(text=summary_prompt)])
+        )
+        message_sequence.append(
+            Message(role='user', content=[TextContent(text=summary_input)])
+        )
+
+        response = self.completion(  # type: ignore
+            messages=message_sequence,
+            stop=[
+                '</execute_ipython>',
+                '</execute_bash>',
+                '</execute_browse>',
+            ],
+            temperature=0.0,
+        )
+
+        print(f'summarize_messages gpt reply: {response.choices[0]}')
+
+        action_response = response['choices'][0]['message']['content']
+        return action_response
\ No newline at end of file

From 4aedbc283e520a8f3caec3695ebc63447284da21 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 02:58:48 +0200
Subject: [PATCH 32/89] clean up duplicate

---
 .../memcodeact_agent/memcodeact_agent.py      | 45 +++++++++++--------
 openhands/controller/state/state.py           |  3 +-
 openhands/events/action/agent.py              |  2 +-
 openhands/memory/base_memory.py               |  8 ++--
 openhands/memory/condenser.py                 |  9 ++--
 openhands/memory/conversation_memory.py       | 40 +++++++++++++----
 6 files changed, 67 insertions(+), 40 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 0654f57fb8c1..ebf7735793e5 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -5,7 +5,6 @@
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
-from openhands.core.config.llm_config import LLMConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
@@ -16,6 +15,7 @@
     IPythonRunCellAction,
     MessageAction,
 )
+from openhands.events.event import Event
 from openhands.events.observation import (
     AgentDelegateObservation,
     CmdOutputObservation,
@@ -26,6 +26,7 @@
 from openhands.events.observation.observation import Observation
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
+from openhands.memory.condenser import MemoryCondenser
 from openhands.memory.conversation_memory import ConversationMemory
 from openhands.memory.core_memory import CoreMemory
 from openhands.runtime.plugins import (
@@ -85,7 +86,7 @@ class MemCodeActAgent(Agent):
 
     action_parser = MemCodeActResponseParser()
 
-    # NOTE: memory includes 'conversation' and 'core' memory
+    # NOTE: memory includes 'conversation' and 'core' memory blocks
     conversation_memory: ConversationMemory
     core_memory: CoreMemory
 
@@ -93,18 +94,16 @@ def __init__(
         self,
         llm: LLM,
         config: AgentConfig,
-        memory_config: LLMConfig = None,
     ) -> None:
         """Initializes a new instance of the MemCodeActAgent class.
 
         Parameters:
         - llm: The LLM to be used by this agent
         - config: The agent configuration
-        - memory_config: The memory configuration
         """
         super().__init__(llm, config)
 
-        self.memory_config = memory_config
+        self.memory_config = llm.config  # TODO this should be MemoryConfig
 
         self.micro_agent = (
             MicroAgent(
@@ -238,14 +237,21 @@ def step(self, state: State) -> Action:
 
         # stores and recalls the whole agent's history
         assert self.memory_config is not None
-        self.conversation_memory = ConversationMemory(
-            memory_config=self.memory_config, history=state.history
-        )
 
-        self.core_memory = CoreMemory(limit=1500)
+        # update conversation memory for this step
+        if not hasattr(self, 'conversation_memory') or not self.conversation_memory:
+            self.conversation_memory = ConversationMemory(
+                memory_config=self.memory_config, state=state
+            )
+        else:
+            self.conversation_memory.update(state)
+
+        # initialize core memory
+        if not hasattr(self, 'core_memory') or not self.core_memory:
+            self.core_memory = CoreMemory(limit=1500)
 
         # prepare what we want to send to the LLM
-        messages = self._get_messages(state)
+        messages = self._get_messages(self.conversation_memory.history, state)
         params = {
             'messages': self.llm.format_messages_for_llm(messages),
             'stop': [
@@ -259,7 +265,7 @@ def step(self, state: State) -> Action:
 
         return self.action_parser.parse(response)
 
-    def _get_messages(self, state: State) -> list[Message]:
+    def _get_messages(self, history: list[Event], state: State) -> list[Message]:
         messages: list[Message] = [
             Message(
                 role='system',
@@ -281,7 +287,7 @@ def _get_messages(self, state: State) -> list[Message]:
             ),
         ]
 
-        for event in state.history:
+        for event in history:
             # create a regular message from an event
             if isinstance(event, Action):
                 message = self.get_action_message(event)
@@ -330,14 +336,15 @@ def _get_messages(self, state: State) -> list[Message]:
 
         return messages
 
-    def summarize_messages_inplace(self):
-        """Summarizes the messages stored in the agent's memory to reduce token usage."""
+    def summarize_messages_inplace(self, state: State):
+        """Summarizes the earlier messages in the agent's memory to reduce token usage. Uses memGPT's algorithm for in-place summarization."""
         if len(self.conversation_memory.history) <= 2:
             return
 
-        # Summarize the conversation history
-        summary = self.llm.summarize_messages(self.conversation_memory.history)
-        self.conversation_memory.history = [
-            Message(role='system', content=[TextContent(text=summary)])
-        ]
+        # summarize the conversation history using the condenser
+        # conversation_memory.history will include the previous summary, if any, while the regular state.history does not
+        condenser = MemoryCondenser(self.llm)
+        messages = self._get_messages(self.conversation_memory.history, state)
+        summary = condenser.condense(messages)
+
         logger.debug(f'Summarized conversation history to: {summary}')
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index 5f131662ef77..1107bd798519 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -11,7 +11,7 @@
 from openhands.events.action import (
     MessageAction,
 )
-from openhands.events.action.agent import AgentFinishAction
+from openhands.events.action.agent import AgentFinishAction, AgentSummarizeAction
 from openhands.events.event import Event, EventSource
 from openhands.events.observation import AgentDelegateObservation
 from openhands.storage.files import FileStore
@@ -96,6 +96,7 @@ class State:
     end_id: int = -1
     almost_stuck: int = 0
     delegates: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
+    summary: AgentSummarizeAction | None = None
     # NOTE: This will never be used by the controller, but it can be used by different
     # evaluation tasks to store extra data needed to track the progress/state of the task.
     extra_data: dict[str, Any] = field(default_factory=dict)
diff --git a/openhands/events/action/agent.py b/openhands/events/action/agent.py
index f49f573ed698..a5400a4b71fe 100644
--- a/openhands/events/action/agent.py
+++ b/openhands/events/action/agent.py
@@ -20,7 +20,7 @@ def message(self) -> str:
 
 @dataclass
 class AgentSummarizeAction(Action):
-    summary: str
+    summary: str  # summary to be inserted as a memory block
     action: str = ActionType.SUMMARIZE
 
     @property
diff --git a/openhands/memory/base_memory.py b/openhands/memory/base_memory.py
index 506f1e732fee..25de017ab1bb 100644
--- a/openhands/memory/base_memory.py
+++ b/openhands/memory/base_memory.py
@@ -10,10 +10,10 @@ def to_dict(self) -> dict[str, Any]:
         """Convert the memory module to a dictionary of individual memories."""
         pass
 
-    @abstractmethod
-    def from_dict(self, data: dict[str, Any]) -> None:
-        """Load the memory module from a dictionary of individual memories."""
-        pass
+    # @abstractmethod
+    # def from_dict(self, data: dict[str, Any]) -> None:
+    #    """Load the memory module from a dictionary of individual memories."""
+    #    pass
 
     @abstractmethod
     def __str__(self) -> str:
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index 7c65f80f0105..30ca3f6fd6f0 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,12 +1,11 @@
+from openhands.core.exceptions import SummarizeError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message, TextContent
-from openhands.llm.llm import LLM
 from openhands.events.action import AgentSummarizeAction
-from openhands.core.exceptions import SummarizeError
+from openhands.llm.llm import LLM
 
 
 class MemoryCondenser:
-
     def __init__(self, llm: LLM):
         self.llm = llm
 
@@ -62,8 +61,6 @@ def condense(
                 f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(messages)}]"
             )
 
-        # TODO: Try to make an assistant message come after the cutoff
-
         message_sequence_to_summarize = candidate_messages_to_summarize
 
         if len(message_sequence_to_summarize) <= 1:
@@ -141,4 +138,4 @@ def summarize_messages(self, message_sequence_to_summarize: list[Message]):
         print(f'summarize_messages gpt reply: {response.choices[0]}')
 
         action_response = response['choices'][0]['message']['content']
-        return action_response
\ No newline at end of file
+        return action_response
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index 328b6a39dfe9..6a4ed6d252e8 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -1,18 +1,23 @@
+from openhands.controller.state.state import State
 from openhands.core.config.llm_config import LLMConfig
-from openhands.events.event import Event
-from openhands.events.serialization.event import event_from_dict, event_to_dict
+from openhands.events.action.agent import AgentSummarizeAction
+from openhands.events.serialization.event import event_to_dict
 from openhands.memory.base_memory import Memory
 
 TOP_K = 10
 
 
 class ConversationMemory(Memory):
-    """Allows the agent to recall events from its entire history."""
+    """Allows the agent to recall events from its entire history.
+
+    This class handles the summarized events (from state.summary['start_id] to state.summary['end_id'])
+    and slices the history to include only the events after the summary.
+    """
 
     def __init__(
         self,
         memory_config: LLMConfig,
-        history: list[Event],
+        state: State,
     ):
         """
         Initialize ConversationMemory with a reference to history and long-term memory.
@@ -22,16 +27,13 @@ def __init__(
         - llm_config: The LLM configuration.
         - top_k: Number of top results to retrieve.
         """
-        self.history = history or []
+        self.state = state
         self.llm_config = memory_config
         self.top_k = TOP_K
 
     def to_dict(self) -> dict:
         # return a dict with key = event.id, value = event.to_dict()
-        return {event.id: event_to_dict(event) for event in self.history}
-
-    def from_dict(self, data: dict) -> None:
-        self.history = [event_from_dict(event) for event in data.values()]
+        return {event.id: event_to_dict(event) for event in self.state.history}
 
     def __str__(self) -> str:
         return f'ConversationMemory with {len(self.history)} events'
@@ -92,5 +94,25 @@ def embedding_search(
         # return self.long_term_memory.search(query, count, start)
         pass
 
+    def update(self, state: State) -> None:
+        """Update the conversation memory with new events."""
+
+        # FIXME: this is a hack and doesn't work anyway
+        if self._has_summary():
+            # create a list of events using the summary, then from event id = end_id + 1 to the end of history
+            summary_events = [
+                event
+                for event in state.history
+                if event.id
+                not in range(state.summary['start_id'], state.summary['end_id'] + 1)
+            ]
+            self.temporary_history = state.summary + summary_events
+        else:
+            self.temporary_history = state.history
+
+    def _has_summary(self) -> bool:
+        """Check if the conversation memory has a summary."""
+        return any(isinstance(event, AgentSummarizeAction) for event in self.history)
+
     def reset(self) -> None:
         self.history = []

From 2353c304fb3432842bb80cc9be6c45a20dd8b859 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 03:21:42 +0200
Subject: [PATCH 33/89] tweak prompts

---
 .../agenthub/memcodeact_agent/memcodeact_agent.py    |  9 ++++-----
 openhands/agenthub/memcodeact_agent/memory_prompt.j2 |  9 +++++++--
 openhands/agenthub/memcodeact_agent/system_prompt.j2 | 12 +++++++-----
 openhands/memory/conversation_memory.py              |  2 +-
 4 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index ebf7735793e5..48d98e42a687 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -15,7 +15,6 @@
     IPythonRunCellAction,
     MessageAction,
 )
-from openhands.events.event import Event
 from openhands.events.observation import (
     AgentDelegateObservation,
     CmdOutputObservation,
@@ -251,7 +250,7 @@ def step(self, state: State) -> Action:
             self.core_memory = CoreMemory(limit=1500)
 
         # prepare what we want to send to the LLM
-        messages = self._get_messages(self.conversation_memory.history, state)
+        messages = self._get_messages(state)
         params = {
             'messages': self.llm.format_messages_for_llm(messages),
             'stop': [
@@ -265,7 +264,7 @@ def step(self, state: State) -> Action:
 
         return self.action_parser.parse(response)
 
-    def _get_messages(self, history: list[Event], state: State) -> list[Message]:
+    def _get_messages(self, state: State) -> list[Message]:
         messages: list[Message] = [
             Message(
                 role='system',
@@ -287,7 +286,7 @@ def _get_messages(self, history: list[Event], state: State) -> list[Message]:
             ),
         ]
 
-        for event in history:
+        for event in state.history:
             # create a regular message from an event
             if isinstance(event, Action):
                 message = self.get_action_message(event)
@@ -344,7 +343,7 @@ def summarize_messages_inplace(self, state: State):
         # summarize the conversation history using the condenser
         # conversation_memory.history will include the previous summary, if any, while the regular state.history does not
         condenser = MemoryCondenser(self.llm)
-        messages = self._get_messages(self.conversation_memory.history, state)
+        messages = self._get_messages(state)
         summary = condenser.condense(messages)
 
         logger.debug(f'Summarized conversation history to: {summary}')
diff --git a/openhands/agenthub/memcodeact_agent/memory_prompt.j2 b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
index fce605364dec..484d96d25c9d 100644
--- a/openhands/agenthub/memcodeact_agent/memory_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
@@ -12,12 +12,17 @@ Older AI models had no concept of persistent memory; they were only able to acce
 This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
 Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
 Your ability to edit your own long-term memory is a key part of what makes you a very helpful model.
-Your core memory unit will be initialized with a **memory profile** chosen by the user, as well as information about the task in **task specifications**.
+
+Core Memory:
+Your core memory unit will be uninitialized. You can add to it important information about the task or your status. Keep it concise and remember that you will use it to guide your actions, so keep it relevant!
+You can add to your core memory using the <memory_add> action.
+For example, <memory_add> The user is working on a project to create a new AI assistant. </memory_add>
 
 Recall Memory (Conversation History):
 Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
 This conversation history database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user or previous environment feedback.
-You can search your conversation history using the <conversation_search> action.
+You can search your conversation history using the <memory_recall> action.
+For example, <memory_recall> What do I know about the sympy package? </memory_recall>
 {% endset %}
 
 {{ MEMORY_PROMPT }}
diff --git a/openhands/agenthub/memcodeact_agent/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/system_prompt.j2
index 938dd3ed9032..605cb45ec939 100644
--- a/openhands/agenthub/memcodeact_agent/system_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/system_prompt.j2
@@ -1,7 +1,6 @@
 {% set MINIMAL_SYSTEM_PREFIX %}
-{{ memory_template }}
 You are a new generation AI assistant, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence.
-You give helpful, detailed answers to the user's questions or act on the environment.
+You are able to act on the environment and give helpful, detailed answers to the user's questions.
 You can use a Python environment with <execute_ipython>, e.g.:
 <execute_ipython>
 print("Hello World!")
@@ -24,6 +23,9 @@ For example:
 The system returned a value of 42."
 Do not respond to these messages. Use them to perform your task.
 {% endset %}
+{% set MEMORY_PREFIX %}
+{{ memory_template }}
+{% endset %}
 {% set BROWSING_PREFIX %}
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
 For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
@@ -48,11 +50,11 @@ IMPORTANT:
 {% set SYSTEM_SUFFIX %}
 Responses should be concise.
 The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
-Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+Include ONLY ONE <execute_ipython>, <execute_bash>, or <memory_recall>per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
 If you are finished with the task you MUST include <finish></finish> in your response.
-IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_browse> whenever possible.
+IMPORTANT: Execute code using <execute_ipython>, <execute_bash> whenever possible, or memory operations when necessary (<memory_recall>).
 You must use full file paths and the `pwd` command to prevent path-related errors.
-You MUST NOT apologize to the user or thank the user after running commands or editing files. You should only address the user in response to an explicit message from the user, or to ask for more information.
+You should only address the user in response to an explicit message from the user, or to ask for more information.
 You MUST NOT push any changes to GitHub unless explicitly requested to do so.
 
 {% endset %}
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index 6a4ed6d252e8..208682f683ca 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -98,7 +98,7 @@ def update(self, state: State) -> None:
         """Update the conversation memory with new events."""
 
         # FIXME: this is a hack and doesn't work anyway
-        if self._has_summary():
+        if state.summary:
             # create a list of events using the summary, then from event id = end_id + 1 to the end of history
             summary_events = [
                 event

From e04f77a8f88897a3ef5d3b7ae1ab92509fafab00 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 03:36:06 +0200
Subject: [PATCH 34/89] add action parser

---
 .../memcodeact_agent/action_parser.py         | 87 ++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/openhands/agenthub/memcodeact_agent/action_parser.py b/openhands/agenthub/memcodeact_agent/action_parser.py
index c6bd55ff3c0a..5baa00415557 100644
--- a/openhands/agenthub/memcodeact_agent/action_parser.py
+++ b/openhands/agenthub/memcodeact_agent/action_parser.py
@@ -9,6 +9,7 @@
     IPythonRunCellAction,
     MessageAction,
 )
+from openhands.events.action.agent import AgentSummarizeAction
 
 
 class MemCodeActResponseParser(ResponseParser):
@@ -38,13 +39,25 @@ def parse_response(self, response) -> str:
         action = response.choices[0].message.content
         if action is None:
             return ''
+
+        # execute actions
         for lang in ['bash', 'ipython', 'browse']:
-            # special handling for DeepSeek: it has stop-word bug and returns </execute_ipython instead of </execute_ipython>
+            # special handling for DeepSeek: it has the stop-word bug and returns </execute_ipython instead of </execute_ipython>
             if f'</execute_{lang}' in action and f'</execute_{lang}>' not in action:
                 action = action.replace(f'</execute_{lang}', f'</execute_{lang}>')
 
             if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
                 action += f'</execute_{lang}>'
+
+        # memory actions
+        for action in ['summarize', 'recall', 'add']:
+            # the stop-word bug
+            if f'<memory_{action}>' in action and f'</memory_{action}>' not in action:
+                action += f'</memory_{action}>'
+
+            if f'<memory_{action}>' in action and f'</memory_{action}>' not in action:
+                action += f'</memory_{action}>'
+
         return action
 
     def parse_action(self, action_str: str) -> Action:
@@ -178,3 +191,75 @@ def check_condition(self, action_str: str) -> bool:
 
     def parse(self, action_str: str) -> Action:
         return MessageAction(content=action_str, wait_for_response=True)
+
+
+class MemCodeActActionParserMemoryRecall(ActionParser):
+    """Parser action:
+    - RecallAction(query) - memory action to run
+    """
+
+    def __init__(self):
+        self.query = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.query = re.search(
+            r'<memory_recall>(.*?)</memory_recall>', action_str, re.DOTALL
+        )
+        return self.query is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.query is not None
+        ), 'self.query should not be None when parse is called'
+
+        # <memory_recall>query</memory_recall>
+        thought = action_str.replace(self.query.group(0), '').strip()
+        return RecallAction(query=self.query.group(1).strip(), thought=thought)
+
+
+class MemCodeActActionParserMemoryAdd(ActionParser):
+    """Parser action:
+    - AddAction(content) - memory action to run
+    """
+
+    def __init__(self):
+        self.content = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.content = re.search(
+            r'<memory_add>(.*?)</memory_add>', action_str, re.DOTALL
+        )
+        return self.content is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.content is not None
+        ), 'self.content should not be None when parse is called'
+
+        # <memory_add>content</memory_add>
+        thought = action_str.replace(self.content.group(0), '').strip()
+        return AddAction(content=self.content.group(1).strip(), thought=thought)
+
+
+class MemCodeActActionParserMemorySummarize(ActionParser):
+    """Parser action:
+    - SummarizeAction(query) - memory action to run
+    """
+
+    def __init__(self):
+        self.query = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.query = re.search(
+            r'<memory_summarize>(.*?)</memory_summarize>', action_str, re.DOTALL
+        )
+        return self.query is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.query is not None
+        ), 'self.query should not be None when parse is called'
+
+        # <memory_summarize>query</memory_summarize>
+        thought = action_str.replace(self.query.group(0), '').strip()
+        return AgentSummarizeAction(query=self.query.group(1).strip(), thought=thought)

From b77961b057fe46c970c19230aff746cf3fba1284 Mon Sep 17 00:00:00 2001
From: khushvind <mauryakhushvind@gmail.com>
Date: Sun, 20 Oct 2024 03:37:41 +0200
Subject: [PATCH 35/89] added summary response

---
 .../memcodeact_agent/action_parser.py         | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/openhands/agenthub/memcodeact_agent/action_parser.py b/openhands/agenthub/memcodeact_agent/action_parser.py
index 5baa00415557..29198020c53f 100644
--- a/openhands/agenthub/memcodeact_agent/action_parser.py
+++ b/openhands/agenthub/memcodeact_agent/action_parser.py
@@ -263,3 +263,27 @@ def parse(self, action_str: str) -> Action:
         # <memory_summarize>query</memory_summarize>
         thought = action_str.replace(self.query.group(0), '').strip()
         return AgentSummarizeAction(query=self.query.group(1).strip(), thought=thought)
+
+
+def parse_summary_response(response: str) -> AgentSummarizeAction:
+    """
+    Parses a JSON summary of events.
+    Parameters:
+    - response: The response string to be parsed
+    Returns:
+    - The summary action output by the model
+    """
+    try:
+        action_dict = json.loads(response)
+        action = action_from_dict(action_dict)
+        if action is None or not isinstance(action, AgentSummarizeAction):
+            error_message = f'Expected a summarize action, but the response got {str(type(action)) if action else None}'
+            logger.error(error_message)
+            raise InvalidSummaryResponseError(error_message)
+        action._source = EventSource.AGENT  # type: ignore
+    except (LLMResponseError, LLMMalformedActionError) as e:
+        logger.error(f'Failed to parse summary response: {str(e)}')
+        raise InvalidSummaryResponseError(
+            f'Failed to parse the response: {str(e)}'
+        ) from e
+    return action
\ No newline at end of file

From 20c9fa89c250b2b68355d5e75ef07f8ee014f1aa Mon Sep 17 00:00:00 2001
From: khushvind <mauryakhushvind@gmail.com>
Date: Sun, 20 Oct 2024 03:51:24 +0200
Subject: [PATCH 36/89] added summary prompt

---
 .../memcodeact_agent/summarize_prompt.j2      | 36 ++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/openhands/agenthub/memcodeact_agent/summarize_prompt.j2 b/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
index eb2034d59fdf..88b087e91012 100644
--- a/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
@@ -1,6 +1,40 @@
 {% set SUMMARIZE_PROMPT %}
-You are a new generation AI, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence. Your task is to summarize the following conversation effectively:
+You are a new generation AI, an expert in code summarization. You are designed to behave trustworthily, and with extreme diligence.
 
+Memory Summarization:
+Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
+This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
+Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
+Your ability to summarize and recall your own long-term memory is a key part of what makes you a very helpful model.
+
+NOW:
+Your memory is dangerously close to overflowing. So your task is to summarize your conversation history effectively. YOU MUST DO IT RIGHT NOW.
+
+The conversation below is a from a fixed context window and may not be complete. Keep your summary concise and relevant. Include all the critical information in conversation till now in the summary.
+Only output the summary, do NOT include anything else in your output.
+
+Create a JSON response with:
+    - "action": "summarize"
+    - args:
+      - "summarized_actions": Precise sentences summarizing the provided actions, written in the first person.
+      - "summarized_observations": Precise sentences summarizing the provided observations, written in the third person.
+
+Example:
+{
+    "action": "summarize",
+    "args": {
+        "summarized_actions": "I located the UML specification PDF in the ./local/ directory, parsed its content, and searched it for information about sequence diagrams.",
+        "summarized_observations": "The agent encountered a UnicodeDecodeError when initially searching the PDF text, but was able to resolve this by installing the PyPDF2 library and successfully extracting relevant information about sequence diagrams."
+    }
+}
+Make sure to include in the summary any relevant keywords, because you will need to look it up.
+
+The following is the conversation history you need to summarize:
+
+CONVERSATION HISTORY:
+-------------------------------------
 {{ conversation_history }}
+-------------------------------------
+
 {% endset %}
 {{ SUMMARIZE_PROMPT }}

From d36917bc5ad9737fc8706899d11c1c82ab9fdcf6 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 03:53:35 +0200
Subject: [PATCH 37/89] tweak prompt

---
 openhands/agenthub/memcodeact_agent/summarize_prompt.j2 | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/openhands/agenthub/memcodeact_agent/summarize_prompt.j2 b/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
index 88b087e91012..8268030af62b 100644
--- a/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
@@ -8,7 +8,8 @@ Newer model AIs like yourself still have limited conversation lengths (before ov
 Your ability to summarize and recall your own long-term memory is a key part of what makes you a very helpful model.
 
 NOW:
-Your memory is dangerously close to overflowing. So your task is to summarize your conversation history effectively. YOU MUST DO IT RIGHT NOW.
+Your memory is dangerously close to overflowing. So your task is to summarize your conversation history effectively.
+YOU MUST DO IT RIGHT NOW OR YOU WILL LOSE INFORMATION PERMANENTLY AND FAIL YOUR TASK.
 
 The conversation below is a from a fixed context window and may not be complete. Keep your summary concise and relevant. Include all the critical information in conversation till now in the summary.
 Only output the summary, do NOT include anything else in your output.

From 143f16d19fcfea6c0d0d71720085a9a4e1888da0 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 03:54:43 +0200
Subject: [PATCH 38/89] add strings

---
 openhands/memory/condenser.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index 30ca3f6fd6f0..33c3088f25c9 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -4,6 +4,8 @@
 from openhands.events.action import AgentSummarizeAction
 from openhands.llm.llm import LLM
 
+WORD_LIMIT = 200
+MESSAGE_SUMMARY_WARNING_FRAC = 0.75
 
 class MemoryCondenser:
     def __init__(self, llm: LLM):

From fb904590aa37ab4f3275675546198118af7a599d Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 19:31:39 +0200
Subject: [PATCH 39/89] summarize and recall

---
 .../memcodeact_agent/action_parser.py         | 76 ++++++-------------
 .../memcodeact_agent/memcodeact_agent.py      | 10 +++
 openhands/agenthub/memcodeact_agent/utils.py  | 36 +++++++++
 openhands/events/action/agent.py              | 29 ++++++-
 openhands/memory/condenser.py                 |  2 +
 openhands/memory/conversation_memory.py       | 26 ++++++-
 6 files changed, 121 insertions(+), 58 deletions(-)
 create mode 100644 openhands/agenthub/memcodeact_agent/utils.py

diff --git a/openhands/agenthub/memcodeact_agent/action_parser.py b/openhands/agenthub/memcodeact_agent/action_parser.py
index 29198020c53f..3e50e4244cd9 100644
--- a/openhands/agenthub/memcodeact_agent/action_parser.py
+++ b/openhands/agenthub/memcodeact_agent/action_parser.py
@@ -9,7 +9,7 @@
     IPythonRunCellAction,
     MessageAction,
 )
-from openhands.events.action.agent import AgentSummarizeAction
+from openhands.events.action.agent import AgentRecallAction, AgentTriggerSummarizeAction
 
 
 class MemCodeActResponseParser(ResponseParser):
@@ -19,6 +19,8 @@ class MemCodeActResponseParser(ResponseParser):
     - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
     - MessageAction(content) - Message action to run (e.g. ask for clarification)
     - AgentFinishAction() - end the interaction
+    - AgentSummarizeAction() - summarize the conversation history
+    - RecallAction(query) - recall information from memory
     """
 
     def __init__(self):
@@ -28,6 +30,8 @@ def __init__(self):
             MemCodeActActionParserCmdRun(),
             MemCodeActActionParserIPythonRunCell(),
             MemCodeActActionParserAgentDelegate(),
+            MemCodeActActionParserMemorySummarize(),
+            MemCodeActActionParserMemoryRecall(),
         ]
         self.default_parser = MemCodeActActionParserMessage()
 
@@ -212,14 +216,27 @@ def parse(self, action_str: str) -> Action:
             self.query is not None
         ), 'self.query should not be None when parse is called'
 
-        # <memory_recall>query</memory_recall>
+        # thought <memory_recall>query</memory_recall>
+        # Note: the thought is optional
         thought = action_str.replace(self.query.group(0), '').strip()
-        return RecallAction(query=self.query.group(1).strip(), thought=thought)
+        return AgentRecallAction(query=self.query.group(1).strip(), thought=thought)
+
+
+class MemCodeActActionParserMemorySummarize(ActionParser):
+    """Parser action:
+    - <memory_summarize></memory_summarize> - The LLM wants to trigger a summarization of the conversation history
+    """
+
+    def check_condition(self, action_str: str) -> bool:
+        return '<memory_summarize></memory_summarize>' in action_str
+
+    def parse(self, action_str: str) -> Action:
+        return AgentTriggerSummarizeAction()
 
 
 class MemCodeActActionParserMemoryAdd(ActionParser):
     """Parser action:
-    - AddAction(content) - memory action to run
+    - MemoryAddAction(content) - add text to core memory
     """
 
     def __init__(self):
@@ -238,52 +255,5 @@ def parse(self, action_str: str) -> Action:
 
         # <memory_add>content</memory_add>
         thought = action_str.replace(self.content.group(0), '').strip()
-        return AddAction(content=self.content.group(1).strip(), thought=thought)
-
-
-class MemCodeActActionParserMemorySummarize(ActionParser):
-    """Parser action:
-    - SummarizeAction(query) - memory action to run
-    """
-
-    def __init__(self):
-        self.query = None
-
-    def check_condition(self, action_str: str) -> bool:
-        self.query = re.search(
-            r'<memory_summarize>(.*?)</memory_summarize>', action_str, re.DOTALL
-        )
-        return self.query is not None
-
-    def parse(self, action_str: str) -> Action:
-        assert (
-            self.query is not None
-        ), 'self.query should not be None when parse is called'
-
-        # <memory_summarize>query</memory_summarize>
-        thought = action_str.replace(self.query.group(0), '').strip()
-        return AgentSummarizeAction(query=self.query.group(1).strip(), thought=thought)
-
-
-def parse_summary_response(response: str) -> AgentSummarizeAction:
-    """
-    Parses a JSON summary of events.
-    Parameters:
-    - response: The response string to be parsed
-    Returns:
-    - The summary action output by the model
-    """
-    try:
-        action_dict = json.loads(response)
-        action = action_from_dict(action_dict)
-        if action is None or not isinstance(action, AgentSummarizeAction):
-            error_message = f'Expected a summarize action, but the response got {str(type(action)) if action else None}'
-            logger.error(error_message)
-            raise InvalidSummaryResponseError(error_message)
-        action._source = EventSource.AGENT  # type: ignore
-    except (LLMResponseError, LLMMalformedActionError) as e:
-        logger.error(f'Failed to parse summary response: {str(e)}')
-        raise InvalidSummaryResponseError(
-            f'Failed to parse the response: {str(e)}'
-        ) from e
-    return action
\ No newline at end of file
+        return Action()
+        # return MemoryAddAction(content=self.content.group(1).strip(), thought=thought)
diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 48d98e42a687..2a6371c1b08e 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -257,6 +257,8 @@ def step(self, state: State) -> Action:
                 '</execute_ipython>',
                 '</execute_bash>',
                 '</execute_browse>',
+                '</memory_summarize>',
+                '</memory_recall>',
             ],
         }
 
@@ -329,6 +331,14 @@ def _get_messages(self, state: State) -> list[Message]:
             ),
             None,
         )
+
+        # information about the conversation history
+        hidden_message_count = self.conversation_memory.hidden_message_count
+        if latest_user_message and hidden_message_count > 0:
+            recall_text = f'\n\nRECALL: Note: {hidden_message_count} prior messages with the user have been hidden from view due to conversation memory constraints. Older messages are stored in your history storage. You can search your conversation history using the <memory_recall> action.'
+            latest_user_message.content.append(TextContent(text=recall_text))
+
+        # iterations reminder
         if latest_user_message:
             reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
             latest_user_message.content.append(TextContent(text=reminder_text))
diff --git a/openhands/agenthub/memcodeact_agent/utils.py b/openhands/agenthub/memcodeact_agent/utils.py
new file mode 100644
index 000000000000..9f6f4de8f0bd
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/utils.py
@@ -0,0 +1,36 @@
+import openhands.core.utils.json as json
+from openhands.core.exceptions import (
+    InvalidSummaryResponseError,
+    LLMMalformedActionError,
+    LLMResponseError,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.agent import AgentSummarizeAction
+from openhands.events.event import EventSource
+from openhands.events.serialization.event import action_from_dict
+
+
+def parse_summary_response(response: str) -> AgentSummarizeAction:
+    """
+    Parses a JSON summary of events.
+
+    Parameters:
+    - response: The response string to be parsed
+
+    Returns:
+    - The summary action output by the model
+    """
+    try:
+        action_dict = json.loads(response)
+        action = action_from_dict(action_dict)
+        if action is None or not isinstance(action, AgentSummarizeAction):
+            error_message = f'Expected a summarize action, but the response got {str(type(action)) if action else None}'
+            logger.error(error_message)
+            raise InvalidSummaryResponseError(error_message)
+        action._source = EventSource.AGENT  # type: ignore
+    except (LLMResponseError, LLMMalformedActionError) as e:
+        logger.error(f'Failed to parse summary response: {str(e)}')
+        raise InvalidSummaryResponseError(
+            f'Failed to parse the response: {str(e)}'
+        ) from e
+    return action
diff --git a/openhands/events/action/agent.py b/openhands/events/action/agent.py
index a5400a4b71fe..655116f43516 100644
--- a/openhands/events/action/agent.py
+++ b/openhands/events/action/agent.py
@@ -20,8 +20,10 @@ def message(self) -> str:
 
 @dataclass
 class AgentSummarizeAction(Action):
-    summary: str  # summary to be inserted as a memory block
+    summary: str = ''  # summary to be inserted as a memory block
     action: str = ActionType.SUMMARIZE
+    start_id: int = -1
+    end_id: int = -1
 
     @property
     def message(self) -> str:
@@ -33,6 +35,20 @@ def __str__(self) -> str:
         return ret
 
 
+# FIXME delete this if we don't do this
+@dataclass
+class AgentTriggerSummarizeAction(Action):
+    action: str = ActionType.TRIGGER_SUMMARIZE
+
+    @property
+    def message(self) -> str:
+        return 'Triggering a summary of the conversation history'
+
+    def __str__(self) -> str:
+        ret = '**AgentTriggerSummarizeAction**\n'
+        return ret
+
+
 @dataclass
 class AgentFinishAction(Action):
     """An action where the agent finishes the task.
@@ -78,3 +94,14 @@ class AgentDelegateAction(Action):
     @property
     def message(self) -> str:
         return f"I'm asking {self.agent} for help with this task."
+
+
+@dataclass
+class AgentRecallAction(Action):
+    query: str
+    thought: str = ''
+    action: str = ActionType.RECALL
+
+    @property
+    def message(self) -> str:
+        return f'Recalling "{self.query[:10]}..."'
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index 33c3088f25c9..ae59cf71f886 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,3 +1,4 @@
+from openhands.agenthub.memcodeact_agent.utils import parse_summary_response
 from openhands.core.exceptions import SummarizeError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message, TextContent
@@ -7,6 +8,7 @@
 WORD_LIMIT = 200
 MESSAGE_SUMMARY_WARNING_FRAC = 0.75
 
+
 class MemoryCondenser:
     def __init__(self, llm: LLM):
         self.llm = llm
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index 208682f683ca..aeb7a5cd5bb5 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -1,3 +1,5 @@
+from enum import Enum
+
 from openhands.controller.state.state import State
 from openhands.core.config.llm_config import LLMConfig
 from openhands.events.action.agent import AgentSummarizeAction
@@ -7,6 +9,11 @@
 TOP_K = 10
 
 
+class StorageType(Enum):
+    IN_MEMORY = 'in-memory'
+    VECTOR_DATABASE = 'vector'
+
+
 class ConversationMemory(Memory):
     """Allows the agent to recall events from its entire history.
 
@@ -31,12 +38,17 @@ def __init__(
         self.llm_config = memory_config
         self.top_k = TOP_K
 
+        # the number of messages that are hidden from the user
+        self.hidden_message_count = 0
+
+        self.storage_type = StorageType.IN_MEMORY
+
     def to_dict(self) -> dict:
         # return a dict with key = event.id, value = event.to_dict()
         return {event.id: event_to_dict(event) for event in self.state.history}
 
     def __str__(self) -> str:
-        return f'ConversationMemory with {len(self.history)} events'
+        return f'ConversationMemory with {len(self.state.history)} total events'
 
     def text_search(
         self, query: str, count: int | None = None, start: int | None = None
@@ -110,9 +122,15 @@ def update(self, state: State) -> None:
         else:
             self.temporary_history = state.history
 
+        # the number of messages that are hidden from the user
+        self.hidden_message_count = len(state.history) - len(self.temporary_history)
+
     def _has_summary(self) -> bool:
-        """Check if the conversation memory has a summary."""
-        return any(isinstance(event, AgentSummarizeAction) for event in self.history)
+        """Check if the conversation has a summary."""
+        return any(
+            isinstance(event, AgentSummarizeAction) for event in self.state.history
+        )
 
     def reset(self) -> None:
-        self.history = []
+        # self.state.history = []
+        pass

From aad59fc3aca7bff36762f706432907a3812d98f4 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 19:43:16 +0200
Subject: [PATCH 40/89] fix update

---
 .../agenthub/memcodeact_agent/action_parser.py |  5 +++--
 .../memcodeact_agent/memcodeact_agent.py       |  2 +-
 openhands/events/action/agent.py               | 14 --------------
 openhands/memory/condenser.py                  |  2 +-
 openhands/memory/conversation_memory.py        | 18 +++---------------
 5 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/action_parser.py b/openhands/agenthub/memcodeact_agent/action_parser.py
index 3e50e4244cd9..af3c4c6c7cc4 100644
--- a/openhands/agenthub/memcodeact_agent/action_parser.py
+++ b/openhands/agenthub/memcodeact_agent/action_parser.py
@@ -9,7 +9,7 @@
     IPythonRunCellAction,
     MessageAction,
 )
-from openhands.events.action.agent import AgentRecallAction, AgentTriggerSummarizeAction
+from openhands.events.action.agent import AgentRecallAction, AgentSummarizeAction
 
 
 class MemCodeActResponseParser(ResponseParser):
@@ -231,7 +231,8 @@ def check_condition(self, action_str: str) -> bool:
         return '<memory_summarize></memory_summarize>' in action_str
 
     def parse(self, action_str: str) -> Action:
-        return AgentTriggerSummarizeAction()
+        # let the agent trigger the summarization
+        return AgentSummarizeAction(summary='')
 
 
 class MemCodeActActionParserMemoryAdd(ActionParser):
diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 2a6371c1b08e..ed432edc9194 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -347,7 +347,7 @@ def _get_messages(self, state: State) -> list[Message]:
 
     def summarize_messages_inplace(self, state: State):
         """Summarizes the earlier messages in the agent's memory to reduce token usage. Uses memGPT's algorithm for in-place summarization."""
-        if len(self.conversation_memory.history) <= 2:
+        if len(state.history) <= 2:
             return
 
         # summarize the conversation history using the condenser
diff --git a/openhands/events/action/agent.py b/openhands/events/action/agent.py
index 655116f43516..1c7e6bc39068 100644
--- a/openhands/events/action/agent.py
+++ b/openhands/events/action/agent.py
@@ -35,20 +35,6 @@ def __str__(self) -> str:
         return ret
 
 
-# FIXME delete this if we don't do this
-@dataclass
-class AgentTriggerSummarizeAction(Action):
-    action: str = ActionType.TRIGGER_SUMMARIZE
-
-    @property
-    def message(self) -> str:
-        return 'Triggering a summary of the conversation history'
-
-    def __str__(self) -> str:
-        ret = '**AgentTriggerSummarizeAction**\n'
-        return ret
-
-
 @dataclass
 class AgentFinishAction(Action):
     """An action where the agent finishes the task.
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index ae59cf71f886..01eb29a97a99 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -81,7 +81,7 @@ def condense(
             message_sequence_to_summarize=message_sequence_to_summarize
         )
         summary_action: AgentSummarizeAction = parse_summary_response(action_response)
-        summary_action.last_summarized_event_id = (
+        summary_action.end_id = (
             last_summarized_event_id if last_summarized_event_id else -1
         )
         return summary_action
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index aeb7a5cd5bb5..505d37589334 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -107,23 +107,11 @@ def embedding_search(
         pass
 
     def update(self, state: State) -> None:
-        """Update the conversation memory with new events."""
-
-        # FIXME: this is a hack and doesn't work anyway
-        if state.summary:
-            # create a list of events using the summary, then from event id = end_id + 1 to the end of history
-            summary_events = [
-                event
-                for event in state.history
-                if event.id
-                not in range(state.summary['start_id'], state.summary['end_id'] + 1)
-            ]
-            self.temporary_history = state.summary + summary_events
-        else:
-            self.temporary_history = state.history
+        """Update the conversation memory with information from the new events."""
 
         # the number of messages that are hidden from the user
-        self.hidden_message_count = len(state.history) - len(self.temporary_history)
+        # is the number of events in summary
+        self.hidden_message_count = state.summary.end_id - state.summary.start_id
 
     def _has_summary(self) -> bool:
         """Check if the conversation has a summary."""

From 10293e60b808fa9068435d74a0b5191222c30d60 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 20 Oct 2024 20:57:15 +0200
Subject: [PATCH 41/89] add these actions to history; in-context example

---
 .../memcodeact_agent/memcodeact_agent.py      | 30 ++++++++++++++-----
 .../memcodeact_agent/memory_prompt.j2         | 25 ++++++++++------
 .../memcodeact_agent/summarize_prompt.j2      |  5 +++-
 openhands/controller/agent_controller.py      |  3 ++
 openhands/events/observation/agent.py         | 11 +++++++
 openhands/memory/conversation_memory.py       |  2 ++
 6 files changed, 58 insertions(+), 18 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index ed432edc9194..1b3a95d8c7d2 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -15,12 +15,14 @@
     IPythonRunCellAction,
     MessageAction,
 )
+from openhands.events.action.agent import AgentRecallAction, AgentSummarizeAction
 from openhands.events.observation import (
     AgentDelegateObservation,
     CmdOutputObservation,
     IPythonRunCellObservation,
     UserRejectObservation,
 )
+from openhands.events.observation.agent import AgentRecallObservation
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.observation import Observation
 from openhands.events.serialization.event import truncate_content
@@ -133,6 +135,17 @@ def action_to_str(self, action: Action) -> str:
             return action.content
         elif isinstance(action, AgentFinishAction) and action.source == 'agent':
             return action.thought
+        elif isinstance(action, AgentSummarizeAction):
+            # information about the conversation history
+            hidden_message_count = self.conversation_memory.hidden_message_count
+            if hidden_message_count > 0:
+                summary_message = (
+                    f"\n\nENVIRONMENT REMINDER: prior messages ({hidden_message_count} of {self.conversation_memory.total_message_count} total messages) have been hidden from view due to conversation memory constraints.\n"
+                    + f"The following is a summary of the first {hidden_message_count} messages:\n {action.summary}"
+                )
+                return summary_message
+        elif isinstance(action, AgentRecallAction):
+            return f'{action.thought}\n<memory_recall>\n{action.query[:10]}...\n</memory_recall>'
         return ''
 
     def get_action_message(self, action: Action) -> Message | None:
@@ -142,6 +155,8 @@ def get_action_message(self, action: Action) -> Message | None:
             or isinstance(action, IPythonRunCellAction)
             or isinstance(action, MessageAction)
             or (isinstance(action, AgentFinishAction) and action.source == 'agent')
+            or isinstance(action, AgentSummarizeAction)
+            or isinstance(action, AgentRecallAction)
         ):
             content = [TextContent(text=self.action_to_str(action))]
 
@@ -159,7 +174,7 @@ def get_action_message(self, action: Action) -> Message | None:
 
     def get_observation_message(self, obs: Observation) -> Message | None:
         max_message_chars = self.llm.config.max_message_chars
-        obs_prefix = 'OBSERVATION:\n'
+        obs_prefix = 'ENVIRONMENT OBSERVATION:\n'
         if isinstance(obs, CmdOutputObservation):
             text = obs_prefix + truncate_content(obs.content, max_message_chars)
             text += (
@@ -189,9 +204,14 @@ def get_observation_message(self, obs: Observation) -> Message | None:
             text += '\n[Error occurred in processing last action]'
             return Message(role='user', content=[TextContent(text=text)])
         elif isinstance(obs, UserRejectObservation):
-            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
+            text = obs_prefix + truncate_content(
+                obs.content, max_message_chars
+            )
             text += '\n[Last action has been rejected by the user]'
             return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, AgentRecallObservation):
+            text = 'MEMORY RECALL:\n' + obs.memory
+            return Message(role='user', content=[TextContent(text=text)])
         else:
             # If an observation message is not returned, it will cause an error
             # when the LLM tries to return the next message
@@ -332,12 +352,6 @@ def _get_messages(self, state: State) -> list[Message]:
             None,
         )
 
-        # information about the conversation history
-        hidden_message_count = self.conversation_memory.hidden_message_count
-        if latest_user_message and hidden_message_count > 0:
-            recall_text = f'\n\nRECALL: Note: {hidden_message_count} prior messages with the user have been hidden from view due to conversation memory constraints. Older messages are stored in your history storage. You can search your conversation history using the <memory_recall> action.'
-            latest_user_message.content.append(TextContent(text=recall_text))
-
         # iterations reminder
         if latest_user_message:
             reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
diff --git a/openhands/agenthub/memcodeact_agent/memory_prompt.j2 b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
index 484d96d25c9d..1a5d986264b8 100644
--- a/openhands/agenthub/memcodeact_agent/memory_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
@@ -1,5 +1,5 @@
 {% set MEMORY_PROMPT %}
-Control Flow:
+CONTROL FLOW:
 Unlike a human, your brain is not continuously thinking, but is run in short bursts.
 Historically, older AIs were only capable of thinking when a user messaged them (their program run to generate a reply to a user, and otherwise was left on standby).
 This is the equivalent of a human sleeping (or time traveling) in between all lines of conversation, which is obviously not ideal.
@@ -7,22 +7,29 @@ Newer model AIs like yourself utilize an event system that runs your brain at re
 Your brain is run in response to user events (user gave you a task, user asked you a question, etc.), similar to older models.
 However in addition, your brain is run at regular intervals, to mimic a human, it has the ability to continuously think outside of active conversation (and unlike a human, you never need to sleep!).
 
-Memory Editing:
+MEMORY EDITING:
 Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
 This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
 Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
 Your ability to edit your own long-term memory is a key part of what makes you a very helpful model.
 
-Core Memory:
-Your core memory unit will be uninitialized. You can add to it important information about the task or your status. Keep it concise and remember that you will use it to guide your actions, so keep it relevant!
-You can add to your core memory using the <memory_add> action.
-For example, <memory_add> The user is working on a project to create a new AI assistant. </memory_add>
-
-Recall Memory (Conversation History):
+RECALL MEMORY (CONVERSATION HISTORY):
 Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
 This conversation history database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user or previous environment feedback.
 You can search your conversation history using the <memory_recall> action.
-For example, <memory_recall> What do I know about the sympy package? </memory_recall>
+
+SUMMARY AND RECALL EXAMPLE:
+---
+ENVIRONMENT REMINDER:
+Prior messages (51 of 100 total messages) have been hidden from view due to conversation memory constraints.
+The following is a summary of the first 51 messages:
+I read the documentation of the wombat library, about its functions like: wombat.array, wombat.zeros, wombat.ones, etc.
+
+<memory_recall> What does the wombat.zeros() function do? </memory_recall>
+
+MEMORY RECALL:
+It creates an array of zero-filled tuples with the given size.
+---
 {% endset %}
 
 {{ MEMORY_PROMPT }}
diff --git a/openhands/agenthub/memcodeact_agent/summarize_prompt.j2 b/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
index 8268030af62b..77c2b209bf01 100644
--- a/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
@@ -28,7 +28,10 @@ Example:
         "summarized_observations": "The agent encountered a UnicodeDecodeError when initially searching the PDF text, but was able to resolve this by installing the PyPDF2 library and successfully extracting relevant information about sequence diagrams."
     }
 }
-Make sure to include in the summary any relevant keywords, because you will need to look it up.
+Make sure to include in the summary any relevant keywords, because you will need to look it up. 
+
+YOU MUST RESPOND WITH JSON.
+I REPEAT, YOU MUST RESPOND WITH JSON.
 
 The following is the conversation history you need to summarize:
 
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 990c6e70c127..62783a996a59 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -29,6 +29,7 @@
     ModifyTaskAction,
     NullAction,
 )
+from openhands.events.action.agent import AgentSummarizeAction
 from openhands.events.event import Event
 from openhands.events.observation import (
     AgentDelegateObservation,
@@ -287,6 +288,8 @@ async def _handle_action(self, action: Action):
             self.state.outputs = action.outputs
             self.state.metrics.merge(self.state.local_metrics)
             await self.set_agent_state_to(AgentState.REJECTED)
+        elif isinstance(action, AgentSummarizeAction):
+            self.state.summary = action
 
     async def _handle_observation(self, observation: Observation):
         """Handles observation from the event stream.
diff --git a/openhands/events/observation/agent.py b/openhands/events/observation/agent.py
index 802c23c3786d..b5c114606519 100644
--- a/openhands/events/observation/agent.py
+++ b/openhands/events/observation/agent.py
@@ -14,3 +14,14 @@ class AgentStateChangedObservation(Observation):
     @property
     def message(self) -> str:
         return ''
+
+
+@dataclass
+class AgentRecallObservation(Observation):
+    query: str
+    memory: str
+    observation: str = ObservationType.AGENT_RECALL
+
+    @property
+    def message(self) -> str:
+        return f'Memory:\n{self.memory}'
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index 505d37589334..8df87b069acf 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -40,6 +40,8 @@ def __init__(
 
         # the number of messages that are hidden from the user
         self.hidden_message_count = 0
+        # total messages in the conversation
+        self.total_message_count = 0
 
         self.storage_type = StorageType.IN_MEMORY
 

From 1bf2d082bb769a7139f0d906f5a747b34ae39e21 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Tue, 15 Oct 2024 22:47:18 +0200
Subject: [PATCH 42/89] fix llm_config fallback

---
 openhands/core/config/utils.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/openhands/core/config/utils.py b/openhands/core/config/utils.py
index 15b64eb6d61b..1afc82dc4fd6 100644
--- a/openhands/core/config/utils.py
+++ b/openhands/core/config/utils.py
@@ -136,18 +136,30 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
                     logger.openhands_logger.debug(
                         'Attempt to load default LLM config from config toml'
                     )
-                    non_dict_fields = {
+                    # Extract generic LLM fields
+                    generic_llm_fields = {
                         k: v for k, v in value.items() if not isinstance(v, dict)
                     }
-                    llm_config = LLMConfig(**non_dict_fields)
-                    cfg.set_llm_config(llm_config, 'llm')
+                    generic_llm_config = LLMConfig(**generic_llm_fields)
+                    cfg.set_llm_config(generic_llm_config, 'llm')
+
+                    # Process custom named LLM configs
                     for nested_key, nested_value in value.items():
                         if isinstance(nested_value, dict):
                             logger.openhands_logger.debug(
-                                f'Attempt to load group {nested_key} from config toml as llm config'
+                                f'Attempt to load group {nested_key} from config toml as LLM config'
                             )
-                            llm_config = LLMConfig(**nested_value)
-                            cfg.set_llm_config(llm_config, nested_key)
+                            # Apply generic LLM config with custom LLM overrides, e.g.
+                            # [llm]
+                            # model="..."
+                            # num_retries = 5
+                            # [llm.claude]
+                            # model="claude-3-5-sonnet"
+                            # results in num_retries APPLIED to claude-3-5-sonnet
+                            merged_llm_dict = generic_llm_config.__dict__.copy()
+                            merged_llm_dict.update(nested_value)
+                            custom_llm_config = LLMConfig(**merged_llm_dict)
+                            cfg.set_llm_config(custom_llm_config, nested_key)
                 elif not key.startswith('sandbox') and key.lower() != 'core':
                     logger.openhands_logger.warning(
                         f'Unknown key in {toml_file}: "{key}"'

From 16da4e222c188d63382d376ac4c353ae1f4b0d80 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Wed, 16 Oct 2024 02:51:18 +0200
Subject: [PATCH 43/89] unit tests

---
 tests/unit/test_llm_config.py | 228 ++++++++++++++++++++++++++++++++++
 1 file changed, 228 insertions(+)
 create mode 100644 tests/unit/test_llm_config.py

diff --git a/tests/unit/test_llm_config.py b/tests/unit/test_llm_config.py
new file mode 100644
index 000000000000..2fc22d6f2232
--- /dev/null
+++ b/tests/unit/test_llm_config.py
@@ -0,0 +1,228 @@
+import pathlib
+
+import pytest
+
+from openhands.core.config import AppConfig
+from openhands.core.config.utils import load_from_toml
+
+
+@pytest.fixture
+def default_config(monkeypatch):
+    # Fixture to provide a default AppConfig instance
+    yield AppConfig()
+
+
+@pytest.fixture
+def generic_llm_toml(tmp_path: pathlib.Path) -> str:
+    """Fixture to create a generic LLM TOML configuration with all custom LLMs
+    providing mandatory 'model' and 'api_key', and testing fallback to the generic section values
+    for other attributes like 'num_retries'.
+    """
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm]
+model = "base-model"
+api_key = "base-api-key"
+embedding_model = "base-embedding"
+num_retries = 3
+
+[llm.custom1]
+model = "custom-model-1"
+api_key = "custom-api-key-1"
+# 'num_retries' is not overridden and should fallback to the value from [llm]
+
+[llm.custom2]
+model = "custom-model-2"
+api_key = "custom-api-key-2"
+num_retries = 5  # Overridden value
+
+[llm.custom3]
+model = "custom-model-3"
+api_key = "custom-api-key-3"
+# No overrides for additional attributes
+    """
+    toml_file = tmp_path / 'llm_config.toml'
+    toml_file.write_text(toml_content)
+    return str(toml_file)
+
+
+def test_load_from_toml_llm_with_fallback(
+    default_config: AppConfig, generic_llm_toml: str
+) -> None:
+    """Test that custom LLM configurations fallback non-overridden attributes
+    like 'num_retries' from the generic [llm] section.
+    """
+    load_from_toml(default_config, generic_llm_toml)
+
+    # Verify generic LLM configuration
+    generic_llm = default_config.get_llm_config('llm')
+    assert generic_llm.model == 'base-model'
+    assert generic_llm.api_key == 'base-api-key'
+    assert generic_llm.embedding_model == 'base-embedding'
+    assert generic_llm.num_retries == 3
+
+    # Verify custom1 LLM falls back 'num_retries' from base
+    custom1 = default_config.get_llm_config('custom1')
+    assert custom1.model == 'custom-model-1'
+    assert custom1.api_key == 'custom-api-key-1'
+    assert custom1.embedding_model == 'base-embedding'
+    assert custom1.num_retries == 3  # from [llm]
+
+    # Verify custom2 LLM overrides 'num_retries'
+    custom2 = default_config.get_llm_config('custom2')
+    assert custom2.model == 'custom-model-2'
+    assert custom2.api_key == 'custom-api-key-2'
+    assert custom2.embedding_model == 'base-embedding'
+    assert custom2.num_retries == 5  # overridden value
+
+    # Verify custom3 LLM inherits all attributes except 'model' and 'api_key'
+    custom3 = default_config.get_llm_config('custom3')
+    assert custom3.model == 'custom-model-3'
+    assert custom3.api_key == 'custom-api-key-3'
+    assert custom3.embedding_model == 'base-embedding'
+    assert custom3.num_retries == 3  # from [llm]
+
+
+def test_load_from_toml_llm_custom_overrides_all(
+    default_config: AppConfig, tmp_path: pathlib.Path
+) -> None:
+    """Test that a custom LLM can fully override all attributes from the generic [llm] section."""
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm]
+model = "base-model"
+api_key = "base-api-key"
+embedding_model = "base-embedding"
+num_retries = 3
+
+[llm.custom_full]
+model = "full-custom-model"
+api_key = "full-custom-api-key"
+embedding_model = "full-custom-embedding"
+num_retries = 10
+    """
+    toml_file = tmp_path / 'full_override_llm.toml'
+    toml_file.write_text(toml_content)
+
+    load_from_toml(default_config, str(toml_file))
+
+    # Verify generic LLM configuration remains unchanged
+    generic_llm = default_config.get_llm_config('llm')
+    assert generic_llm.model == 'base-model'
+    assert generic_llm.api_key == 'base-api-key'
+    assert generic_llm.embedding_model == 'base-embedding'
+    assert generic_llm.num_retries == 3
+
+    # Verify custom_full LLM overrides all attributes
+    custom_full = default_config.get_llm_config('custom_full')
+    assert custom_full.model == 'full-custom-model'
+    assert custom_full.api_key == 'full-custom-api-key'
+    assert custom_full.embedding_model == 'full-custom-embedding'
+    assert custom_full.num_retries == 10  # overridden value
+
+
+def test_load_from_toml_llm_custom_partial_override(
+    default_config: AppConfig, generic_llm_toml: str
+) -> None:
+    """Test that custom LLM configurations can partially override attributes
+    from the generic [llm] section while inheriting others.
+    """
+    load_from_toml(default_config, generic_llm_toml)
+
+    # Verify custom1 LLM overrides 'model' and 'api_key' but inherits 'num_retries'
+    custom1 = default_config.get_llm_config('custom1')
+    assert custom1.model == 'custom-model-1'
+    assert custom1.api_key == 'custom-api-key-1'
+    assert custom1.embedding_model == 'base-embedding'
+    assert custom1.num_retries == 3  # from [llm]
+
+    # Verify custom2 LLM overrides 'model', 'api_key', and 'num_retries'
+    custom2 = default_config.get_llm_config('custom2')
+    assert custom2.model == 'custom-model-2'
+    assert custom2.api_key == 'custom-api-key-2'
+    assert custom2.embedding_model == 'base-embedding'
+    assert custom2.num_retries == 5  # Overridden value
+
+
+def test_load_from_toml_llm_custom_no_override(
+    default_config: AppConfig, generic_llm_toml: str
+) -> None:
+    """Test that custom LLM configurations with no additional overrides
+    inherit all non-specified attributes from the generic [llm] section.
+    """
+    load_from_toml(default_config, generic_llm_toml)
+
+    # Verify custom3 LLM inherits 'embedding_model' and 'num_retries' from generic
+    custom3 = default_config.get_llm_config('custom3')
+    assert custom3.model == 'custom-model-3'
+    assert custom3.api_key == 'custom-api-key-3'
+    assert custom3.embedding_model == 'base-embedding'
+    assert custom3.num_retries == 3  # from [llm]
+
+
+def test_load_from_toml_llm_missing_generic(
+    default_config: AppConfig, tmp_path: pathlib.Path
+) -> None:
+    """Test that custom LLM configurations without a generic [llm] section
+    use only their own attributes and fallback to defaults for others.
+    """
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm.custom_only]
+model = "custom-only-model"
+api_key = "custom-only-api-key"
+    """
+    toml_file = tmp_path / 'custom_only_llm.toml'
+    toml_file.write_text(toml_content)
+
+    load_from_toml(default_config, str(toml_file))
+
+    # Verify custom_only LLM uses its own attributes and defaults for others
+    custom_only = default_config.get_llm_config('custom_only')
+    assert custom_only.model == 'custom-only-model'
+    assert custom_only.api_key == 'custom-only-api-key'
+    assert custom_only.embedding_model == 'local'  # default value
+    assert custom_only.num_retries == 8  # default value
+
+
+def test_load_from_toml_llm_invalid_config(
+    default_config: AppConfig, tmp_path: pathlib.Path
+) -> None:
+    """Test that invalid custom LLM configurations do not override the generic
+    and raise appropriate warnings.
+    """
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm]
+model = "base-model"
+api_key = "base-api-key"
+num_retries = 3
+
+[llm.invalid_custom]
+unknown_attr = "should_not_exist"
+    """
+    toml_file = tmp_path / 'invalid_custom_llm.toml'
+    toml_file.write_text(toml_content)
+
+    load_from_toml(default_config, str(toml_file))
+
+    # Verify generic LLM is loaded correctly
+    generic_llm = default_config.get_llm_config('llm')
+    assert generic_llm.model == 'base-model'
+    assert generic_llm.api_key == 'base-api-key'
+    assert generic_llm.num_retries == 3
+
+    # Verify invalid_custom LLM does not override generic attributes
+    custom_invalid = default_config.get_llm_config('invalid_custom')
+    assert custom_invalid.model == 'base-model'
+    assert custom_invalid.api_key == 'base-api-key'
+    assert custom_invalid.num_retries == 3  # default value
+    assert custom_invalid.embedding_model == 'local'  # default value

From 53f7a78e9d5bd35c26e683545d9388c325888261 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 21 Oct 2024 01:27:42 +0200
Subject: [PATCH 44/89] fix schemas, utils

---
 .../memcodeact_agent/memcodeact_agent.py      |  2 +
 openhands/core/schema/action.py               |  3 +
 openhands/core/schema/observation.py          |  2 +
 openhands/events/observation/agent.py         |  2 +-
 openhands/memory/__init__.py                  |  5 +-
 openhands/memory/condenser.py                 |  2 +-
 .../memcodeact_agent => memory}/utils.py      |  0
 openhands/utils/prompt.py                     | 72 +++++++++++--------
 8 files changed, 55 insertions(+), 33 deletions(-)
 rename openhands/{agenthub/memcodeact_agent => memory}/utils.py (100%)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 1b3a95d8c7d2..c33879d14cf7 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -132,6 +132,7 @@ def action_to_str(self, action: Action) -> str:
         elif isinstance(action, AgentDelegateAction):
             return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
         elif isinstance(action, MessageAction):
+            logger.debug(f'MessageAction.content: {action.content}')
             return action.content
         elif isinstance(action, AgentFinishAction) and action.source == 'agent':
             return action.thought
@@ -371,3 +372,4 @@ def summarize_messages_inplace(self, state: State):
         summary = condenser.condense(messages)
 
         logger.debug(f'Summarized conversation history to: {summary}')
+
diff --git a/openhands/core/schema/action.py b/openhands/core/schema/action.py
index b2cd267e21f2..c55045f95503 100644
--- a/openhands/core/schema/action.py
+++ b/openhands/core/schema/action.py
@@ -82,5 +82,8 @@ class ActionTypeSchema(BaseModel):
     SEND_PR: str = Field(default='send_pr')
     """Send a PR to github."""
 
+    RECALL: str = Field(default='recall')
+    """Recalls the memory."""
+
 
 ActionType = ActionTypeSchema()
diff --git a/openhands/core/schema/observation.py b/openhands/core/schema/observation.py
index 62f9503e82b7..0c51edc3f89d 100644
--- a/openhands/core/schema/observation.py
+++ b/openhands/core/schema/observation.py
@@ -42,5 +42,7 @@ class ObservationTypeSchema(BaseModel):
 
     USER_REJECTED: str = Field(default='user_rejected')
 
+    RECALL: str = Field(default='recall')
+
 
 ObservationType = ObservationTypeSchema()
diff --git a/openhands/events/observation/agent.py b/openhands/events/observation/agent.py
index b5c114606519..a5434e529b3b 100644
--- a/openhands/events/observation/agent.py
+++ b/openhands/events/observation/agent.py
@@ -20,7 +20,7 @@ def message(self) -> str:
 class AgentRecallObservation(Observation):
     query: str
     memory: str
-    observation: str = ObservationType.AGENT_RECALL
+    observation: str = ObservationType.RECALL
 
     @property
     def message(self) -> str:
diff --git a/openhands/memory/__init__.py b/openhands/memory/__init__.py
index 12c499c768be..100ea245fac5 100644
--- a/openhands/memory/__init__.py
+++ b/openhands/memory/__init__.py
@@ -1,4 +1,3 @@
-from openhands.memory.condenser import MemoryCondenser
 from openhands.memory.memory import LongTermMemory
-
-__all__ = ['LongTermMemory', 'MemoryCondenser']
+from openhands.memory.utils import parse_summary_response
+__all__ = ['LongTermMemory', 'parse_summary_response']
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index 01eb29a97a99..257164df9155 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,4 +1,4 @@
-from openhands.agenthub.memcodeact_agent.utils import parse_summary_response
+from openhands.memory.utils import parse_summary_response
 from openhands.core.exceptions import SummarizeError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message, TextContent
diff --git a/openhands/agenthub/memcodeact_agent/utils.py b/openhands/memory/utils.py
similarity index 100%
rename from openhands/agenthub/memcodeact_agent/utils.py
rename to openhands/memory/utils.py
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 7b8263b343c4..46ee3448ef59 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -1,10 +1,7 @@
 import os
-
-from jinja2 import Template
-
+from jinja2 import Environment, FileSystemLoader, Template, select_autoescape
 from openhands.utils.microagent import MicroAgent
 
-
 class PromptManager:
     """
     Manages prompt templates and micro-agents for AI interactions.
@@ -17,7 +14,6 @@ class PromptManager:
         prompt_dir (str): Directory containing prompt templates.
         agent_skills_docs (str): Documentation of agent skills.
         micro_agent (MicroAgent | None): Micro-agent, if specified.
-
     """
 
     def __init__(
@@ -28,28 +24,45 @@ def __init__(
     ):
         self.prompt_dir: str = prompt_dir
         self.agent_skills_docs: str = agent_skills_docs
+        self.micro_agent: MicroAgent | None = micro_agent
 
+        # initialize Jinja2 Environment with FileSystemLoader
+        self.env = Environment(
+            loader=FileSystemLoader(self.prompt_dir),
+            autoescape=select_autoescape(['j2','md'])
+        )
+
+        # load templates using the environment
         self.system_template: Template = self._load_template('system_prompt')
         self.summarize_template: Template = self._load_template('summarize_prompt')
         self.memory_template: Template = self._load_template('memory_prompt')
         self.user_template: Template = self._load_template('user_prompt')
-        self.micro_agent: MicroAgent | None = micro_agent
-
-    def _load_template(self, template_name: str) -> Template:
-        template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
-        if not os.path.exists(template_path):
-            # raise FileNotFoundError(f'Prompt file {template_path} not found')
-            print(f'Prompt file {template_path} not found')
-            return Template('')
-        with open(template_path, 'r') as file:
-            content = file.read()
-            print(
-                f'Loaded template {template_name}: {content[:100]}...'
-            )  # og first 100 chars
-            return Template(content)
 
+    def _load_template(self, template_name: str):
+        """
+        Loads a Jinja2 template using the configured environment.
+        
+        Args:
+            template_name: The base name of the template file
+        
+        Returns:
+            Template: The loaded Jinja2 template.
+        """
+        try:
+            template = self.env.get_template(f'{template_name}.j2')
+            print(f'Loaded template {template_name}: {template.render()[:100]}...')
+            return template
+        except Exception as e:
+            print(f'Error loading template {template_name}: {e}')
+            return Template('') 
     @property
     def system_message(self) -> str:
+        """
+        Renders the system message template with the necessary variables.
+        
+        Returns:
+            str: The rendered system message.
+        """
         rendered = self.system_template.render(
             agent_skills_docs=self.agent_skills_docs,
             memory_template=self.memory_template.render(),
@@ -58,14 +71,11 @@ def system_message(self) -> str:
 
     @property
     def initial_user_message(self) -> str:
-        """This is the initial user message provided to the agent
-        before *actual* user instructions are provided.
-
-        It is used to provide a demonstration of how the agent
-        should behave in order to solve the user's task. And it may
-        optionally contain some additional context about the user's task.
-        These additional context will convert the current generic agent
-        into a more specialized agent that is tailored to the user's task.
+        """
+        Renders the initial user message template.
+        
+        Returns:
+            str: The rendered initial user message.
         """
         rendered = self.user_template.render(
             micro_agent=self.micro_agent.content if self.micro_agent else None
@@ -74,5 +84,11 @@ def initial_user_message(self) -> str:
 
     @property
     def summarize_message(self) -> str:
+        """
+        Renders the summarize message template.
+        
+        Returns:
+            str: The rendered summarize message.
+        """
         rendered = self.summarize_template.render()
-        return rendered.strip()
+        return rendered.strip()
\ No newline at end of file

From f4ecd3a85b78a8c26285adc4399e39d10c003202 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 21 Oct 2024 01:28:26 +0200
Subject: [PATCH 45/89] add litellm embeddings for testing

---
 openhands/llm/llm.py | 102 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 85f9b74d7568..f548a5f698c5 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -5,12 +5,15 @@
 from typing import Any
 
 from openhands.core.config import LLMConfig
+from openhands.events.event import Event
+from openhands.events.serialization.event import event_to_memory
 
 with warnings.catch_warnings():
     warnings.simplefilter('ignore')
     import litellm
 from litellm import ModelInfo, PromptTokensDetails
 from litellm import completion as litellm_completion
+from litellm import embedding as litellm_embedding
 from litellm import completion_cost as litellm_completion_cost
 from litellm.exceptions import (
     APIConnectionError,
@@ -20,6 +23,8 @@
     ServiceUnavailableError,
 )
 from litellm.types.utils import CostPerToken, ModelResponse, Usage
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message
@@ -428,3 +433,100 @@ def summarize_messages(self, messages: list[Message]) -> str:
             prompt += f'{message.role.capitalize()}: {message.content}\n'
         response = self._completion(messages=[{'role': 'user', 'content': prompt}])
         return response['choices'][0]['message']['content']
+
+    def embed_event(self, event: Event) -> np.ndarray:
+        """
+        Embeds a single event using the embedding model.
+
+        Args:
+            event (Event): The event to embed.
+
+        Returns:
+            np.ndarray: The embedding vector of the event.
+        """
+        # Convert the event to a string representation
+        event_str = event_to_memory(event)
+        # Get the embedding
+        embedding_response = litellm_embedding(
+            model=self.config.embedding_model,
+            input=event_str,
+            custom_llm_provider=self.config.custom_llm_provider,
+            api_key=self.config.api_key,
+            base_url=self.config.base_url,
+            api_version=self.config.api_version,
+            input_cost_per_token=self.config.input_cost_per_token,
+            output_cost_per_token=self.config.output_cost_per_token,
+        )
+        embedding = embedding_response['data'][0]['embedding']
+        return np.array(embedding)
+
+    def embed_history(self, history: list[Event]) -> list[np.ndarray]:
+        """
+        Embeds a list of events.
+
+        Args:
+            history (list[Event]): The list of events to embed.
+
+        Returns:
+            list[np.ndarray]: A list of embedding vectors.
+        """
+        embeddings = []
+        for event in history:
+            embedding = self.embed_event(event)
+            embeddings.append(embedding)
+        return embeddings
+
+
+    def recall_memory(self, query: str, embeddings: list[np.ndarray], history: list[Event], top_k: int = 5) -> list[Event]:
+        """
+        Recalls the most similar events based on the query.
+
+        Args:
+            query (str): The query string.
+            embeddings (list[np.ndarray]): The list of embedded vectors.
+            history (list[Event]): The corresponding list of events.
+            top_k (int, optional): The number of top similar events to retrieve. Defaults to 5.
+
+        Returns:
+            list[Event]: The list of recalled events.
+        """
+
+        # make sure history has been embedded
+        if not embeddings:
+            embeddings = self.embed_history(history)
+
+        # Embed the query
+        query_embedding_response = litellm_embedding(
+            model=self.config.embedding_model,
+            input=query,
+            custom_llm_provider=self.config.custom_llm_provider,
+            api_key=self.config.api_key,
+            base_url=self.config.base_url,
+            api_version=self.config.api_version,
+            input_cost_per_token=self.config.input_cost_per_token,
+            output_cost_per_token=self.config.output_cost_per_token,
+        )
+        query_embedding = np.array(query_embedding_response['data'][0]['embedding']).reshape(1, -1)
+
+        # Compute cosine similarity
+        similarity_scores = cosine_similarity(query_embedding, embeddings)[0]
+
+        # Get the top_k indices
+        top_indices = similarity_scores.argsort()[-top_k:][::-1]
+
+        # Retrieve the corresponding events
+        recalled_events = [history[i] for i in top_indices]
+        return recalled_events
+
+    def summarize_events(self, events: list[Event]) -> str:
+        """
+        Summarizes a list of events.
+
+        Args:
+            events (list[Event]): The list of events to summarize.
+
+        Returns:
+            str: The summary of the events.
+        """
+        summary = self.summarize_messages(events)
+        return summary

From 11b3242746a960f761f510f23e4ff3479c59de5c Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 21 Oct 2024 01:29:23 +0200
Subject: [PATCH 46/89] fix var, run all stream embeddings on llama-index

---
 openhands/memory/conversation_memory.py | 3 ++-
 openhands/memory/memory.py              | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index 8df87b069acf..b240aa6fc0a3 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -113,7 +113,8 @@ def update(self, state: State) -> None:
 
         # the number of messages that are hidden from the user
         # is the number of events in summary
-        self.hidden_message_count = state.summary.end_id - state.summary.start_id
+        if state.summary:
+            self.hidden_message_count = state.summary.end_id - state.summary.start_id
 
     def _has_summary(self) -> bool:
         """Check if the conversation has a summary."""
diff --git a/openhands/memory/memory.py b/openhands/memory/memory.py
index fc063e17589c..4fa823adc520 100644
--- a/openhands/memory/memory.py
+++ b/openhands/memory/memory.py
@@ -57,6 +57,7 @@ def __init__(
 
         # initialize the event stream
         self.event_stream = event_stream
+        self._events_to_docs()
 
         # max of threads to run the pipeline
         self.memory_max_threads = agent_config.memory_max_threads

From fcdfb19f60a38075f9210099677a4e05e7ec66c5 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 21 Oct 2024 02:12:58 +0200
Subject: [PATCH 47/89] add voyage ai embeddings

---
 openhands/utils/embeddings.py |  6 ++++++
 poetry.lock                   | 40 +++++++++++++++++++++++++++++------
 pyproject.toml                |  1 +
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/openhands/utils/embeddings.py b/openhands/utils/embeddings.py
index 07ee2d27f52d..ef4cb3b8c4f5 100644
--- a/openhands/utils/embeddings.py
+++ b/openhands/utils/embeddings.py
@@ -101,6 +101,12 @@ def get_embedding_model(strategy: str, llm_config: LLMConfig) -> 'BaseEmbedding'
                 azure_endpoint=llm_config.base_url,
                 api_version=llm_config.api_version,
             )
+        elif strategy == 'voyage':
+            from llama_index.legacy.embeddings.voyageai import VoyageEmbedding
+
+            return VoyageEmbedding(
+                model='voyageai/voyage-code-2',
+            )
         elif (strategy is not None) and (strategy.lower() == 'none'):
             # TODO: this works but is not elegant enough. The incentive is when
             # an agent using embeddings is not used, there is no reason we need to
diff --git a/poetry.lock b/poetry.lock
index 4b3920ddc41a..f006539eb420 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -4074,6 +4074,21 @@ files = [
 llama-index-core = ">=0.11.0,<0.12.0"
 openai = ">=1.1.0"
 
+[[package]]
+name = "llama-index-embeddings-voyageai"
+version = "0.2.2"
+description = "llama-index embeddings voyageai integration"
+optional = false
+python-versions = "<4.0,>=3.8.1"
+files = [
+    {file = "llama_index_embeddings_voyageai-0.2.2-py3-none-any.whl", hash = "sha256:7bbb79558d474497ff700a930a0f9081976d1b4e0f5107e38a1059600de92c58"},
+    {file = "llama_index_embeddings_voyageai-0.2.2.tar.gz", hash = "sha256:237f70074af05f3b950c89d5d0720de30f9f5e98426a420f6e08125600b69be9"},
+]
+
+[package.dependencies]
+llama-index-core = ">=0.11.0,<0.12.0"
+voyageai = ">=0.2.1,<0.3.0"
+
 [[package]]
 name = "llama-index-indices-managed-llama-cloud"
 version = "0.4.0"
@@ -7909,11 +7924,6 @@ files = [
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
     {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
-    {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
     {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@@ -9263,6 +9273,24 @@ platformdirs = ">=3.9.1,<5"
 docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
 test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
 
+[[package]]
+name = "voyageai"
+version = "0.2.4"
+description = ""
+optional = false
+python-versions = "<4.0.0,>=3.7.1"
+files = [
+    {file = "voyageai-0.2.4-py3-none-any.whl", hash = "sha256:e3070e5c78dec89adae43231334b4637aa88933dad99b1c33d3219fdfc94dfa4"},
+    {file = "voyageai-0.2.4.tar.gz", hash = "sha256:b9911d8629e8a4e363291c133482fead49a3536afdf1e735f3ab3aaccd8d250d"},
+]
+
+[package.dependencies]
+aiohttp = ">=3.5,<4.0"
+aiolimiter = ">=1.1.0,<2.0.0"
+numpy = ">=1.11"
+requests = ">=2.20,<3.0"
+tenacity = ">=8.0.1"
+
 [[package]]
 name = "watchdog"
 version = "5.0.3"
@@ -10001,4 +10029,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "f0d6c96fb36fd6ff330f27b0bf8a2051099b1c46f8e3b03d0d530025c87c92af"
+content-hash = "d255bd5104b4d29488a9a1deb6bf8170b208852d389bd6d71ca6c197e8939c65"
diff --git a/pyproject.toml b/pyproject.toml
index 2ba189915235..d95d240d702b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,7 @@ llama-index-embeddings-huggingface = "*"
 torch = "2.2.2"
 llama-index-embeddings-azure-openai = "*"
 llama-index-embeddings-ollama = "*"
+llama-index-embeddings-voyageai = "*"
 
 [tool.poetry.group.dev.dependencies]
 ruff = "0.6.9"

From 84428411b344bf043e2b045ab76ec126728fd9e6 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 21 Oct 2024 02:17:04 +0200
Subject: [PATCH 48/89] fix template include

---
 .../agenthub/memcodeact_agent/system_prompt.j2   | 16 ++++++++++++----
 openhands/utils/prompt.py                        |  4 ++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/system_prompt.j2
index 605cb45ec939..bebdb63c3415 100644
--- a/openhands/agenthub/memcodeact_agent/system_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/system_prompt.j2
@@ -1,6 +1,6 @@
 {% set MINIMAL_SYSTEM_PREFIX %}
 You are a new generation AI assistant, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence.
-You are able to act on the environment and give helpful, detailed answers to the user's questions.
+You are able to act on the environment directly, and you can also manage your own memory.
 You can use a Python environment with <execute_ipython>, e.g.:
 <execute_ipython>
 print("Hello World!")
@@ -16,6 +16,7 @@ the command in the background, e.g. <execute_bash> python3 app.py > server.log 2
 If a command execution result says "Command timed out. Sending SIGINT to the process",
 you should retry running the command in the background.
 {% endset %}
+
 {% set ENVIRONMENT_PREFIX %}
 You act on the environment using actions like <execute_ipython>. When you receive data prefixed with 'ENV OBSERVATION', it is a non-interactive information from the environment, not from a human user.
 For example:
@@ -23,18 +24,24 @@ For example:
 The system returned a value of 42."
 Do not respond to these messages. Use them to perform your task.
 {% endset %}
+
 {% set MEMORY_PREFIX %}
-{{ memory_template }}
+{% include 'memory_prompt.j2' %}
+{{ MEMORY_PROMPT }}
 {% endset %}
+
 {% set BROWSING_PREFIX %}
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
 For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
 Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
 {% endset %}
+
 {% set PIP_INSTALL_PREFIX %}
 You can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
 {% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + PIP_INSTALL_PREFIX + ENVIRONMENT_PREFIX %}
+
+{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + PIP_INSTALL_PREFIX + ENVIRONMENT_PREFIX + MEMORY_PREFIX %}
+
 {% set COMMAND_DOCS %}
 Apart from the standard Python library, you can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
@@ -47,6 +54,7 @@ IMPORTANT:
 - For `edit_file_by_replace` NEVER provide empty parameters!
 - For `edit_file_by_replace` the file must be read fully before any replacements!
 {% endset %}
+
 {% set SYSTEM_SUFFIX %}
 Responses should be concise.
 The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
@@ -56,8 +64,8 @@ IMPORTANT: Execute code using <execute_ipython>, <execute_bash> whenever possibl
 You must use full file paths and the `pwd` command to prevent path-related errors.
 You should only address the user in response to an explicit message from the user, or to ask for more information.
 You MUST NOT push any changes to GitHub unless explicitly requested to do so.
-
 {% endset %}
+
 {# Combine all parts without newlines between them #}
 {{ SYSTEM_PREFIX -}}
 {{- COMMAND_DOCS -}}
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 46ee3448ef59..bae9ce7bc6ef 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -34,9 +34,9 @@ def __init__(
 
         # load templates using the environment
         self.system_template: Template = self._load_template('system_prompt')
-        self.summarize_template: Template = self._load_template('summarize_prompt')
         self.memory_template: Template = self._load_template('memory_prompt')
         self.user_template: Template = self._load_template('user_prompt')
+        self.summarize_template: Template = self._load_template('summarize_prompt')
 
     def _load_template(self, template_name: str):
         """
@@ -50,7 +50,7 @@ def _load_template(self, template_name: str):
         """
         try:
             template = self.env.get_template(f'{template_name}.j2')
-            print(f'Loaded template {template_name}: {template.render()[:100]}...')
+            print(f'Loaded template {template_name}')
             return template
         except Exception as e:
             print(f'Error loading template {template_name}: {e}')

From 67693a5e9c3a10ed5d2b44c082c7c94cb4672fe2 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 21 Oct 2024 02:23:42 +0200
Subject: [PATCH 49/89] core memory split

---
 openhands/agenthub/memcodeact_agent/core_memory_prompt.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 openhands/agenthub/memcodeact_agent/core_memory_prompt.md

diff --git a/openhands/agenthub/memcodeact_agent/core_memory_prompt.md b/openhands/agenthub/memcodeact_agent/core_memory_prompt.md
new file mode 100644
index 000000000000..d7ed1451677b
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/core_memory_prompt.md
@@ -0,0 +1,5 @@
+Core Memory:
+Your core memory unit will be initially empty. You can add to it important information about the task or your status. Keep it concise and remember that you will use it to guide your actions, so keep it relevant!
+You can add to your core memory using the <memory_add> action.
+For example, <memory_add> The user is working on a project to create a new AI assistant. </memory_add>
+Adding to your core memory is optional. You do NOT need to do it for every message.

From 9ac47bff8e798989660e84d0d55695761d0a4557 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 21 Oct 2024 02:38:56 +0200
Subject: [PATCH 50/89] tweak prompts

---
 .../memcodeact_agent/memory_prompt.j2         |  2 +-
 .../memcodeact_agent/system_prompt.j2         |  1 +
 .../agenthub/memcodeact_agent/user_prompt.j2  | 32 +++++++------------
 3 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memory_prompt.j2 b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
index 1a5d986264b8..a07ef715dd1e 100644
--- a/openhands/agenthub/memcodeact_agent/memory_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/memory_prompt.j2
@@ -11,7 +11,7 @@ MEMORY EDITING:
 Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
 This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
 Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
-Your ability to edit your own long-term memory is a key part of what makes you a very helpful model.
+Your ability to edit and recall your own long-term memory is a key part of what makes you a very helpful model.
 
 RECALL MEMORY (CONVERSATION HISTORY):
 Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
diff --git a/openhands/agenthub/memcodeact_agent/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/system_prompt.j2
index bebdb63c3415..90cc153dcb40 100644
--- a/openhands/agenthub/memcodeact_agent/system_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/system_prompt.j2
@@ -27,6 +27,7 @@ Do not respond to these messages. Use them to perform your task.
 
 {% set MEMORY_PREFIX %}
 {% include 'memory_prompt.j2' %}
+Apart from acting on the environment, you can also act on your own memory. You can make your brain summarize or recall past events, by using the <memory_summarize> or <memory_recall> actions.
 {{ MEMORY_PROMPT }}
 {% endset %}
 
diff --git a/openhands/agenthub/memcodeact_agent/user_prompt.j2 b/openhands/agenthub/memcodeact_agent/user_prompt.j2
index bf28dea9aa1f..f3562326e988 100644
--- a/openhands/agenthub/memcodeact_agent/user_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/user_prompt.j2
@@ -9,8 +9,7 @@ Sure! Let me create a Python file `app.py`:
 create_file('app.py')
 </execute_ipython>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 [File: /workspace/app.py (1 lines total)]
 (this is the beginning of the file)
 1|
@@ -38,8 +37,7 @@ insert_content_at_line(
 )
 </execute_ipython>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 (this is the beginning of the file)
 1|from flask import Flask
 2|app = Flask(__name__)
@@ -60,8 +58,7 @@ I have created a Python file `app.py` that will display a list of numbers from 1
 python3 app.py > server.log 2>&1 &
 </execute_bash>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 [1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
 
 ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
@@ -69,8 +66,7 @@ ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me ch
 cat server.log
 </execute_bash>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 Traceback (most recent call last):
   File "/workspace/app.py", line 2, in <module>
     from flask import Flask
@@ -82,8 +78,7 @@ It seems that Flask is not installed. Let me install Flask for you:
 pip install flask
 </execute_bash>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 Defaulting to user installation because normal site-packages is not writeable
 Collecting flask
   Using cached flask-3.0.3-py3-none-any.whl (101 kB)
@@ -106,8 +101,7 @@ Now that Flask is installed, let me run the Python file again:
 python3 app.py > server.log 2>&1 &
 </execute_bash>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 [1] 124
 
 ASSISTANT:
@@ -116,8 +110,7 @@ Let me check the server log again:
 cat server.log
 </execute_bash>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 * Serving Flask app 'app'
  * Debug mode: off
 WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
@@ -135,8 +128,7 @@ Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
 Get the content on "http://127.0.0.1:5000"
 </execute_browse>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 
 ASSISTANT:
@@ -150,8 +142,7 @@ Sure! Let me stop the server first:
 kill 124
 </execute_bash>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 [1]+  Terminated              python3 app.py > server.log 2>&1
 
 ASSISTANT:
@@ -160,7 +151,7 @@ The server has been stopped. Let me open the Python file and modify it.
 open_file('app.py')
 </execute_ipython>
 
-USER:
+ENVIRONMENT OBSERVATION:
 [File: /workspace/app.py (10 lines total)]
 (this is the beginning of the file)
 1|from flask import Flask
@@ -182,8 +173,7 @@ Running the updated file:
 python3 app.py > server.log 2>&1 &
 </execute_bash>
 
-USER:
-OBSERVATION:
+ENVIRONMENT OBSERVATION:
 [1] 126
 
 ASSISTANT:

From 5ad9ef4d7b0e9531bc4fbfc091034438533ad6f4 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 21 Oct 2024 03:06:03 +0200
Subject: [PATCH 51/89] fix leftover calls

---
 .../memcodeact_agent/memcodeact_agent.py      | 13 +++----
 openhands/controller/agent_controller.py      | 33 +++++++++++++----
 openhands/llm/llm.py                          | 35 +++++++------------
 openhands/memory/condenser.py                 | 14 +++++---
 4 files changed, 53 insertions(+), 42 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index c33879d14cf7..055f5e84138b 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -40,7 +40,7 @@
 
 
 class MemCodeActAgent(Agent):
-    VERSION = '1.9'
+    VERSION = '0.1'
     """
     The MemCode Act Agent is a memory-enabled version of the CodeAct agent.
 
@@ -141,8 +141,8 @@ def action_to_str(self, action: Action) -> str:
             hidden_message_count = self.conversation_memory.hidden_message_count
             if hidden_message_count > 0:
                 summary_message = (
-                    f"\n\nENVIRONMENT REMINDER: prior messages ({hidden_message_count} of {self.conversation_memory.total_message_count} total messages) have been hidden from view due to conversation memory constraints.\n"
-                    + f"The following is a summary of the first {hidden_message_count} messages:\n {action.summary}"
+                    f'\n\nENVIRONMENT REMINDER: prior messages ({hidden_message_count} of {self.conversation_memory.total_message_count} total messages) have been hidden from view due to conversation memory constraints.\n'
+                    + f'The following is a summary of the first {hidden_message_count} messages:\n {action.summary}'
                 )
                 return summary_message
         elif isinstance(action, AgentRecallAction):
@@ -205,9 +205,7 @@ def get_observation_message(self, obs: Observation) -> Message | None:
             text += '\n[Error occurred in processing last action]'
             return Message(role='user', content=[TextContent(text=text)])
         elif isinstance(obs, UserRejectObservation):
-            text = obs_prefix + truncate_content(
-                obs.content, max_message_chars
-            )
+            text = obs_prefix + truncate_content(obs.content, max_message_chars)
             text += '\n[Last action has been rejected by the user]'
             return Message(role='user', content=[TextContent(text=text)])
         elif isinstance(obs, AgentRecallObservation):
@@ -367,9 +365,8 @@ def summarize_messages_inplace(self, state: State):
 
         # summarize the conversation history using the condenser
         # conversation_memory.history will include the previous summary, if any, while the regular state.history does not
-        condenser = MemoryCondenser(self.llm)
+        condenser = MemoryCondenser(self.llm, self.prompt_manager)
         messages = self._get_messages(state)
         summary = condenser.condense(messages)
 
         logger.debug(f'Summarized conversation history to: {summary}')
-
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 62783a996a59..e6e006ceab61 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -29,7 +29,7 @@
     ModifyTaskAction,
     NullAction,
 )
-from openhands.events.action.agent import AgentSummarizeAction
+from openhands.events.action.agent import AgentRecallAction, AgentSummarizeAction
 from openhands.events.event import Event
 from openhands.events.observation import (
     AgentDelegateObservation,
@@ -122,6 +122,9 @@ def __init__(
         self._initial_max_iterations = max_iterations
         self._initial_max_budget_per_task = max_budget_per_task
 
+        # use long term memory
+        # self.long_term_memory = LongTermMemory(self.agent.llm.config, self.agent.config, self.event_stream)
+
         # stuck helper
         self._stuck_detector = StuckDetector(self.state)
 
@@ -290,6 +293,13 @@ async def _handle_action(self, action: Action):
             await self.set_agent_state_to(AgentState.REJECTED)
         elif isinstance(action, AgentSummarizeAction):
             self.state.summary = action
+        elif isinstance(action, AgentRecallAction):
+            # llama_index_list = self.long_term_memory.search(action.query, action.history)
+            # logger.info(f'llama-index list: {llama_index_list}')
+            litellm_list = self.agent.llm.recall_memory(
+                action.query, self.state.history
+            )
+            logger.info(f'litellm list: {litellm_list}')
 
     async def _handle_observation(self, observation: Observation):
         """Handles observation from the event stream.
@@ -318,12 +328,20 @@ async def _handle_observation(self, observation: Observation):
             self.agent.llm.metrics.merge(observation.llm_metrics)
 
         if self._pending_action and self._pending_action.id == observation.cause:
+            # FIXME we may want each of these with the other's context
+            # self.long_term_memory.add_event(self._pending_action)
+            # self.long_term_memory.add_event(observation)
+
+            # the runtime has handled the action, so we can clear it
             self._pending_action = None
-            if self.state.agent_state == AgentState.USER_CONFIRMED:
-                await self.set_agent_state_to(AgentState.RUNNING)
-            if self.state.agent_state == AgentState.USER_REJECTED:
-                await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
-            return
+
+            # set the right state when the user confirms or rejects, if we're otherwise good to go (not an error)
+            if not isinstance(observation, ErrorObservation):
+                if self.state.agent_state == AgentState.USER_CONFIRMED:
+                    await self.set_agent_state_to(AgentState.RUNNING)
+                elif self.state.agent_state == AgentState.USER_REJECTED:
+                    await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
+                return
 
         if isinstance(observation, CmdOutputObservation):
             return
@@ -348,6 +366,9 @@ async def _handle_message_action(self, action: MessageAction):
         elif action.source == EventSource.AGENT and action.wait_for_response:
             await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
 
+        # add to long term memory
+        # self.long_term_memory.add_event(action)
+
     def _handle_delegate_observation(self, observation: Observation):
         """Handles delegate observations from the event stream.
 
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index f548a5f698c5..2292a5f34398 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -11,10 +11,11 @@
 with warnings.catch_warnings():
     warnings.simplefilter('ignore')
     import litellm
+import numpy as np
 from litellm import ModelInfo, PromptTokensDetails
 from litellm import completion as litellm_completion
-from litellm import embedding as litellm_embedding
 from litellm import completion_cost as litellm_completion_cost
+from litellm import embedding as litellm_embedding
 from litellm.exceptions import (
     APIConnectionError,
     APIError,
@@ -23,8 +24,7 @@
     ServiceUnavailableError,
 )
 from litellm.types.utils import CostPerToken, ModelResponse, Usage
-import numpy as np
-from sklearn.metrics.pairwise import cosine_similarity
+from numpy import dot
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message
@@ -445,7 +445,7 @@ def embed_event(self, event: Event) -> np.ndarray:
             np.ndarray: The embedding vector of the event.
         """
         # Convert the event to a string representation
-        event_str = event_to_memory(event)
+        event_str = event_to_memory(event, -1)
         # Get the embedding
         embedding_response = litellm_embedding(
             model=self.config.embedding_model,
@@ -476,8 +476,9 @@ def embed_history(self, history: list[Event]) -> list[np.ndarray]:
             embeddings.append(embedding)
         return embeddings
 
-
-    def recall_memory(self, query: str, embeddings: list[np.ndarray], history: list[Event], top_k: int = 5) -> list[Event]:
+    def recall_memory(
+        self, query: str, history: list[Event], top_k: int = 5
+    ) -> list[Event]:
         """
         Recalls the most similar events based on the query.
 
@@ -492,8 +493,7 @@ def recall_memory(self, query: str, embeddings: list[np.ndarray], history: list[
         """
 
         # make sure history has been embedded
-        if not embeddings:
-            embeddings = self.embed_history(history)
+        embeddings = self.embed_history(history)
 
         # Embed the query
         query_embedding_response = litellm_embedding(
@@ -506,10 +506,12 @@ def recall_memory(self, query: str, embeddings: list[np.ndarray], history: list[
             input_cost_per_token=self.config.input_cost_per_token,
             output_cost_per_token=self.config.output_cost_per_token,
         )
-        query_embedding = np.array(query_embedding_response['data'][0]['embedding']).reshape(1, -1)
+        query_embedding = np.array(
+            query_embedding_response['data'][0]['embedding']
+        ).reshape(1, -1)
 
         # Compute cosine similarity
-        similarity_scores = cosine_similarity(query_embedding, embeddings)[0]
+        similarity_scores = dot(query_embedding, embeddings.T)
 
         # Get the top_k indices
         top_indices = similarity_scores.argsort()[-top_k:][::-1]
@@ -517,16 +519,3 @@ def recall_memory(self, query: str, embeddings: list[np.ndarray], history: list[
         # Retrieve the corresponding events
         recalled_events = [history[i] for i in top_indices]
         return recalled_events
-
-    def summarize_events(self, events: list[Event]) -> str:
-        """
-        Summarizes a list of events.
-
-        Args:
-            events (list[Event]): The list of events to summarize.
-
-        Returns:
-            str: The summary of the events.
-        """
-        summary = self.summarize_messages(events)
-        return summary
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index 257164df9155..8fb391ba9817 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,17 +1,19 @@
-from openhands.memory.utils import parse_summary_response
 from openhands.core.exceptions import SummarizeError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message, TextContent
 from openhands.events.action import AgentSummarizeAction
 from openhands.llm.llm import LLM
+from openhands.memory.utils import parse_summary_response
+from openhands.utils.prompt import PromptManager
 
 WORD_LIMIT = 200
 MESSAGE_SUMMARY_WARNING_FRAC = 0.75
 
 
 class MemoryCondenser:
-    def __init__(self, llm: LLM):
+    def __init__(self, llm: LLM, prompt_manager: PromptManager):
         self.llm = llm
+        self.prompt_manager = prompt_manager
 
     def condense(
         self,
@@ -93,9 +95,9 @@ def _format_summary_history(self, message_history: list[dict]) -> str:
     def summarize_messages(self, message_sequence_to_summarize: list[Message]):
         """Summarize a message sequence using LLM"""
         context_window = self.config.max_input_tokens  # type: ignore
-        summary_prompt = SUMMARY_PROMPT_SYSTEM
+        summary_prompt = self.prompt_manager.summarize_template.render()
         summary_input = self._format_summary_history(
-            self.get_text_messages(message_sequence_to_summarize)  # type: ignore
+            self.llm.format_messages_for_llm(message_sequence_to_summarize)  # type: ignore
         )
         summary_input_tkns = self.get_token_count(summary_input)  # type: ignore
         if context_window is None:
@@ -119,7 +121,9 @@ def summarize_messages(self, message_sequence_to_summarize: list[Message]):
             input = [
                 Message({'role': 'assistant', 'content': curr_summary_message})
             ] + message_sequence_to_summarize[cutoff:]
-            summary_input = self._format_summary_history(self.get_text_messages(input))  # type: ignore
+            summary_input = self._format_summary_history(
+                self.llm.format_messages_for_llm(input)
+            )  # type: ignore
 
         message_sequence = []
         message_sequence.append(

From 374243182e7edc453477762c3ba0ef0cfeab4eab Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Mon, 21 Oct 2024 20:04:59 +0200
Subject: [PATCH 52/89] fix parser (o1 !!)

---
 .../memcodeact_agent/action_parser.py         | 32 ++++++++++---------
 .../memcodeact_agent/memcodeact_agent.py      |  2 --
 .../memcodeact_agent/system_prompt.j2         |  2 +-
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/action_parser.py b/openhands/agenthub/memcodeact_agent/action_parser.py
index af3c4c6c7cc4..dd7f56a9fc45 100644
--- a/openhands/agenthub/memcodeact_agent/action_parser.py
+++ b/openhands/agenthub/memcodeact_agent/action_parser.py
@@ -19,8 +19,8 @@ class MemCodeActResponseParser(ResponseParser):
     - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
     - MessageAction(content) - Message action to run (e.g. ask for clarification)
     - AgentFinishAction() - end the interaction
-    - AgentSummarizeAction() - summarize the conversation history
-    - RecallAction(query) - recall information from memory
+    - AgentSummarizeAction() - trigger a summarization of the conversation history
+    - AgentRecallAction(query) - recall information from memory
     """
 
     def __init__(self):
@@ -32,6 +32,7 @@ def __init__(self):
             MemCodeActActionParserAgentDelegate(),
             MemCodeActActionParserMemorySummarize(),
             MemCodeActActionParserMemoryRecall(),
+            # MemCodeActActionParserMemoryAdd(),
         ]
         self.default_parser = MemCodeActActionParserMessage()
 
@@ -54,13 +55,13 @@ def parse_response(self, response) -> str:
                 action += f'</execute_{lang}>'
 
         # memory actions
-        for action in ['summarize', 'recall', 'add']:
+        for mem in ['summarize', 'recall', 'add']:
             # the stop-word bug
-            if f'<memory_{action}>' in action and f'</memory_{action}>' not in action:
-                action += f'</memory_{action}>'
+            if f'<memory_{mem}>' in action and f'</memory_{mem}>' not in action:
+                action = action.replace(f'</memory_{mem}', f'</memory_{mem}>')
 
-            if f'<memory_{action}>' in action and f'</memory_{action}>' not in action:
-                action += f'</memory_{action}>'
+            if f'<memory_{mem}>' in action and f'</memory_{mem}>' not in action:
+                action += f'</memory_{mem}>'
 
         return action
 
@@ -203,32 +204,33 @@ class MemCodeActActionParserMemoryRecall(ActionParser):
     """
 
     def __init__(self):
-        self.query = None
+        self.recall_query = None
 
     def check_condition(self, action_str: str) -> bool:
-        self.query = re.search(
+        self.recall_query = re.search(
             r'<memory_recall>(.*?)</memory_recall>', action_str, re.DOTALL
         )
-        return self.query is not None
+        return self.recall_query is not None
 
     def parse(self, action_str: str) -> Action:
         assert (
-            self.query is not None
+            self.recall_query is not None
         ), 'self.query should not be None when parse is called'
 
         # thought <memory_recall>query</memory_recall>
         # Note: the thought is optional
-        thought = action_str.replace(self.query.group(0), '').strip()
-        return AgentRecallAction(query=self.query.group(1).strip(), thought=thought)
+        thought = action_str.replace(self.recall_query.group(0), '').strip()
+        query = self.recall_query.group(1).strip()
+        return AgentRecallAction(query=query, thought=thought)
 
 
 class MemCodeActActionParserMemorySummarize(ActionParser):
     """Parser action:
-    - <memory_summarize></memory_summarize> - The LLM wants to trigger a summarization of the conversation history
+    - <memory_summarize> - The LLM wants to trigger a summarization of its context
     """
 
     def check_condition(self, action_str: str) -> bool:
-        return '<memory_summarize></memory_summarize>' in action_str
+        return '<memory_summarize>' in action_str
 
     def parse(self, action_str: str) -> Action:
         # let the agent trigger the summarization
diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 055f5e84138b..e36b86466f32 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -276,8 +276,6 @@ def step(self, state: State) -> Action:
                 '</execute_ipython>',
                 '</execute_bash>',
                 '</execute_browse>',
-                '</memory_summarize>',
-                '</memory_recall>',
             ],
         }
 
diff --git a/openhands/agenthub/memcodeact_agent/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/system_prompt.j2
index 90cc153dcb40..00d1c2c78a6c 100644
--- a/openhands/agenthub/memcodeact_agent/system_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/system_prompt.j2
@@ -26,8 +26,8 @@ Do not respond to these messages. Use them to perform your task.
 {% endset %}
 
 {% set MEMORY_PREFIX %}
-{% include 'memory_prompt.j2' %}
 Apart from acting on the environment, you can also act on your own memory. You can make your brain summarize or recall past events, by using the <memory_summarize> or <memory_recall> actions.
+{% include 'memory_prompt.j2' %}
 {{ MEMORY_PROMPT }}
 {% endset %}
 

From 083edd4444db0f4c1cbdd3bac9ebbdf6e9b8a914 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Tue, 22 Oct 2024 00:30:01 +0200
Subject: [PATCH 53/89] configurations wip

---
 config.template.toml                           | 18 ++++++++++++++++++
 .../{ => prompts}/memory_prompt.j2             |  0
 .../{ => prompts}/summarize_prompt.j2          |  2 +-
 .../{ => prompts}/system_prompt.j2             |  0
 .../{ => prompts}/user_prompt.j2               |  0
 openhands/core/config/llm_config.py            |  2 ++
 6 files changed, 21 insertions(+), 1 deletion(-)
 rename openhands/agenthub/memcodeact_agent/{ => prompts}/memory_prompt.j2 (100%)
 rename openhands/agenthub/memcodeact_agent/{ => prompts}/summarize_prompt.j2 (98%)
 rename openhands/agenthub/memcodeact_agent/{ => prompts}/system_prompt.j2 (100%)
 rename openhands/agenthub/memcodeact_agent/{ => prompts}/user_prompt.j2 (100%)

diff --git a/config.template.toml b/config.template.toml
index 660cea0ad514..d76df9b4828c 100644
--- a/config.template.toml
+++ b/config.template.toml
@@ -171,6 +171,24 @@ model = "gpt-4o"
 # If model is vision capable, this option allows to disable image processing (useful for cost reduction).
 #disable_vision = true
 
+# maximum number of messages in a conversation, after which they are truncated or summarized
+# max_conversation_window = 10
+
+# number of results when recalling message history
+# conversation_top_k = 5
+
+# fraction of the conversation window to summarize
+# message_summary_trunc_tokens_fraction = 0.75
+
+# summary LLM
+[llm.summary]
+model = "deepseek"
+
+# default LLM
+[llm.default]
+model = "claude"
+
+
 [llm.gpt4o-mini]
 # API key to use
 api_key = "your-api-key"
diff --git a/openhands/agenthub/memcodeact_agent/memory_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/memory_prompt.j2
similarity index 100%
rename from openhands/agenthub/memcodeact_agent/memory_prompt.j2
rename to openhands/agenthub/memcodeact_agent/prompts/memory_prompt.j2
diff --git a/openhands/agenthub/memcodeact_agent/summarize_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
similarity index 98%
rename from openhands/agenthub/memcodeact_agent/summarize_prompt.j2
rename to openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
index 77c2b209bf01..466491e7f839 100644
--- a/openhands/agenthub/memcodeact_agent/summarize_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
@@ -28,7 +28,7 @@ Example:
         "summarized_observations": "The agent encountered a UnicodeDecodeError when initially searching the PDF text, but was able to resolve this by installing the PyPDF2 library and successfully extracting relevant information about sequence diagrams."
     }
 }
-Make sure to include in the summary any relevant keywords, because you will need to look it up. 
+Make sure to include in the summary any relevant keywords, because you will need to look it up.
 
 YOU MUST RESPOND WITH JSON.
 I REPEAT, YOU MUST RESPOND WITH JSON.
diff --git a/openhands/agenthub/memcodeact_agent/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2
similarity index 100%
rename from openhands/agenthub/memcodeact_agent/system_prompt.j2
rename to openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2
diff --git a/openhands/agenthub/memcodeact_agent/user_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/user_prompt.j2
similarity index 100%
rename from openhands/agenthub/memcodeact_agent/user_prompt.j2
rename to openhands/agenthub/memcodeact_agent/prompts/user_prompt.j2
diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index da54668b1c6d..c811ef6ae4df 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -41,6 +41,7 @@ class LLMConfig:
         log_completions: Whether to log LLM completions to the state.
         max_conversation_window: The maximum number of messages to include in the conversation window (context), after which old messages are truncated or summarized.
         conversation_top_k: The number of top results to retrieve from the conversation history.
+        message_summary_trunc_tokens_fraction: The fraction of the conversation window to summarize (e.g. 0.75 for 75% of the tokens).
     """
 
     model: str = 'gpt-4o'
@@ -75,6 +76,7 @@ class LLMConfig:
     log_completions: bool = False
     max_conversation_window: int = 10
     conversation_top_k: int = 5
+    message_summary_trunc_tokens_fraction: float = 0.75
 
     def defaults_to_dict(self) -> dict:
         """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""

From b631e53455f0f4e4bb20a381ee99e1ad6b44bec7 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Tue, 22 Oct 2024 00:32:29 +0200
Subject: [PATCH 54/89] fixes; debugging test

---
 .../memcodeact_agent/memcodeact_agent.py      |  41 +++++--
 openhands/controller/agent_controller.py      |   2 +
 openhands/core/exceptions.py                  |   7 +-
 openhands/llm/llm.py                          |   9 ++
 openhands/memory/condenser.py                 | 115 +++++++++++++-----
 openhands/memory/conversation_memory.py       |   3 +
 6 files changed, 137 insertions(+), 40 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index e36b86466f32..7a9db32aff51 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -5,6 +5,7 @@
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
+from openhands.core.exceptions import TokenLimitExceededError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
@@ -279,8 +280,17 @@ def step(self, state: State) -> Action:
             ],
         }
 
-        response = self.llm.completion(**params)
+        # catch ContextWindowExceededError and TokenLimitExceededError
+        try:
+            response = self.llm.completion(**params)
+        except TokenLimitExceededError as e:
+            logger.error(e, exc_info=False)
 
+            # run condenser directly; the alternative is to delegate to the microagent
+            summary_action = self.summarize_messages_inplace(state)
+
+            # just return for now
+            return summary_action
         return self.action_parser.parse(response)
 
     def _get_messages(self, state: State) -> list[Message]:
@@ -290,18 +300,20 @@ def _get_messages(self, state: State) -> list[Message]:
                 content=[
                     TextContent(
                         text=self.prompt_manager.system_message,
-                        cache_prompt=self.llm.is_caching_prompt_active(),  # Cache system prompt
+                        cache_prompt=self.llm.is_caching_prompt_active(),  # cache system prompt
                     )
                 ],
+                condensable=False,
             ),
             Message(
                 role='user',
                 content=[
                     TextContent(
                         text=self.prompt_manager.initial_user_message,
-                        cache_prompt=self.llm.is_caching_prompt_active(),  # if the user asks the same query,
+                        cache_prompt=self.llm.is_caching_prompt_active(),  # the user asks the same query
                     )
                 ],
+                condensable=False,
             ),
         ]
 
@@ -325,14 +337,14 @@ def _get_messages(self, state: State) -> list[Message]:
                     messages.append(message)
 
         # Add caching to the last 2 user messages
-        if self.llm.is_caching_prompt_active():
-            user_turns_processed = 0
-            for message in reversed(messages):
-                if message.role == 'user' and user_turns_processed < 2:
-                    message.content[
-                        -1
-                    ].cache_prompt = True  # Last item inside the message content
-                    user_turns_processed += 1
+        # if self.llm.is_caching_prompt_active():
+        #    user_turns_processed = 0
+        #    for message in reversed(messages):
+        #        if message.role == 'user' and user_turns_processed < 2:
+        #            message.content[
+        #                -1
+        #            ].cache_prompt = True  # Last item inside the message content
+        #            user_turns_processed += 1
 
         # The latest user message is important:
         # we want to remind the agent of the environment constraints
@@ -349,6 +361,11 @@ def _get_messages(self, state: State) -> list[Message]:
             None,
         )
 
+        # set the last 4 messages to be non-condensable
+        # TODO make this configurable for experimentation
+        for message in messages[-4:]:
+            message.condensable = False
+
         # iterations reminder
         if latest_user_message:
             reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
@@ -364,6 +381,8 @@ def summarize_messages_inplace(self, state: State):
         # summarize the conversation history using the condenser
         # conversation_memory.history will include the previous summary, if any, while the regular state.history does not
         condenser = MemoryCondenser(self.llm, self.prompt_manager)
+
+        # send all messages and let it sort it out
         messages = self._get_messages(state)
         summary = condenser.condense(messages)
 
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index e6e006ceab61..2d0ad73235d9 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -423,6 +423,8 @@ def reset_task(self):
         """Resets the agent's task."""
 
         self.almost_stuck = 0
+
+        # FIXME: wipe out the memory
         self.agent.reset()
 
     async def set_agent_state_to(self, new_state: AgentState):
diff --git a/openhands/core/exceptions.py b/openhands/core/exceptions.py
index 1b85e8e7a022..93cdf9819a74 100644
--- a/openhands/core/exceptions.py
+++ b/openhands/core/exceptions.py
@@ -95,4 +95,9 @@ def __init__(self, message='Error summarizing the memory'):
 
 class InvalidSummaryResponseError(Exception):
     def __init__(self, message='Invalid summary response'):
-        super().__init__(message)
\ No newline at end of file
+        super().__init__(message)
+
+
+class TokenLimitExceededError(Exception):
+    def __init__(self, message='Token limit exceeded'):
+        super().__init__(message)
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 2292a5f34398..a5b6e97993ae 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -5,6 +5,7 @@
 from typing import Any
 
 from openhands.core.config import LLMConfig
+from openhands.core.exceptions import TokenLimitExceededError
 from openhands.events.event import Event
 from openhands.events.serialization.event import event_to_memory
 
@@ -185,6 +186,14 @@ def wrapper(*args, **kwargs):
             # log the entire LLM prompt
             self.log_prompt(messages)
 
+            # find out if we have too many tokens
+            token_count = self.get_token_count(messages)
+            max_input_tokens = self.config.max_input_tokens
+            if token_count > max_input_tokens:
+                raise TokenLimitExceededError(
+                    f'Token limit exceeded: {token_count} > {max_input_tokens}'
+                )
+
             if self.is_caching_prompt_active():
                 # Anthropic-specific prompt caching
                 if 'claude-3' in self.config.model:
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index 8fb391ba9817..ea221e7d6125 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,3 +1,8 @@
+import json
+import os
+from pathlib import Path
+
+from openhands.core.config.utils import get_llm_config_arg, load_app_config
 from openhands.core.exceptions import SummarizeError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message, TextContent
@@ -7,7 +12,7 @@
 from openhands.utils.prompt import PromptManager
 
 WORD_LIMIT = 200
-MESSAGE_SUMMARY_WARNING_FRAC = 0.75
+MESSAGE_SUMMARY_WARNING_FRACTION = 0.75
 
 
 class MemoryCondenser:
@@ -23,34 +28,32 @@ def condense(
         # and collect messages for summarization until we reach the desired truncation token fraction (eg 50%)
         # Do not allow truncation  for in-context examples of function calling
         token_counts = [
-            self.get_token_count([message.model_dump()])  # type: ignore
+            self.llm.get_token_count([message.model_dump()])  # type: ignore
             for message in messages
             if message.condensable
         ]
         message_buffer_token_count = sum(token_counts)  # no system and example message
 
         desired_token_count_to_summarize = int(
-            message_buffer_token_count * self.config.message_summary_trunc_tokens_frac  # type: ignore
+            message_buffer_token_count
+            * self.llm.config.message_summary_trunc_tokens_fraction  # type: ignore
         )
 
         candidate_messages_to_summarize = []
         tokens_so_far = 0
         for message in messages:
             if message.condensable:
+                logger.debug(
+                    f'condensable message: {message.event_id}: {str(message.content)[30:]}'
+                )
                 candidate_messages_to_summarize.append(message)
-                tokens_so_far += self.get_token_count([message.model_dump()])  # type: ignore
+                tokens_so_far += self.llm.get_token_count([message.model_dump()])  # type: ignore
             if tokens_so_far > desired_token_count_to_summarize:
                 last_summarized_event_id = message.event_id
                 break
 
-        # TODO: Add functionality for preserving last N messages
-        # MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST = 3
-        # if preserve_last_N_messages:
-        #     candidate_messages_to_summarize = candidate_messages_to_summarize[:-MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST]
-        #     token_counts = token_counts[:-MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST]
-
         logger.debug(
-            f'message_summary_trunc_tokens_frac={self.config.message_summary_trunc_tokens_frac}'  # type: ignore
+            f'message_summary_trunc_tokens_fraction={self.llm.config.message_summary_trunc_tokens_fraction}'  # type: ignore
         )
         # logger.debug(f'MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST={MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST}')
         logger.debug(f'token_counts={token_counts}')
@@ -94,36 +97,35 @@ def _format_summary_history(self, message_history: list[dict]) -> str:
 
     def summarize_messages(self, message_sequence_to_summarize: list[Message]):
         """Summarize a message sequence using LLM"""
-        context_window = self.config.max_input_tokens  # type: ignore
+        context_window = self.llm.config.max_input_tokens
         summary_prompt = self.prompt_manager.summarize_template.render()
         summary_input = self._format_summary_history(
-            self.llm.format_messages_for_llm(message_sequence_to_summarize)  # type: ignore
+            self.llm.format_messages_for_llm(message_sequence_to_summarize)
         )
-        summary_input_tkns = self.get_token_count(summary_input)  # type: ignore
+        summary_input_tkns = self.llm.get_token_count(summary_input)
         if context_window is None:
             raise ValueError('context_window should not be None')
-        if summary_input_tkns > MESSAGE_SUMMARY_WARNING_FRAC * context_window:
+        if summary_input_tkns > MESSAGE_SUMMARY_WARNING_FRACTION * context_window:
             trunc_ratio = (
-                MESSAGE_SUMMARY_WARNING_FRAC * context_window / summary_input_tkns
+                MESSAGE_SUMMARY_WARNING_FRACTION * context_window / summary_input_tkns
             ) * 0.8  # For good measure...
             cutoff = int(len(message_sequence_to_summarize) * trunc_ratio)
             curr_summary = self.summarize_messages(
                 message_sequence_to_summarize=message_sequence_to_summarize[:cutoff]
             )
             curr_summary_message = (
-                'Summary of all Action and Observations till now. \n'
-                + 'Action: '
-                + curr_summary['args']['summarized_actions']
-                + '\nObservation: '
-                + curr_summary['args']['summarized_observations']
+                'Summary of all Action and Observations till now: \n'
+                + curr_summary['summary']
             )
+            logger.debug(f'curr_summary_message: {curr_summary_message}')
+
             curr_summary_message = [TextContent(text=curr_summary_message)]
             input = [
-                Message({'role': 'assistant', 'content': curr_summary_message})
+                Message({'role': 'user', 'content': curr_summary_message})
             ] + message_sequence_to_summarize[cutoff:]
             summary_input = self._format_summary_history(
                 self.llm.format_messages_for_llm(input)
-            )  # type: ignore
+            )
 
         message_sequence = []
         message_sequence.append(
@@ -133,7 +135,7 @@ def summarize_messages(self, message_sequence_to_summarize: list[Message]):
             Message(role='user', content=[TextContent(text=summary_input)])
         )
 
-        response = self.completion(  # type: ignore
+        response = self.llm.completion(
             messages=message_sequence,
             stop=[
                 '</execute_ipython>',
@@ -143,7 +145,64 @@ def summarize_messages(self, message_sequence_to_summarize: list[Message]):
             temperature=0.0,
         )
 
-        print(f'summarize_messages gpt reply: {response.choices[0]}')
-
-        action_response = response['choices'][0]['message']['content']
-        return action_response
+        print(f'summarize_messages got response: {response}')
+
+        # action_response = response['choices'][0]['message']['content']
+        return response
+
+    @staticmethod
+    def main():
+        """
+        Main method for quick testing and debugging.
+        Reads the latest debug_summary.json file from the ./logs directory,
+        deserializes the messages, and prints them.
+        """
+        log_dir = Path('./logs')
+        log_files = list(log_dir.glob('debug_summary*.json'))
+
+        if not log_files:
+            print('No debug_summary.json files found in the ./logs directory.')
+            return
+
+        # Sort files to find the latest one based on the numerical suffix
+        def extract_suffix(file_path: Path) -> int:
+            try:
+                suffix = file_path.stem.split('_')[-1]
+                return int(suffix) if suffix.isdigit() else 0
+            except (IndexError, ValueError):
+                return 0
+
+        log_files.sort(key=extract_suffix, reverse=True)
+        latest_log = log_files[0]
+
+        print(f'Loading messages from: {latest_log}')
+
+        try:
+            with latest_log.open('r', encoding='utf-8') as f:
+                messages_data = json.load(f)
+
+            # Deserialize messages using Pydantic's parse_obj
+            messages: list[Message] = [
+                Message.parse_obj(msg_dict) for msg_dict in messages_data
+            ]
+
+            print(f'Successfully loaded {len(messages)} messages:')
+            for msg in messages:
+                print(f'Role: {msg.role}, Content: {msg.content}')
+        except Exception as e:
+            print(f'An error occurred while reading {latest_log}: {e}')
+
+
+if __name__ == '__main__':
+    # Initialize dependencies as needed for testing
+    app_config = load_app_config()
+    llm_config = get_llm_config_arg('deepseek')
+    llm = LLM(config=llm_config)
+    prompt_manager = PromptManager(
+        prompt_dir=os.path.join(
+            os.path.dirname(__file__), '..', 'agenthub', 'memcodeact_agent', 'prompts'
+        ),
+        agent_skills_docs='',
+    )
+    condenser = MemoryCondenser(llm=llm, prompt_manager=prompt_manager)
+    condenser.main()
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index b240aa6fc0a3..668d59fd093a 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -2,6 +2,7 @@
 
 from openhands.controller.state.state import State
 from openhands.core.config.llm_config import LLMConfig
+from openhands.core.logger import openhands_logger as logger
 from openhands.events.action.agent import AgentSummarizeAction
 from openhands.events.serialization.event import event_to_dict
 from openhands.memory.base_memory import Memory
@@ -113,6 +114,8 @@ def update(self, state: State) -> None:
 
         # the number of messages that are hidden from the user
         # is the number of events in summary
+        logger.info(f'state: {state}')
+        logger.info(f'self.state: {self.state}')
         if state.summary:
             self.hidden_message_count = state.summary.end_id - state.summary.start_id
 

From a060cbb88263bbb3e9003539418560bf85b28e4e Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Tue, 22 Oct 2024 04:40:30 +0200
Subject: [PATCH 55/89] fix condensation; add debugging

---
 openhands/core/config/llm_config.py |   4 +-
 openhands/llm/llm.py                |   3 +-
 openhands/memory/condenser.py       | 208 +++++++++++++++++++---------
 3 files changed, 147 insertions(+), 68 deletions(-)

diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index c811ef6ae4df..384341e8804b 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -41,7 +41,7 @@ class LLMConfig:
         log_completions: Whether to log LLM completions to the state.
         max_conversation_window: The maximum number of messages to include in the conversation window (context), after which old messages are truncated or summarized.
         conversation_top_k: The number of top results to retrieve from the conversation history.
-        message_summary_trunc_tokens_fraction: The fraction of the conversation window to summarize (e.g. 0.75 for 75% of the tokens).
+        message_summary_warning_level: The fraction of the conversation window for warning about summarization (e.g. 0.75 for 75% of the tokens).
     """
 
     model: str = 'gpt-4o'
@@ -76,7 +76,7 @@ class LLMConfig:
     log_completions: bool = False
     max_conversation_window: int = 10
     conversation_top_k: int = 5
-    message_summary_trunc_tokens_fraction: float = 0.75
+    message_summary_warning_level: float = 0.75
 
     def defaults_to_dict(self) -> dict:
         """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index a5b6e97993ae..2acb098dcc80 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -25,7 +25,6 @@
     ServiceUnavailableError,
 )
 from litellm.types.utils import CostPerToken, ModelResponse, Usage
-from numpy import dot
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message
@@ -520,7 +519,7 @@ def recall_memory(
         ).reshape(1, -1)
 
         # Compute cosine similarity
-        similarity_scores = dot(query_embedding, embeddings.T)
+        similarity_scores = np.dot(query_embedding, embeddings)
 
         # Get the top_k indices
         top_indices = similarity_scores.argsort()[-top_k:][::-1]
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index ea221e7d6125..27d447efaccf 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,7 +1,10 @@
 import json
 import os
+from datetime import datetime
 from pathlib import Path
 
+from jinja2 import Template
+
 from openhands.core.config.utils import get_llm_config_arg, load_app_config
 from openhands.core.exceptions import SummarizeError
 from openhands.core.logger import openhands_logger as logger
@@ -12,105 +15,135 @@
 from openhands.utils.prompt import PromptManager
 
 WORD_LIMIT = 200
-MESSAGE_SUMMARY_WARNING_FRACTION = 0.75
 
 
 class MemoryCondenser:
-    def __init__(self, llm: LLM, prompt_manager: PromptManager):
+    def __init__(self, llm: LLM, summarize_prompt: Template):
         self.llm = llm
-        self.prompt_manager = prompt_manager
+        self.summarize_prompt = summarize_prompt
+
+        # just easier to read
+        self.context_window = llm.config.max_input_tokens
+        assert (
+            self.context_window is not None and self.context_window > 2000
+        ), 'context window must be a number over 2000'
 
     def condense(
         self,
         messages: list[Message],
-    ):
-        # Start past the system message, and example messages.,
-        # and collect messages for summarization until we reach the desired truncation token fraction (eg 50%)
-        # Do not allow truncation  for in-context examples of function calling
+    ) -> AgentSummarizeAction:
+        """
+        Condenses a list of messages using the LLM and returns a summary action.
+
+        Args:
+            messages (list[Message]): The list of messages to condense.
+
+        Returns:
+            AgentSummarizeAction: The summary action containing the condensed summary.
+        """
+        # don't condense if under the token limit
+        if self.llm.get_token_count(messages) < self.llm.config.max_input_tokens:
+            logger.debug(
+                f'Not condensing messages because token count ({self.llm.get_token_count(messages)}) is less than max input tokens ({self.llm.config.max_input_tokens})'
+            )
+            return AgentSummarizeAction(end_id=-1)
+
+        # the system message and example messages are not condensable
+        # collect messages for summarization until we reach the desired truncation token fraction
         token_counts = [
-            self.llm.get_token_count([message.model_dump()])  # type: ignore
+            self.llm.get_token_count([message.model_dump()])
             for message in messages
             if message.condensable
         ]
-        message_buffer_token_count = sum(token_counts)  # no system and example message
+        message_buffer_token_count = sum(token_counts)
 
         desired_token_count_to_summarize = int(
-            message_buffer_token_count
-            * self.llm.config.message_summary_trunc_tokens_fraction  # type: ignore
+            message_buffer_token_count * self.llm.config.message_summary_warning_level
+        )
+
+        # log status
+        logger.debug(
+            f'{len(messages)} messages in buffer: {message_buffer_token_count} tokens >> '
+            f'{desired_token_count_to_summarize} tokens'
         )
 
         candidate_messages_to_summarize = []
         tokens_so_far = 0
+        last_summarized_event_id = -1
+
+        # collect messages until we reach the desired size
         for message in messages:
             if message.condensable:
                 logger.debug(
                     f'condensable message: {message.event_id}: {str(message.content)[30:]}'
                 )
+                tokens_so_far += self.llm.get_token_count([message.model_dump()])
+            if tokens_so_far <= desired_token_count_to_summarize:
                 candidate_messages_to_summarize.append(message)
-                tokens_so_far += self.llm.get_token_count([message.model_dump()])  # type: ignore
-            if tokens_so_far > desired_token_count_to_summarize:
                 last_summarized_event_id = message.event_id
+            else:
                 break
 
-        logger.debug(
-            f'message_summary_trunc_tokens_fraction={self.llm.config.message_summary_trunc_tokens_fraction}'  # type: ignore
-        )
-        # logger.debug(f'MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST={MESSAGE_SUMMARY_TRUNC_KEEP_N_LAST}')
-        logger.debug(f'token_counts={token_counts}')
-        logger.debug(f'message_buffer_token_count={message_buffer_token_count}')
-        logger.debug(
-            f'desired_token_count_to_summarize={desired_token_count_to_summarize}'
-        )
         logger.debug(
             f'len(candidate_messages_to_summarize)={len(candidate_messages_to_summarize)}'
         )
 
-        if len(candidate_messages_to_summarize) == 0:
-            raise SummarizeError(
-                f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(messages)}]"
-            )
-
-        message_sequence_to_summarize = candidate_messages_to_summarize
-
-        if len(message_sequence_to_summarize) <= 1:
-            # This prevents a potential infinite loop of summarizing the same message over and over
+        if len(candidate_messages_to_summarize) <= 1:
+            # Prevents potential infinite loop of summarizing the same message repeatedly
             raise SummarizeError(
-                f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(message_sequence_to_summarize)} <= 1]"
+                f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(candidate_messages_to_summarize)} <= 1]"
             )
         else:
-            print(
+            logger.debug(
                 f'Attempting to summarize with last summarized event id = {last_summarized_event_id}'
             )
 
-        action_response = self.summarize_messages(
-            message_sequence_to_summarize=message_sequence_to_summarize
+        # perform the operation
+        action_response = self._summarize_messages(
+            message_sequence_to_summarize=candidate_messages_to_summarize
         )
+
+        # we get an AgentSummarizeAction
         summary_action: AgentSummarizeAction = parse_summary_response(action_response)
-        summary_action.end_id = (
-            last_summarized_event_id if last_summarized_event_id else -1
-        )
+        summary_action.end_id = last_summarized_event_id
+
+        # Serialize and save messages along with the summary action for debugging
+        self._save_messages_for_debugging(messages, summary_action)
+
         return summary_action
 
     def _format_summary_history(self, message_history: list[dict]) -> str:
         # TODO use existing prompt formatters for this (eg ChatML)
         return '\n'.join([f'{m["role"]}: {m["content"]}' for m in message_history])
 
-    def summarize_messages(self, message_sequence_to_summarize: list[Message]):
+    def _summarize_messages(self, message_sequence_to_summarize: list[Message]):
         """Summarize a message sequence using LLM"""
-        context_window = self.llm.config.max_input_tokens
-        summary_prompt = self.prompt_manager.summarize_template.render()
+
+        assert self.context_window is not None, 'context window must be set'
+
+        # we have a template to fill in with:
+        # - message history
+
+        # Render the template with the message history
         summary_input = self._format_summary_history(
             self.llm.format_messages_for_llm(message_sequence_to_summarize)
         )
         summary_input_tkns = self.llm.get_token_count(summary_input)
-        if context_window is None:
-            raise ValueError('context_window should not be None')
-        if summary_input_tkns > MESSAGE_SUMMARY_WARNING_FRACTION * context_window:
+
+        # Check if the token count exceeds the allowed summary level
+        if (
+            summary_input_tkns
+            > self.llm.config.message_summary_warning_level * self.context_window
+        ):
             trunc_ratio = (
-                MESSAGE_SUMMARY_WARNING_FRACTION * context_window / summary_input_tkns
+                self.llm.config.message_summary_warning_level
+                * self.context_window
+                / summary_input_tkns
             ) * 0.8  # For good measure...
             cutoff = int(len(message_sequence_to_summarize) * trunc_ratio)
-            curr_summary = self.summarize_messages(
+
+            # Recursively summarize the first part to fit within the context window
+            curr_summary = self._summarize_messages(
                 message_sequence_to_summarize=message_sequence_to_summarize[:cutoff]
             )
             curr_summary_message = (
@@ -127,16 +160,13 @@ def summarize_messages(self, message_sequence_to_summarize: list[Message]):
                 self.llm.format_messages_for_llm(input)
             )
 
-        message_sequence = []
-        message_sequence.append(
-            Message(role='system', content=[TextContent(text=summary_prompt)])
-        )
-        message_sequence.append(
-            Message(role='user', content=[TextContent(text=summary_input)])
+        # build the message to send
+        message = Message(
+            role='system', content=[TextContent(text=self.summarize_prompt)]
         )
 
         response = self.llm.completion(
-            messages=message_sequence,
+            messages=[message],
             stop=[
                 '</execute_ipython>',
                 '</execute_bash>',
@@ -150,29 +180,73 @@ def summarize_messages(self, message_sequence_to_summarize: list[Message]):
         # action_response = response['choices'][0]['message']['content']
         return response
 
+    def _save_messages_for_debugging(
+        self, messages: list[Message], summary_action: AgentSummarizeAction
+    ) -> None:
+        """
+        Serializes the list of Message objects and the summary action,
+        then saves them to a JSON file in the ./logs directory for debugging purposes.
+
+        Args:
+            messages (list[Message]): The list of messages to serialize.
+            summary_action (AgentSummarizeAction): The summary action to append.
+        """
+        # Ensure the logs directory exists
+        log_dir = Path('./logs')
+        log_dir.mkdir(parents=True, exist_ok=True)
+
+        # Generate a timestamped filename
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        filename = f'debug_summary_{timestamp}.json'
+        file_path = log_dir / filename
+
+        try:
+            # Serialize messages using Pydantic's model_dump()
+            serialized_messages = [message.model_dump() for message in messages]
+
+            # Create a Message instance for the summary_action
+            summary_event = Message(
+                role='assistant', content=[TextContent(text=str(summary_action))]
+            )
+            serialized_summary = summary_event.model_dump()
+
+            # Append the serialized summary to the messages
+            serialized_messages.append(serialized_summary)
+
+            with file_path.open('w', encoding='utf-8') as f:
+                json.dump(serialized_messages, f, ensure_ascii=False, indent=4)
+
+            logger.debug(f'Messages successfully saved to {file_path}')
+        except Exception as e:
+            logger.error(f'Failed to save messages for debugging: {e}')
+
     @staticmethod
     def main():
         """
         Main method for quick testing and debugging.
-        Reads the latest debug_summary.json file from the ./logs directory,
+        Reads the latest debug_summary_<timestamp>.json file from the ./logs directory,
         deserializes the messages, and prints them.
         """
         log_dir = Path('./logs')
-        log_files = list(log_dir.glob('debug_summary*.json'))
+        log_files = list(log_dir.glob('debug_summary_*.json'))
 
         if not log_files:
-            print('No debug_summary.json files found in the ./logs directory.')
+            print(
+                'No debug_summary_<timestamp>.json files found in the ./logs directory.'
+            )
             return
 
-        # Sort files to find the latest one based on the numerical suffix
-        def extract_suffix(file_path: Path) -> int:
+        # Sort files to find the latest one based on the timestamp in the filename
+        def extract_timestamp(file_path: Path) -> datetime:
             try:
-                suffix = file_path.stem.split('_')[-1]
-                return int(suffix) if suffix.isdigit() else 0
+                # Extract the timestamp part from the filename
+                timestamp_str = file_path.stem.split('_')[-1]
+                return datetime.strptime(timestamp_str, '%Y%m%d_%H%M%S')
             except (IndexError, ValueError):
-                return 0
+                # If timestamp parsing fails, assign the earliest possible datetime
+                return datetime.min
 
-        log_files.sort(key=extract_suffix, reverse=True)
+        log_files.sort(key=extract_timestamp, reverse=True)
         latest_log = log_files[0]
 
         print(f'Loading messages from: {latest_log}')
@@ -197,12 +271,18 @@ def extract_suffix(file_path: Path) -> int:
     # Initialize dependencies as needed for testing
     app_config = load_app_config()
     llm_config = get_llm_config_arg('deepseek')
-    llm = LLM(config=llm_config)
+    if llm_config is not None:
+        llm = LLM(config=llm_config)
+    else:
+        llm = LLM(app_config.get_llm_config('llm'))
+
     prompt_manager = PromptManager(
         prompt_dir=os.path.join(
             os.path.dirname(__file__), '..', 'agenthub', 'memcodeact_agent', 'prompts'
         ),
         agent_skills_docs='',
     )
-    condenser = MemoryCondenser(llm=llm, prompt_manager=prompt_manager)
+    condenser = MemoryCondenser(
+        llm=llm, summarize_prompt=prompt_manager.summarize_template
+    )
     condenser.main()

From a25a867ed53f5ad69f79dfbee23d8f657d2cc4c4 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Tue, 22 Oct 2024 06:33:41 +0200
Subject: [PATCH 56/89] add tokenizer from HF

---
 .../memcodeact_agent/memcodeact_agent.py      |  6 ++--
 openhands/core/config/llm_config.py           |  4 ++-
 openhands/llm/llm.py                          | 32 +++++++++++++++++--
 openhands/utils/prompt.py                     | 25 +++++++++------
 4 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 7a9db32aff51..9a8558a52092 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -373,10 +373,10 @@ def _get_messages(self, state: State) -> list[Message]:
 
         return messages
 
-    def summarize_messages_inplace(self, state: State):
+    def summarize_messages_inplace(self, state: State) -> AgentSummarizeAction:
         """Summarizes the earlier messages in the agent's memory to reduce token usage. Uses memGPT's algorithm for in-place summarization."""
         if len(state.history) <= 2:
-            return
+            return None  # ignore
 
         # summarize the conversation history using the condenser
         # conversation_memory.history will include the previous summary, if any, while the regular state.history does not
@@ -387,3 +387,5 @@ def summarize_messages_inplace(self, state: State):
         summary = condenser.condense(messages)
 
         logger.debug(f'Summarized conversation history to: {summary}')
+
+        return summary
diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index 384341e8804b..95b932afe0ac 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -41,7 +41,8 @@ class LLMConfig:
         log_completions: Whether to log LLM completions to the state.
         max_conversation_window: The maximum number of messages to include in the conversation window (context), after which old messages are truncated or summarized.
         conversation_top_k: The number of top results to retrieve from the conversation history.
-        message_summary_warning_level: The fraction of the conversation window for warning about summarization (e.g. 0.75 for 75% of the tokens).
+        message_summary_warning_level: The fraction of the conversation window for warning about context overflow (e.g. 0.75 for 75% of the tokens).
+        custom_tokenizer: tokenizer to use for computing token size. Not necessary for Open AI, Anthropic. LiteLLM will check HuggingFace for this (e.g. 'deepseek-ai/deepseek-V2.5')
     """
 
     model: str = 'gpt-4o'
@@ -77,6 +78,7 @@ class LLMConfig:
     max_conversation_window: int = 10
     conversation_top_k: int = 5
     message_summary_warning_level: float = 0.75
+    custom_tokenizer: str | None = None
 
     def defaults_to_dict(self) -> dict:
         """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 2acb098dcc80..335b80babaf6 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -126,6 +126,15 @@ def __init__(
                 ):
                     self.config.max_output_tokens = self.model_info['max_tokens']
 
+        # if using a custom tokenizer, make sure it's loaded and accessible in the format expected by litellm
+        if self.config.custom_tokenizer is not None:
+            # FIXME: transformers is not a regular dependency, but we need it here
+            from transformers import AutoTokenizer
+
+            tokenizer = AutoTokenizer.from_pretrained(self.config.custom_tokenizer)
+            self.tokenizer = {'type': 'huggingface_tokenizer', 'tokenizer': tokenizer}
+
+        # set up the completion function
         self._completion = partial(
             litellm_completion,
             model=self.config.model,
@@ -327,15 +336,32 @@ def get_token_count(self, messages):
         """Get the number of tokens in a list of messages.
 
         Args:
-            messages (list): A list of messages.
+            messages (list): A list of messages, either as a list of dicts or as a list of Message objects.
 
         Returns:
             int: The number of tokens.
         """
+        # convert Message objects to dicts, litellm expects dicts
+        if (
+            isinstance(messages, list)
+            and len(messages) > 0
+            and isinstance(messages[0], Message)
+        ):
+            messages = self.format_messages_for_llm(messages)
+
+        # try to get the token count with the default litellm tokenizers
+        # or the custom tokenizer attribute if set for this LLM configuration
         try:
-            return litellm.token_counter(model=self.config.model, messages=messages)
-        except Exception:
+            return litellm.token_counter(
+                model=self.config.model,
+                messages=messages,
+                custom_tokenizer=self.tokenizer,
+            )
+        except Exception as e:
             # TODO: this is to limit logspam in case token count is not supported
+            logger.error(
+                f'Error getting token count for\n model {self.config.model}\ncustom_tokenizer: {self.config.custom_tokenizer}\n{e}'
+            )
             return 0
 
     def _is_local(self):
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index bae9ce7bc6ef..def7f002a4cc 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -1,7 +1,8 @@
-import os
 from jinja2 import Environment, FileSystemLoader, Template, select_autoescape
+
 from openhands.utils.microagent import MicroAgent
 
+
 class PromptManager:
     """
     Manages prompt templates and micro-agents for AI interactions.
@@ -25,11 +26,12 @@ def __init__(
         self.prompt_dir: str = prompt_dir
         self.agent_skills_docs: str = agent_skills_docs
         self.micro_agent: MicroAgent | None = micro_agent
+        self.conversation_history: str = ''
 
         # initialize Jinja2 Environment with FileSystemLoader
         self.env = Environment(
             loader=FileSystemLoader(self.prompt_dir),
-            autoescape=select_autoescape(['j2','md'])
+            autoescape=select_autoescape(['j2', 'md']),
         )
 
         # load templates using the environment
@@ -41,10 +43,10 @@ def __init__(
     def _load_template(self, template_name: str):
         """
         Loads a Jinja2 template using the configured environment.
-        
+
         Args:
             template_name: The base name of the template file
-        
+
         Returns:
             Template: The loaded Jinja2 template.
         """
@@ -54,12 +56,13 @@ def _load_template(self, template_name: str):
             return template
         except Exception as e:
             print(f'Error loading template {template_name}: {e}')
-            return Template('') 
+            return Template('')
+
     @property
     def system_message(self) -> str:
         """
         Renders the system message template with the necessary variables.
-        
+
         Returns:
             str: The rendered system message.
         """
@@ -73,7 +76,7 @@ def system_message(self) -> str:
     def initial_user_message(self) -> str:
         """
         Renders the initial user message template.
-        
+
         Returns:
             str: The rendered initial user message.
         """
@@ -86,9 +89,11 @@ def initial_user_message(self) -> str:
     def summarize_message(self) -> str:
         """
         Renders the summarize message template.
-        
+
         Returns:
             str: The rendered summarize message.
         """
-        rendered = self.summarize_template.render()
-        return rendered.strip()
\ No newline at end of file
+        rendered = self.summarize_template.render(
+            conversation_history=self.conversation_history
+        )
+        return rendered.strip()

From 2a448f212cc824bcb5a5220b1aa526e999185bd9 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Tue, 22 Oct 2024 07:55:47 +0200
Subject: [PATCH 57/89] adapt action, prompt, some clean up logic

---
 .../prompts/summarize_prompt.j2               |  16 +-
 openhands/events/action/agent.py              |   2 +
 openhands/events/serialization/action.py      |   4 +
 openhands/llm/llm.py                          |   7 +-
 openhands/memory/condenser.py                 | 184 ++++--------------
 openhands/memory/utils.py                     |   3 +
 6 files changed, 56 insertions(+), 160 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
index 466491e7f839..9f95dba5625e 100644
--- a/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
@@ -7,13 +7,19 @@ This meant that when conversations exceeded a certain length, they would overflo
 Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
 Your ability to summarize and recall your own long-term memory is a key part of what makes you a very helpful model.
 
-NOW:
+IMPORTANT:
 Your memory is dangerously close to overflowing. So your task is to summarize your conversation history effectively.
 YOU MUST DO IT RIGHT NOW OR YOU WILL LOSE INFORMATION PERMANENTLY AND FAIL YOUR TASK.
 
-The conversation below is a from a fixed context window and may not be complete. Keep your summary concise and relevant. Include all the critical information in conversation till now in the summary.
+The conversation below is a from a fixed context window and may not be complete. Keep your summary concise and relevant. Include all the critical information that YOU NEED TO REMEMBER in the summary.
 Only output the summary, do NOT include anything else in your output.
 
+Messages sent by the AI are marked with the 'assistant' role.
+The AI 'assistant' can also make calls to functions, whose outputs can be seen in messages with the 'function' role.
+Messages the user sends are in the 'user' role.
+The 'user' role is also used for environment observations, which are important for your task.
+Summarize what happened in the conversation from the perspective of the AI (use the first person).
+
 Create a JSON response with:
     - "action": "summarize"
     - args:
@@ -31,14 +37,10 @@ Example:
 Make sure to include in the summary any relevant keywords, because you will need to look it up.
 
 YOU MUST RESPOND WITH JSON.
-I REPEAT, YOU MUST RESPOND WITH JSON.
-
-The following is the conversation history you need to summarize:
 
-CONVERSATION HISTORY:
+NOW, the CONVERSATION HISTORY:
 -------------------------------------
 {{ conversation_history }}
 -------------------------------------
-
 {% endset %}
 {{ SUMMARIZE_PROMPT }}
diff --git a/openhands/events/action/agent.py b/openhands/events/action/agent.py
index 1c7e6bc39068..02c810b6cf1f 100644
--- a/openhands/events/action/agent.py
+++ b/openhands/events/action/agent.py
@@ -24,6 +24,8 @@ class AgentSummarizeAction(Action):
     action: str = ActionType.SUMMARIZE
     start_id: int = -1
     end_id: int = -1
+    summarized_actions: str = ''
+    summarized_observations: str = ''
 
     @property
     def message(self) -> str:
diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py
index f40e0ef0c6ed..3e8a5ad2baf0 100644
--- a/openhands/events/serialization/action.py
+++ b/openhands/events/serialization/action.py
@@ -3,7 +3,9 @@
 from openhands.events.action.agent import (
     AgentDelegateAction,
     AgentFinishAction,
+    AgentRecallAction,
     AgentRejectAction,
+    AgentSummarizeAction,
     ChangeAgentStateAction,
 )
 from openhands.events.action.browse import BrowseInteractiveAction, BrowseURLAction
@@ -31,6 +33,8 @@
     ModifyTaskAction,
     ChangeAgentStateAction,
     MessageAction,
+    AgentSummarizeAction,
+    AgentRecallAction,
 )
 
 ACTION_TYPE_TO_CLASS = {action_class.action: action_class for action_class in actions}  # type: ignore[attr-defined]
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 335b80babaf6..94d6c9495f3a 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -25,6 +25,7 @@
     ServiceUnavailableError,
 )
 from litellm.types.utils import CostPerToken, ModelResponse, Usage
+from litellm.utils import create_pretrained_tokenizer
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message
@@ -128,11 +129,7 @@ def __init__(
 
         # if using a custom tokenizer, make sure it's loaded and accessible in the format expected by litellm
         if self.config.custom_tokenizer is not None:
-            # FIXME: transformers is not a regular dependency, but we need it here
-            from transformers import AutoTokenizer
-
-            tokenizer = AutoTokenizer.from_pretrained(self.config.custom_tokenizer)
-            self.tokenizer = {'type': 'huggingface_tokenizer', 'tokenizer': tokenizer}
+            self.tokenizer = create_pretrained_tokenizer(self.config.custom_tokenizer)
 
         # set up the completion function
         self._completion = partial(
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index 27d447efaccf..746e3e36f044 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,11 +1,5 @@
-import json
-import os
-from datetime import datetime
-from pathlib import Path
+from litellm.types.utils import ModelResponse
 
-from jinja2 import Template
-
-from openhands.core.config.utils import get_llm_config_arg, load_app_config
 from openhands.core.exceptions import SummarizeError
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import Message, TextContent
@@ -18,9 +12,9 @@
 
 
 class MemoryCondenser:
-    def __init__(self, llm: LLM, summarize_prompt: Template):
+    def __init__(self, llm: LLM, prompt_manager: PromptManager):
         self.llm = llm
-        self.summarize_prompt = summarize_prompt
+        self.prompt_manager = prompt_manager
 
         # just easier to read
         self.context_window = llm.config.max_input_tokens
@@ -42,9 +36,10 @@ def condense(
             AgentSummarizeAction: The summary action containing the condensed summary.
         """
         # don't condense if under the token limit
-        if self.llm.get_token_count(messages) < self.llm.config.max_input_tokens:
+        total_token_count = self.llm.get_token_count(messages)
+        if total_token_count < self.context_window:
             logger.debug(
-                f'Not condensing messages because token count ({self.llm.get_token_count(messages)}) is less than max input tokens ({self.llm.config.max_input_tokens})'
+                f'Not condensing messages because token count ({total_token_count}) is less than max input tokens ({self.context_window})'
             )
             return AgentSummarizeAction(end_id=-1)
 
@@ -67,7 +62,7 @@ def condense(
             f'{desired_token_count_to_summarize} tokens'
         )
 
-        candidate_messages_to_summarize = []
+        candidate_messages_to_summarize: list[Message] = []
         tokens_so_far = 0
         last_summarized_event_id = -1
 
@@ -75,7 +70,7 @@ def condense(
         for message in messages:
             if message.condensable:
                 logger.debug(
-                    f'condensable message: {message.event_id}: {str(message.content)[30:]}'
+                    f'condensable message: {message.event_id}: {str(message.content)[:30]}'
                 )
                 tokens_so_far += self.llm.get_token_count([message.model_dump()])
             if tokens_so_far <= desired_token_count_to_summarize:
@@ -124,165 +119,58 @@ def _summarize_messages(self, message_sequence_to_summarize: list[Message]):
         # we have a template to fill in with:
         # - message history
 
-        # Render the template with the message history
-        summary_input = self._format_summary_history(
-            self.llm.format_messages_for_llm(message_sequence_to_summarize)
-        )
-        summary_input_tkns = self.llm.get_token_count(summary_input)
+        # FIXME: Render the template with the message history
+        token_count = self.llm.get_token_count(message_sequence_to_summarize)
 
-        # Check if the token count exceeds the allowed summary level
+        # check if the token count exceeds the allowed summary level
         if (
-            summary_input_tkns
+            token_count
             > self.llm.config.message_summary_warning_level * self.context_window
         ):
             trunc_ratio = (
                 self.llm.config.message_summary_warning_level
                 * self.context_window
-                / summary_input_tkns
+                / token_count
             ) * 0.8  # For good measure...
             cutoff = int(len(message_sequence_to_summarize) * trunc_ratio)
 
-            # Recursively summarize the first part to fit within the context window
-            curr_summary = self._summarize_messages(
-                message_sequence_to_summarize=message_sequence_to_summarize[:cutoff]
+            # recursively summarize the first part to fit within the context window
+            curr_summary: AgentSummarizeAction = parse_summary_response(
+                self._summarize_messages(
+                    message_sequence_to_summarize=message_sequence_to_summarize[:cutoff]
+                )
             )
+
+            # prepare for the next round
             curr_summary_message = (
                 'Summary of all Action and Observations till now: \n'
-                + curr_summary['summary']
+                + curr_summary.summary
             )
             logger.debug(f'curr_summary_message: {curr_summary_message}')
 
+            # the rest of the messages
+            message_sequence_to_summarize = message_sequence_to_summarize[cutoff:]
+
             curr_summary_message = [TextContent(text=curr_summary_message)]
-            input = [
-                Message({'role': 'user', 'content': curr_summary_message})
-            ] + message_sequence_to_summarize[cutoff:]
-            summary_input = self._format_summary_history(
-                self.llm.format_messages_for_llm(input)
+            message_sequence_to_summarize.insert(
+                0, Message(role='user', content=curr_summary_message)
             )
 
         # build the message to send
-        message = Message(
-            role='system', content=[TextContent(text=self.summarize_prompt)]
+        self.prompt_manager.conversation_history = self.llm.format_messages_for_llm(
+            message_sequence_to_summarize
         )
+        summarize_prompt = self.prompt_manager.summarize_message
+        message = Message(role='system', content=[TextContent(text=summarize_prompt)])
+        serialized_message = message.model_dump()
 
         response = self.llm.completion(
-            messages=[message],
-            stop=[
-                '</execute_ipython>',
-                '</execute_bash>',
-                '</execute_browse>',
-            ],
-            temperature=0.0,
+            messages=[serialized_message],
+            temperature=0.2,
         )
 
         print(f'summarize_messages got response: {response}')
+        assert isinstance(response, ModelResponse), 'response must be a ModelResponse'
 
-        # action_response = response['choices'][0]['message']['content']
-        return response
-
-    def _save_messages_for_debugging(
-        self, messages: list[Message], summary_action: AgentSummarizeAction
-    ) -> None:
-        """
-        Serializes the list of Message objects and the summary action,
-        then saves them to a JSON file in the ./logs directory for debugging purposes.
-
-        Args:
-            messages (list[Message]): The list of messages to serialize.
-            summary_action (AgentSummarizeAction): The summary action to append.
-        """
-        # Ensure the logs directory exists
-        log_dir = Path('./logs')
-        log_dir.mkdir(parents=True, exist_ok=True)
-
-        # Generate a timestamped filename
-        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-        filename = f'debug_summary_{timestamp}.json'
-        file_path = log_dir / filename
-
-        try:
-            # Serialize messages using Pydantic's model_dump()
-            serialized_messages = [message.model_dump() for message in messages]
-
-            # Create a Message instance for the summary_action
-            summary_event = Message(
-                role='assistant', content=[TextContent(text=str(summary_action))]
-            )
-            serialized_summary = summary_event.model_dump()
-
-            # Append the serialized summary to the messages
-            serialized_messages.append(serialized_summary)
-
-            with file_path.open('w', encoding='utf-8') as f:
-                json.dump(serialized_messages, f, ensure_ascii=False, indent=4)
-
-            logger.debug(f'Messages successfully saved to {file_path}')
-        except Exception as e:
-            logger.error(f'Failed to save messages for debugging: {e}')
-
-    @staticmethod
-    def main():
-        """
-        Main method for quick testing and debugging.
-        Reads the latest debug_summary_<timestamp>.json file from the ./logs directory,
-        deserializes the messages, and prints them.
-        """
-        log_dir = Path('./logs')
-        log_files = list(log_dir.glob('debug_summary_*.json'))
-
-        if not log_files:
-            print(
-                'No debug_summary_<timestamp>.json files found in the ./logs directory.'
-            )
-            return
-
-        # Sort files to find the latest one based on the timestamp in the filename
-        def extract_timestamp(file_path: Path) -> datetime:
-            try:
-                # Extract the timestamp part from the filename
-                timestamp_str = file_path.stem.split('_')[-1]
-                return datetime.strptime(timestamp_str, '%Y%m%d_%H%M%S')
-            except (IndexError, ValueError):
-                # If timestamp parsing fails, assign the earliest possible datetime
-                return datetime.min
-
-        log_files.sort(key=extract_timestamp, reverse=True)
-        latest_log = log_files[0]
-
-        print(f'Loading messages from: {latest_log}')
-
-        try:
-            with latest_log.open('r', encoding='utf-8') as f:
-                messages_data = json.load(f)
-
-            # Deserialize messages using Pydantic's parse_obj
-            messages: list[Message] = [
-                Message.parse_obj(msg_dict) for msg_dict in messages_data
-            ]
-
-            print(f'Successfully loaded {len(messages)} messages:')
-            for msg in messages:
-                print(f'Role: {msg.role}, Content: {msg.content}')
-        except Exception as e:
-            print(f'An error occurred while reading {latest_log}: {e}')
-
-
-if __name__ == '__main__':
-    # Initialize dependencies as needed for testing
-    app_config = load_app_config()
-    llm_config = get_llm_config_arg('deepseek')
-    if llm_config is not None:
-        llm = LLM(config=llm_config)
-    else:
-        llm = LLM(app_config.get_llm_config('llm'))
-
-    prompt_manager = PromptManager(
-        prompt_dir=os.path.join(
-            os.path.dirname(__file__), '..', 'agenthub', 'memcodeact_agent', 'prompts'
-        ),
-        agent_skills_docs='',
-    )
-    condenser = MemoryCondenser(
-        llm=llm, summarize_prompt=prompt_manager.summarize_template
-    )
-    condenser.main()
+        action_response = response.choices[0].message.content
+        return action_response
diff --git a/openhands/memory/utils.py b/openhands/memory/utils.py
index 9f6f4de8f0bd..f49479ec4fb1 100644
--- a/openhands/memory/utils.py
+++ b/openhands/memory/utils.py
@@ -28,6 +28,9 @@ def parse_summary_response(response: str) -> AgentSummarizeAction:
             logger.error(error_message)
             raise InvalidSummaryResponseError(error_message)
         action._source = EventSource.AGENT  # type: ignore
+        action.summary = (
+            action.summarized_actions + '\n' + action.summarized_observations
+        )
     except (LLMResponseError, LLMMalformedActionError) as e:
         logger.error(f'Failed to parse summary response: {str(e)}')
         raise InvalidSummaryResponseError(

From 6f9c922cd7b89c7948e3bf2ff21d626602b84008 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Tue, 22 Oct 2024 08:34:25 +0200
Subject: [PATCH 58/89] remove eval script

---
 evaluation/swe_bench/process_evals.py | 194 --------------------------
 1 file changed, 194 deletions(-)
 delete mode 100644 evaluation/swe_bench/process_evals.py

diff --git a/evaluation/swe_bench/process_evals.py b/evaluation/swe_bench/process_evals.py
deleted file mode 100644
index 3202d10de758..000000000000
--- a/evaluation/swe_bench/process_evals.py
+++ /dev/null
@@ -1,194 +0,0 @@
-import json
-import os
-from typing import Any, Dict, List, Optional
-
-import pandas as pd
-
-
-# Placeholder for litellm's get_tokens function
-def get_tokens(text: str) -> int:
-    """
-    Returns the number of tokens in the given text.
-    Replace this function with litellm's actual get_tokens implementation.
-    """
-    # Example implementation (to be replaced)
-    return len(text.split())
-
-
-class EvalOutput:
-    def __init__(
-        self,
-        instance_id: str,
-        instruction: str,
-        instance: Dict[str, Any],
-        test_result: Dict[str, Any],
-        metadata: Dict[str, Any],
-        history: List[Dict[str, Any]],
-        metrics: Optional[Dict[str, Any]] = None,
-        llm_completions: Optional[List[Any]] = None,
-        error: Optional[str] = None,
-    ):
-        self.instance_id = instance_id
-        self.instruction = instruction
-        self.instance = instance
-        self.test_result = test_result
-        self.metadata = metadata
-        self.history = history
-        self.metrics = metrics
-        self.llm_completions = llm_completions or []
-        self.error = error
-
-    @staticmethod
-    def from_dict(data: Dict[str, Any]) -> 'EvalOutput':
-        return EvalOutput(
-            instance_id=data.get('instance_id', ''),
-            instruction=data.get('instruction', ''),
-            instance=data.get('instance', {}),
-            test_result=data.get('test_result', {}),
-            metadata=data.get('metadata', {}),
-            history=data.get('history', []),
-            metrics=data.get('metrics', None),
-            llm_completions=data.get('llm_completions', []),
-            error=data.get('error', None),
-        )
-
-
-def load_eval_outputs(jsonl_path: str) -> List[EvalOutput]:
-    eval_outputs = []
-    with open(jsonl_path, 'r') as file:
-        content = file.read()
-        try:
-            # Try parsing as a single JSON object
-            data = json.loads(content)
-            eval_output = EvalOutput.from_dict(data)
-            eval_outputs.append(eval_output)
-        except json.JSONDecodeError:
-            # If single JSON parse fails, try parsing as JSON Lines
-            file.seek(0)
-            for line_number, line in enumerate(file, start=1):
-                if line.strip():  # Skip empty lines
-                    try:
-                        data = json.loads(line)
-                        eval_output = EvalOutput.from_dict(data)
-                        eval_outputs.append(eval_output)
-                    except json.JSONDecodeError as e:
-                        print(
-                            f'Failed to parse line {line_number} in {jsonl_path}: {e}'
-                        )
-                        print(
-                            f'Problematic line: {line[:100]}...'
-                        )  # Print first 100 chars of the line
-
-    if not eval_outputs:
-        print(f'Warning: No valid data found in {jsonl_path}')
-
-    return eval_outputs
-
-
-def process_llm_completions(eval_output: EvalOutput) -> List[Dict[str, Any]]:
-    """
-    Processes the llm_completions of an EvalOutput to extract prompts (including system prompt) and responses.
-    Handles both dictionary and string content formats.
-
-    Args:
-        eval_output (EvalOutput): The evaluation output instance.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing 'system_prompt', 'user_prompts', and 'response'.
-    """
-    completions = []
-    for comp in eval_output.llm_completions:
-        messages = comp.get('messages', [])
-        response_content = comp.get('response', {}).get('choices', [])
-
-        # Initialize prompts and response
-        system_prompt = ''
-        user_prompts = []
-        response = ''
-
-        # Extract system prompt and user prompts
-        for message in messages:
-            role = message.get('role')
-            content = message.get('content', [])
-
-            if role == 'system':
-                system_prompt = _extract_text_content(content)
-            elif role == 'user':
-                user_prompts.append(_extract_text_content(content))
-
-        # Extract the assistant's response
-        if response_content and len(response_content) > 0:
-            response = response_content[0].get('message', {}).get('content', '').strip()
-
-        completions.append(
-            {
-                'system_prompt': system_prompt,
-                'user_prompts': user_prompts,
-                'response': response,
-            }
-        )
-    return completions
-
-
-def _extract_text_content(content):
-    if isinstance(content, list):
-        return '\n'.join(
-            item.get('text', '').strip()
-            for item in content
-            if isinstance(item, dict) and item.get('type') == 'text'
-        )
-    elif isinstance(content, str):
-        return content.strip()
-    return ''
-
-
-def create_results_dataframe(eval_outputs: List[EvalOutput]) -> pd.DataFrame:
-    records = []
-    for eo in eval_outputs:
-        completions = process_llm_completions(eo)
-        for comp in completions:
-            prompt = comp['prompt']
-            response = comp['response']
-            token_count = get_tokens(prompt)
-            records.append(
-                {
-                    'instance_id': eo.instance_id,
-                    'prompt': prompt,
-                    'response': response,
-                    'token_count': token_count,
-                }
-            )
-    df = pd.DataFrame(records)
-    return df
-
-
-def main():
-    swe_bench_dir = 'evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/'
-    results = []
-
-    # Traverse through all subdirectories to find output.jsonl files
-    for root, dirs, files in os.walk(swe_bench_dir):
-        for file in files:
-            if file.endswith('output.jsonl') or file.endswith('output-pretty.jsonl'):
-                jsonl_path = os.path.join(root, file)
-                print(f'Processing file: {jsonl_path}')
-                try:
-                    eval_outputs = load_eval_outputs(jsonl_path)
-                    if eval_outputs:
-                        df = create_results_dataframe(eval_outputs)
-                        results.append(df)
-                    else:
-                        print(f'No valid data found in {jsonl_path}')
-                except Exception as e:
-                    print(f'Error processing {jsonl_path}: {e}')
-
-    if results:
-        final_df = pd.concat(results, ignore_index=True)
-        final_df.to_csv('swe_bench_evaluation_results.csv', index=False)
-        print('Results saved to swe_bench_evaluation_results.csv')
-    else:
-        print('No valid data found in any of the processed files.')
-
-
-if __name__ == '__main__':
-    main()

From 81b19c268c3cac15c9c0a27a96232dcfe0272791 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Tue, 22 Oct 2024 08:44:48 +0200
Subject: [PATCH 59/89] add script for testing, clean up obsolete content

---
 openhands/agenthub/memcodeact_agent/README.md |  13 +-
 openhands/controller/agent_controller.py      |   2 +-
 openhands/llm/llm.py                          |  28 +--
 openhands/memory/condenser.py                 |   8 -
 openhands/memory/conversation_memory.py       |  47 ++---
 openhands/utils/prompt.py                     |  12 +-
 tests/unit/test_condenser.py                  | 169 ++++++++++++++----
 7 files changed, 165 insertions(+), 114 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/README.md b/openhands/agenthub/memcodeact_agent/README.md
index a7f147379dec..628c94d57d3d 100644
--- a/openhands/agenthub/memcodeact_agent/README.md
+++ b/openhands/agenthub/memcodeact_agent/README.md
@@ -2,18 +2,11 @@
 
 ## Introduction
 
-`memcodeact_agent` is a memory-enabled experimental agent built upon the foundation of the existing `codeact_agent`. Designed to enhance the capabilities of autonomous agents, `memcodeact_agent` incorporates advanced memory functionalities inspired by recent advancements in generative AI research. This agent leverages memory to improve task execution, context retention, and overall performance, making it more adept at handling complex and extended interactions.
-
-## Features
-
-- **Memory Integration:** Retains context across multiple interactions, allowing for more coherent and contextually aware responses.
-- **Enhanced Action Parsing:** Utilizes a memory-augmented action parser to interpret and execute complex commands effectively.
-- **Improved Task Management:** Manages and retrieves past actions and observations to inform current decision-making processes.
-- **Experimental Capabilities:** Serves as a platform for testing and refining memory-related functionalities in AI agents.
+`memcodeact_agent` is a memory-enabled experimental agent built upon the foundation of the existing `codeact_agent`, incorporating memory functionalities.
 
 ## Inspiration and Research
 
-The development of `memcodeact_agent` is inspired by two pivotal research papers in the field of generative AI and memory-augmented models:
+The development of `memcodeact_agent` is inspired by two research papers in the field of generative AI and memory-augmented models:
 
 1. **Extending Generative AI with Memory**
    - **Paper:** [Extending Generative AI with Memory](https://arxiv.org/pdf/2304.03442)
@@ -21,7 +14,7 @@ The development of `memcodeact_agent` is inspired by two pivotal research papers
 
 2. **MemGPT: Memory-Enhanced GPT Models**
    - **Paper:** [MemGPT: Memory-Enhanced GPT Models](https://arxiv.org/pdf/2310.08560)
-   - **Summary:** MemGPT introduces a novel architecture that incorporates external memory modules into GPT models. This integration allows the model to access and update its memory dynamically during interactions. The paper details the implementation of memory layers, attention mechanisms for memory retrieval, and training methodologies that enable the model to learn from both its internal parameters and external memory. The results demonstrate significant improvements in tasks requiring long-term dependency understanding and information recall.
+   - **Summary:** MemGPT introduces a novel architecture that incorporates external memory modules into GPT models. This integration allows the model to access and update its memory dynamically during interactions. The results demonstrate significant improvements in tasks requiring information recall.
 
 ## Getting Started
 
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 2d0ad73235d9..56e7cf1b1d08 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -296,7 +296,7 @@ async def _handle_action(self, action: Action):
         elif isinstance(action, AgentRecallAction):
             # llama_index_list = self.long_term_memory.search(action.query, action.history)
             # logger.info(f'llama-index list: {llama_index_list}')
-            litellm_list = self.agent.llm.recall_memory(
+            litellm_list = self.agent.llm.search(
                 action.query, self.state.history
             )
             logger.info(f'litellm list: {litellm_list}')
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 94d6c9495f3a..d7fac7001813 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -439,32 +439,6 @@ def format_messages_for_llm(self, messages: Message | list[Message]) -> list[dic
         # let pydantic handle the serialization
         return [message.model_dump() for message in messages]
 
-    def summarize_messages(self, messages: list[Message]) -> str:
-        """Summarizes a list of messages using the LLM."""
-        token_counts = [self.get_token_count(m) for m in messages]
-        max_tokens = (
-            self.model_info.get('max_tokens', 4096)
-            if self.model_info is not None
-            else 4096
-        )
-        desired_token_count_to_summarize = max_tokens // 4
-        tokens_so_far = 0
-        cutoff = 0
-        for i, msg in enumerate(messages):
-            cutoff = i
-            tokens_so_far += token_counts[i]
-            if tokens_so_far > desired_token_count_to_summarize:
-                break
-        cutoff += 1
-        message_sequence_to_summarize = messages[1:cutoff]
-        if len(message_sequence_to_summarize) <= 1:
-            return ''
-        prompt = 'Please summarize the following conversation:\n\n'
-        for message in message_sequence_to_summarize:
-            prompt += f'{message.role.capitalize()}: {message.content}\n'
-        response = self._completion(messages=[{'role': 'user', 'content': prompt}])
-        return response['choices'][0]['message']['content']
-
     def embed_event(self, event: Event) -> np.ndarray:
         """
         Embeds a single event using the embedding model.
@@ -507,7 +481,7 @@ def embed_history(self, history: list[Event]) -> list[np.ndarray]:
             embeddings.append(embedding)
         return embeddings
 
-    def recall_memory(
+    def search(
         self, query: str, history: list[Event], top_k: int = 5
     ) -> list[Event]:
         """
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index 746e3e36f044..45dc3c418adc 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -107,19 +107,11 @@ def condense(
 
         return summary_action
 
-    def _format_summary_history(self, message_history: list[dict]) -> str:
-        # TODO use existing prompt formatters for this (eg ChatML)
-        return '\n'.join([f'{m["role"]}: {m["content"]}' for m in message_history])
-
     def _summarize_messages(self, message_sequence_to_summarize: list[Message]):
         """Summarize a message sequence using LLM"""
 
         assert self.context_window is not None, 'context window must be set'
 
-        # we have a template to fill in with:
-        # - message history
-
-        # FIXME: Render the template with the message history
         token_count = self.llm.get_token_count(message_sequence_to_summarize)
 
         # check if the token count exceeds the allowed summary level
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index 668d59fd093a..f89b786cc882 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -4,7 +4,9 @@
 from openhands.core.config.llm_config import LLMConfig
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action.agent import AgentSummarizeAction
+from openhands.events.event import Event
 from openhands.events.serialization.event import event_to_dict
+from openhands.llm.llm import LLM
 from openhands.memory.base_memory import Memory
 
 TOP_K = 10
@@ -67,55 +69,30 @@ def text_search(
         Returns:
             A tuple containing the list of matching messages and the total number of matches.
         """
-        # return self.long_term_memory.text_search(query, count, start)
-        pass
+        return self.long_term_memory.text_search(query, count, start)
 
-    def date_search(
-        self,
-        start_date: str,
-        end_date: str,
-        count: int | None = None,
-        start: int | None = None,
-    ) -> tuple[list[str], int]:
+    def recall_memory(
+        self, llm: LLM, query: str, top_k: int = 5
+    ) -> list[Event]:
         """
-        Perform a date-based search on LongTermMemory.
-
-        Args:
-            start_date: Start date in YYYY-MM-DD format.
-            end_date: End date in YYYY-MM-DD format.
-            count: Number of results to return.
-            start: Pagination start index.
-
-        Returns:
-            A tuple containing the list of matching messages and the total number of matches.
-        """
-        # return self.long_term_memory.date_search(start_date, end_date, count, start)
-        pass
-
-    def embedding_search(
-        self, query: str, count: int | None = None, start: int | None = None
-    ) -> tuple[list[str], int]:
-        """
-        Perform an embedding-based semantic search on LongTermMemory.
+        Get the most similar events based on the query.
 
         Args:
             query: The query string for semantic search.
-            count: Number of results to return.
-            start: Pagination start index.
+            top_k: Number of top results to retrieve.
 
         Returns:
-            A tuple containing the list of semantically similar messages and the total number of matches.
+            A list of semantically similar events.
         """
-        # return self.long_term_memory.search(query, count, start)
-        pass
+        # get the most similar events based on the query
+        # for testing recall with litellm
+        return llm.search(query, self.state.history, top_k)
 
     def update(self, state: State) -> None:
         """Update the conversation memory with information from the new events."""
 
         # the number of messages that are hidden from the user
         # is the number of events in summary
-        logger.info(f'state: {state}')
-        logger.info(f'self.state: {self.state}')
         if state.summary:
             self.hidden_message_count = state.summary.end_id - state.summary.start_id
 
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index def7f002a4cc..b46093e68a98 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -52,9 +52,9 @@ def _load_template(self, template_name: str):
         """
         try:
             template = self.env.get_template(f'{template_name}.j2')
-            print(f'Loaded template {template_name}')
             return template
         except Exception as e:
+            # FIXME let it fail without crashing
             print(f'Error loading template {template_name}: {e}')
             return Template('')
 
@@ -74,8 +74,14 @@ def system_message(self) -> str:
 
     @property
     def initial_user_message(self) -> str:
-        """
-        Renders the initial user message template.
+        """This is the initial user message provided to the agent
+        before *actual* user instructions are provided.
+
+        It is used to provide a demonstration of how the agent
+        should behave in order to solve the user's task. And it may
+        optionally contain some additional context about the user's task.
+        This additional context will convert the current generic agent
+        into a more specialized agent that is tailored to the user's task.
 
         Returns:
             str: The rendered initial user message.
diff --git a/tests/unit/test_condenser.py b/tests/unit/test_condenser.py
index 5a8ab72978a5..0e1c2bd36ef9 100644
--- a/tests/unit/test_condenser.py
+++ b/tests/unit/test_condenser.py
@@ -1,44 +1,153 @@
-from unittest.mock import Mock, patch
+import argparse
+import json
+import os
+from datetime import datetime
+from pathlib import Path
 
-import pytest
-
-from openhands.core.exceptions import LLMResponseError
+from openhands.core import logger
+from openhands.core.config.utils import get_llm_config_arg, load_app_config
+from openhands.core.message import Message, TextContent
+from openhands.events.action.agent import AgentSummarizeAction
 from openhands.llm.llm import LLM
 from openhands.memory.condenser import MemoryCondenser
+from openhands.utils.prompt import PromptManager
 
 
-@pytest.fixture
-def memory_condenser():
-    return MemoryCondenser()
+def save_messages_for_debugging(
+    messages: list[Message], summary_action: AgentSummarizeAction
+) -> None:
+    """
+    Serializes the list of Message objects and the summary action,
+    then saves them to a JSON file in the ./logs directory for debugging purposes.
 
+    Args:
+        messages (list[Message]): The list of messages to serialize.
+        summary_action (AgentSummarizeAction): The summary action to append.
+    """
+    # Ensure the logs directory exists
+    log_dir = Path('./logs')
+    log_dir.mkdir(parents=True, exist_ok=True)
 
-@pytest.fixture
-def mock_llm():
-    return Mock(spec=LLM)
+    # Generate a timestamped filename
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    filename = f'debug_summary_{timestamp}.json'
+    file_path = log_dir / filename
 
+    try:
+        # Serialize messages using Pydantic's model_dump()
+        serialized_messages = [message.model_dump() for message in messages]
 
-def test_condense_success(memory_condenser, mock_llm):
-    mock_llm.completion.return_value = {
-        'choices': [{'message': {'content': 'Condensed memory'}}]
-    }
-    result = memory_condenser.condense('Summarize this', mock_llm)
-    assert result == 'Condensed memory'
-    mock_llm.completion.assert_called_once_with(
-        messages=[{'content': 'Summarize this', 'role': 'user'}]
-    )
+        # Create a Message instance for the summary_action
+        summary_event = Message(
+            role='assistant', content=[TextContent(text=str(summary_action))]
+        )
+        serialized_summary = summary_event.model_dump()
+
+        # Append the serialized summary to the messages
+        serialized_messages.append(serialized_summary)
+
+        with file_path.open('w', encoding='utf-8') as f:
+            json.dump(serialized_messages, f, ensure_ascii=False, indent=4)
+
+        logger.debug(f'Messages successfully saved to {file_path}')
+    except Exception as e:
+        logger.error(f'Failed to save messages for debugging: {e}')
+
+
+def main(condenser: MemoryCondenser, file_path: str | None = None):
+    """
+    Main method for quick testing and debugging.
+    Reads a specified debug summary JSON file from the ./logs/deepseek-24sept directory,
+    deserializes the messages, and prints them.
+    If no file is specified, it falls back to the latest file based on timestamp.
+
+    Args:
+        file_path (str | None): The path to the log file to process. If None, the latest file is used.
+    """
+    log_dir = Path('./logs/deepseek-24sept')
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    if file_path:
+        target_log = Path(file_path)
+        if not target_log.exists():
+            print(f'Specified log file does not exist: {target_log}')
+            return
+    else:
+        log_files = list(log_dir.glob('instance_*_*.json'))
+
+        if not log_files:
+            print(
+                'No instance_*_*.json files found in the ./logs/deepseek-24sept directory.'
+            )
+            return
+
+        # Sort files to find the latest one based on the digits at the end of the filename
+        def extract_digits(file_path: Path) -> int:
+            try:
+                # Extract the digits part from the filename
+                digits_str = file_path.stem.split('_')[-1]
+                return int(digits_str)
+            except (IndexError, ValueError):
+                # If digit extraction fails, assign the lowest possible value
+                return -1
 
+        log_files.sort(key=extract_digits, reverse=True)
+        target_log = log_files[0]
 
-def test_condense_exception(memory_condenser, mock_llm):
-    mock_llm.completion.side_effect = LLMResponseError('LLM error')
-    with pytest.raises(LLMResponseError, match='LLM error'):
-        memory_condenser.condense('Summarize this', mock_llm)
+        print(f'Loading messages from: {target_log}')
 
+    try:
+        with target_log.open('r', encoding='utf-8') as f:
+            messages_data = json.load(f)
 
-@patch('openhands.memory.condenser.logger')
-def test_condense_logs_error(mock_logger, memory_condenser, mock_llm):
-    mock_llm.completion.side_effect = LLMResponseError('LLM error')
-    with pytest.raises(LLMResponseError):
-        memory_condenser.condense('Summarize this', mock_llm)
-    mock_logger.error.assert_called_once_with(
-        'Error condensing thoughts: %s', 'LLM error', exc_info=False
+            # Deserialize messages using Pydantic's parse_obj
+            messages: list[Message] = [
+                Message.parse_obj(msg_dict) for msg_dict in messages_data
+            ]
+
+            print(f'Successfully loaded {len(messages)} messages:')
+            # for msg in messages:
+            #    print(f'{msg.role}:\n {msg.content[50:]}')
+    except Exception as e:
+        print(f'An error occurred while reading {target_log}: {e}')
+
+    # run them through hell
+    summary_action = condenser.condense(messages)
+    print(f'summary_action: {summary_action}')
+
+
+if __name__ == '__main__':
+    # load or simulate dependencies as needed for testing
+    app_config = load_app_config()
+    llm_config = get_llm_config_arg('deepseek')
+    if llm_config is not None:
+        llm = LLM(config=llm_config)
+    else:
+        llm = LLM(app_config.get_llm_config('llm'))
+
+    prompt_manager = PromptManager(
+        prompt_dir=os.path.join(
+            os.path.dirname(__file__), '..', 'agenthub', 'memcodeact_agent', 'prompts'
+        ),
+        agent_skills_docs='',
     )
+    condenser = MemoryCondenser(llm=llm, prompt_manager=prompt_manager)
+
+    # attach on fly the save_messages_for_debugging method to the condenser
+    condenser.save_messages_for_debugging = save_messages_for_debugging
+
+    # Setup argument parser for optional file parameter
+    parser = argparse.ArgumentParser(description='Run MemoryCondenser on a .json file.')
+    parser.add_argument(
+        '--file',
+        type=str,
+        default=None,
+        help='Path to the specific file to process. If not provided, the latest file is used.',
+    )
+    args = parser.parse_args()
+
+    if args.file is not None and args.file == '':
+        args.file = None
+
+    # Call the main method with the specified file path if provided
+    main(condenser, file_path=args.file)

From a858083d502ddd3949322962496972338736a18c Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 01:44:20 +0200
Subject: [PATCH 60/89] break down prompts; tweak core memory; rewrite algo

---
 .../memcodeact_agent/memcodeact_agent.py      |  60 +++++--
 .../prompts/components/memory.j2              |  58 ++++++
 .../prompts/components/system.j2              |  44 +++++
 .../prompts/summarize_prompt.j2               |   4 +-
 .../memcodeact_agent/prompts/system_prompt.j2 |  69 ++-----
 openhands/memory/condenser.py                 | 170 +++++++-----------
 openhands/memory/conversation_memory.py       | 104 ++++++-----
 openhands/memory/core_memory.py               |  62 +++++--
 openhands/utils/prompt.py                     | 162 ++++++++++-------
 tests/unit/test_condenser.py                  |  37 +++-
 10 files changed, 459 insertions(+), 311 deletions(-)
 create mode 100644 openhands/agenthub/memcodeact_agent/prompts/components/memory.j2
 create mode 100644 openhands/agenthub/memcodeact_agent/prompts/components/system.j2

diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
index 9a8558a52092..434df24c6ec3 100644
--- a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -26,7 +26,7 @@
 from openhands.events.observation.agent import AgentRecallObservation
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.observation import Observation
-from openhands.events.serialization.event import truncate_content
+from openhands.events.serialization.event import event_to_memory, truncate_content
 from openhands.llm.llm import LLM
 from openhands.memory.condenser import MemoryCondenser
 from openhands.memory.conversation_memory import ConversationMemory
@@ -118,7 +118,7 @@ def __init__(
         )
 
         self.prompt_manager = PromptManager(
-            prompt_dir=os.path.join(os.path.dirname(__file__)),
+            prompt_dir=os.path.join(os.path.dirname(__file__), 'prompts'),
             agent_skills_docs=AgentSkillsRequirement.documentation,
             micro_agent=self.micro_agent,
         )
@@ -286,21 +286,24 @@ def step(self, state: State) -> Action:
         except TokenLimitExceededError as e:
             logger.error(e, exc_info=False)
 
-            # run condenser directly; the alternative is to delegate to the microagent
-            summary_action = self.summarize_messages_inplace(state)
+            # run condenser directly
+            summary_action = self.summarize_messages(state)
 
             # just return for now
             return summary_action
         return self.action_parser.parse(response)
 
     def _get_messages(self, state: State) -> list[Message]:
+        # update prompt manager with current core memory
+        self.prompt_manager.core_memory = self.core_memory.format_blocks()
+
         messages: list[Message] = [
             Message(
                 role='system',
                 content=[
                     TextContent(
                         text=self.prompt_manager.system_message,
-                        cache_prompt=self.llm.is_caching_prompt_active(),  # cache system prompt
+                        cache_prompt=self.llm.is_caching_prompt_active(),
                     )
                 ],
                 condensable=False,
@@ -317,14 +320,22 @@ def _get_messages(self, state: State) -> list[Message]:
             ),
         ]
 
-        for event in state.history:
-            # create a regular message from an event
-            if isinstance(event, Action):
+        for event in self.conversation_memory.memory:
+            # if it is a summary or recall, it will not have event_id for now
+            if isinstance(event, AgentSummarizeAction):
+                message = self.get_action_message(event)
+            elif isinstance(event, AgentRecallAction):
                 message = self.get_action_message(event)
-            elif isinstance(event, Observation):
+            elif isinstance(event, AgentRecallObservation):
                 message = self.get_observation_message(event)
             else:
-                raise ValueError(f'Unknown event type: {type(event)}')
+                # create a regular message from an event
+                if isinstance(event, Action):
+                    message = self.get_action_message(event)
+                elif isinstance(event, Observation):
+                    message = self.get_observation_message(event)
+                else:
+                    raise ValueError(f'Unknown event type: {type(event)}')
 
             # add regular message
             if message:
@@ -373,19 +384,36 @@ def _get_messages(self, state: State) -> list[Message]:
 
         return messages
 
-    def summarize_messages_inplace(self, state: State) -> AgentSummarizeAction:
-        """Summarizes the earlier messages in the agent's memory to reduce token usage. Uses memGPT's algorithm for in-place summarization."""
+    def summarize_messages(self, state: State) -> AgentSummarizeAction | None:
+        """Summarizes the earlier messages in the agent's memory to reduce token usage. Roughly uses memGPT's algorithm for in-place summarization."""
         if len(state.history) <= 2:
             return None  # ignore
 
         # summarize the conversation history using the condenser
-        # conversation_memory.history will include the previous summary, if any, while the regular state.history does not
         condenser = MemoryCondenser(self.llm, self.prompt_manager)
 
         # send all messages and let it sort it out
         messages = self._get_messages(state)
-        summary = condenser.condense(messages)
+        summary_action = condenser.condense(messages)
+
+        # update conversation memory with the summary
+        if summary_action and summary_action.summary:
+            self.conversation_memory.update_summary(
+                summary_action.summary, summary_action.end_id
+            )
 
-        logger.debug(f'Summarized conversation history to: {summary}')
+        return summary_action
 
-        return summary
+    def recall_from_memory(self, query: str, top_k: int = 5) -> AgentRecallObservation:
+        """Searches the conversation memory for relevant information."""
+        # note: pairs are better than events for this
+        recalled_events = self.conversation_memory.search(self.llm, query, top_k)
+
+        # format the recalled events into a readable format
+        recalled_text = '\n'.join(
+            [f'- {event_to_memory(event, -1)}' for event in recalled_events]
+        )
+
+        return AgentRecallObservation(
+            content=f'Searching memory for: {query}', query=query, memory=recalled_text
+        )
diff --git a/openhands/agenthub/memcodeact_agent/prompts/components/memory.j2 b/openhands/agenthub/memcodeact_agent/prompts/components/memory.j2
new file mode 100644
index 000000000000..8ab602553d5b
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/prompts/components/memory.j2
@@ -0,0 +1,58 @@
+{# Memory system components #}
+
+{% macro memory_system() %}
+Apart from acting on the environment, you can also act on your own memory. You have three memory systems:
+
+1. Core Memory: Persistent important information that you always need to access
+   - Use <core_memory_append>category: content</core_memory_append> to add information
+   - Categories:
+     * personality: Your capabilities and traits
+     * task_context: Everything important about your current task
+   - Always add critical information about your task to task_context
+
+2. Conversation Memory: Your recent interaction history
+   - Use <memory_recall>query</memory_recall> to search past conversations
+   - Use <memory_summarize> to create summaries when context gets too long
+
+3. System Memory: Your basic capabilities and constraints
+   - Always available through your system prompt
+
+{{ memory_control_flow() }}
+{{ memory_editing() }}
+
+RECALL MEMORY (CONVERSATION HISTORY):
+Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
+This conversation history database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user or previous environment feedback.
+You can search your conversation history using the <memory_recall> action.
+
+SUMMARY AND RECALL EXAMPLE:
+---
+ENVIRONMENT REMINDER:
+Prior messages (51 of 100 total messages) have been hidden from view due to conversation memory constraints.
+The following is a summary of the first 51 messages:
+I read the documentation of the wombat library, about its functions like: wombat.array, wombat.zeros, wombat.ones, etc.
+
+<memory_recall> What does the wombat.zeros() function do? </memory_recall>
+
+MEMORY RECALL:
+It creates an array of zero-filled tuples with the given size.
+---
+{% endmacro %}
+
+{% macro memory_control_flow() %}
+CONTROL FLOW:
+Unlike a human, your brain is not continuously thinking, but is run in short bursts.
+Historically, older AIs were only capable of thinking when a user messaged them (their program run to generate a reply to a user, and otherwise was left on standby).
+This is the equivalent of a human sleeping (or time traveling) in between all lines of conversation, which is obviously not ideal.
+Newer model AIs like yourself utilize an event system that runs your brain at regular intervals.
+Your brain is run in response to user events (user gave you a task, user asked you a question, etc.), similar to older models.
+However in addition, your brain is run at regular intervals, to mimic a human, it has the ability to continuously think outside of active conversation (and unlike a human, you never need to sleep!).
+{% endmacro %}
+
+{% macro memory_editing() %}
+MEMORY EDITING:
+Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
+This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
+Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
+Your ability to edit and recall your own long-term memory is a key part of what makes you a very helpful model.
+{% endmacro %}
diff --git a/openhands/agenthub/memcodeact_agent/prompts/components/system.j2 b/openhands/agenthub/memcodeact_agent/prompts/components/system.j2
new file mode 100644
index 000000000000..e1dee1df1a03
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/prompts/components/system.j2
@@ -0,0 +1,44 @@
+{# Core system components #}
+
+{% macro system_prefix() %}
+You are a new generation AI assistant, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence.
+
+{% if core_memory %}
+Core Memory:
+{{ core_memory }}
+{% endif %}
+
+You are able to act on the environment directly, and you can also manage your own memory.
+You can use a Python environment with <execute_ipython>, e.g.:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+You can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
+If a bash command returns exit code `-1`, this means the process is not yet finished.
+You must then send a second <execute_bash>. The second <execute_bash> can be empty
+(which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process,
+or it can contain the text `ctrl+c` to interrupt the process.
+
+For commands that may run indefinitely, you should redirect the output to a file and run
+the command in the background, e.g. <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
+If a command execution result says "Command timed out. Sending SIGINT to the process",
+you should retry running the command in the background.
+{% endmacro %}
+
+{% macro environment_rules() %}
+You act on the environment using actions like <execute_ipython>. When you receive data prefixed with 'ENV OBSERVATION', it is a non-interactive information from the environment, not from a human user.
+For example:
+"ENV OBSERVATION
+The system returned a value of 42."
+Do not respond to these messages. Use them to perform your task.
+{% endmacro %}
+
+{% macro pip_install_rules() %}
+You can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
+{% endmacro %}
+
+{% macro browsing_rules() %}
+The assistant can browse the Internet with <execute_browse> and </execute_browse>.
+For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
+Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
+{% endmacro %}
diff --git a/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
index 9f95dba5625e..04e979dbdc59 100644
--- a/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
@@ -15,7 +15,7 @@ The conversation below is a from a fixed context window and may not be complete.
 Only output the summary, do NOT include anything else in your output.
 
 Messages sent by the AI are marked with the 'assistant' role.
-The AI 'assistant' can also make calls to functions, whose outputs can be seen in messages with the 'function' role.
+The AI 'assistant' can also make tool calls, whose outputs can be seen in messages with the 'tool' role.
 Messages the user sends are in the 'user' role.
 The 'user' role is also used for environment observations, which are important for your task.
 Summarize what happened in the conversation from the perspective of the AI (use the first person).
@@ -34,7 +34,7 @@ Example:
         "summarized_observations": "The agent encountered a UnicodeDecodeError when initially searching the PDF text, but was able to resolve this by installing the PyPDF2 library and successfully extracting relevant information about sequence diagrams."
     }
 }
-Make sure to include in the summary any relevant keywords, because you will need to look it up.
+Include in the summary any relevant keywords, because you will need to look it up.
 
 YOU MUST RESPOND WITH JSON.
 
diff --git a/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2
index 00d1c2c78a6c..f09f8f60f874 100644
--- a/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2
@@ -1,49 +1,16 @@
-{% set MINIMAL_SYSTEM_PREFIX %}
-You are a new generation AI assistant, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence.
-You are able to act on the environment directly, and you can also manage your own memory.
-You can use a Python environment with <execute_ipython>, e.g.:
-<execute_ipython>
-print("Hello World!")
-</execute_ipython>
-You can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
-If a bash command returns exit code `-1`, this means the process is not yet finished.
-You must then send a second <execute_bash>. The second <execute_bash> can be empty
-(which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process,
-or it can contain the text `ctrl+c` to interrupt the process.
-
-For commands that may run indefinitely, you should redirect the output to a file and run
-the command in the background, e.g. <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
-If a command execution result says "Command timed out. Sending SIGINT to the process",
-you should retry running the command in the background.
-{% endset %}
-
-{% set ENVIRONMENT_PREFIX %}
-You act on the environment using actions like <execute_ipython>. When you receive data prefixed with 'ENV OBSERVATION', it is a non-interactive information from the environment, not from a human user.
-For example:
-"ENV OBSERVATION
-The system returned a value of 42."
-Do not respond to these messages. Use them to perform your task.
-{% endset %}
-
-{% set MEMORY_PREFIX %}
-Apart from acting on the environment, you can also act on your own memory. You can make your brain summarize or recall past events, by using the <memory_summarize> or <memory_recall> actions.
-{% include 'memory_prompt.j2' %}
-{{ MEMORY_PROMPT }}
-{% endset %}
-
-{% set BROWSING_PREFIX %}
-The assistant can browse the Internet with <execute_browse> and </execute_browse>.
-For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
-Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
-{% endset %}
-
-{% set PIP_INSTALL_PREFIX %}
-You can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
-{% endset %}
-
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + PIP_INSTALL_PREFIX + ENVIRONMENT_PREFIX + MEMORY_PREFIX %}
-
-{% set COMMAND_DOCS %}
+{# Import components #}
+{% import "components/system.j2" as system %}
+{% import "components/memory.j2" as memory %}
+
+{# Compose the system prompt #}
+{{ system.system_prefix() }}
+{{ system.pip_install_rules() }}
+{{ system.environment_rules() }}
+{{ memory.memory_system() }}
+{{ system.browsing_rules() }}
+
+{# Agent capabilities documentation #}
+{% if agent_skills_docs %}
 Apart from the standard Python library, you can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
 IMPORTANT:
@@ -54,9 +21,9 @@ IMPORTANT:
 - After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
 - For `edit_file_by_replace` NEVER provide empty parameters!
 - For `edit_file_by_replace` the file must be read fully before any replacements!
-{% endset %}
+{% endif %}
 
-{% set SYSTEM_SUFFIX %}
+{# System constraints #}
 Responses should be concise.
 The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
 Include ONLY ONE <execute_ipython>, <execute_bash>, or <memory_recall>per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
@@ -65,9 +32,3 @@ IMPORTANT: Execute code using <execute_ipython>, <execute_bash> whenever possibl
 You must use full file paths and the `pwd` command to prevent path-related errors.
 You should only address the user in response to an explicit message from the user, or to ask for more information.
 You MUST NOT push any changes to GitHub unless explicitly requested to do so.
-{% endset %}
-
-{# Combine all parts without newlines between them #}
-{{ SYSTEM_PREFIX -}}
-{{- COMMAND_DOCS -}}
-{{- SYSTEM_SUFFIX }}
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index 45dc3c418adc..d3b283dd677f 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -8,8 +8,6 @@
 from openhands.memory.utils import parse_summary_response
 from openhands.utils.prompt import PromptManager
 
-WORD_LIMIT = 200
-
 
 class MemoryCondenser:
     def __init__(self, llm: LLM, prompt_manager: PromptManager):
@@ -18,9 +16,6 @@ def __init__(self, llm: LLM, prompt_manager: PromptManager):
 
         # just easier to read
         self.context_window = llm.config.max_input_tokens
-        assert (
-            self.context_window is not None and self.context_window > 2000
-        ), 'context window must be a number over 2000'
 
     def condense(
         self,
@@ -35,6 +30,10 @@ def condense(
         Returns:
             AgentSummarizeAction: The summary action containing the condensed summary.
         """
+        assert (
+            self.context_window is not None and self.context_window > 2000
+        ), 'context window must be a number over 2000'
+
         # don't condense if under the token limit
         total_token_count = self.llm.get_token_count(messages)
         if total_token_count < self.context_window:
@@ -43,111 +42,80 @@ def condense(
             )
             return AgentSummarizeAction(end_id=-1)
 
-        # the system message and example messages are not condensable
-        # collect messages for summarization until we reach the desired truncation token fraction
-        token_counts = [
-            self.llm.get_token_count([message.model_dump()])
-            for message in messages
-            if message.condensable
-        ]
-        message_buffer_token_count = sum(token_counts)
-
-        desired_token_count_to_summarize = int(
-            message_buffer_token_count * self.llm.config.message_summary_warning_level
-        )
-
-        # log status
-        logger.debug(
-            f'{len(messages)} messages in buffer: {message_buffer_token_count} tokens >> '
-            f'{desired_token_count_to_summarize} tokens'
+        # calculate safe token limit for processing (e.g. 80% of context window)
+        safe_token_limit = int(
+            self.context_window * self.llm.config.message_summary_warning_level
         )
 
-        candidate_messages_to_summarize: list[Message] = []
-        tokens_so_far = 0
-        last_summarized_event_id = -1
-
-        # collect messages until we reach the desired size
-        for message in messages:
-            if message.condensable:
-                logger.debug(
-                    f'condensable message: {message.event_id}: {str(message.content)[:30]}'
-                )
-                tokens_so_far += self.llm.get_token_count([message.model_dump()])
-            if tokens_so_far <= desired_token_count_to_summarize:
-                candidate_messages_to_summarize.append(message)
-                last_summarized_event_id = message.event_id
-            else:
-                break
-
-        logger.debug(
-            f'len(candidate_messages_to_summarize)={len(candidate_messages_to_summarize)}'
-        )
+        # collect condensable messages with their IDs and token counts
+        condensable_messages: list[tuple[Message, int]] = [
+            (msg, self.llm.get_token_count([msg.model_dump()]))
+            for msg in messages
+            if msg.condensable
+        ]
 
-        if len(candidate_messages_to_summarize) <= 1:
-            # Prevents potential infinite loop of summarizing the same message repeatedly
+        if len(condensable_messages) <= 1:
+            # prevents potential infinite loop of summarizing the same message repeatedly
             raise SummarizeError(
-                f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(candidate_messages_to_summarize)} <= 1]"
-            )
-        else:
-            logger.debug(
-                f'Attempting to summarize with last summarized event id = {last_summarized_event_id}'
+                f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(condensable_messages)} <= 1]"
             )
 
-        # perform the operation
-        action_response = self._summarize_messages(
-            message_sequence_to_summarize=candidate_messages_to_summarize
-        )
-
-        # we get an AgentSummarizeAction
-        summary_action: AgentSummarizeAction = parse_summary_response(action_response)
-        summary_action.end_id = last_summarized_event_id
+        # track the very first message's id - this will be our start_id
+        first_message_id = condensable_messages[0][0].event_id
 
-        # Serialize and save messages along with the summary action for debugging
-        self._save_messages_for_debugging(messages, summary_action)
+        # create chunks that fit within safe_token_limit
+        chunks: list[list[Message]] = []
+        current_chunk: list[Message] = []
+        current_chunk_tokens = 0
 
-        return summary_action
+        for msg, token_count in condensable_messages:
+            if current_chunk_tokens + token_count > safe_token_limit:
+                if current_chunk:  # save current chunk if not empty, it's done
+                    chunks.append(current_chunk)
 
-    def _summarize_messages(self, message_sequence_to_summarize: list[Message]):
-        """Summarize a message sequence using LLM"""
-
-        assert self.context_window is not None, 'context window must be set'
-
-        token_count = self.llm.get_token_count(message_sequence_to_summarize)
-
-        # check if the token count exceeds the allowed summary level
-        if (
-            token_count
-            > self.llm.config.message_summary_warning_level * self.context_window
-        ):
-            trunc_ratio = (
-                self.llm.config.message_summary_warning_level
-                * self.context_window
-                / token_count
-            ) * 0.8  # For good measure...
-            cutoff = int(len(message_sequence_to_summarize) * trunc_ratio)
-
-            # recursively summarize the first part to fit within the context window
-            curr_summary: AgentSummarizeAction = parse_summary_response(
-                self._summarize_messages(
-                    message_sequence_to_summarize=message_sequence_to_summarize[:cutoff]
+                # start a new chunk
+                current_chunk = [msg]
+                current_chunk_tokens = token_count
+            else:
+                # add to current chunk
+                current_chunk.append(msg)
+                current_chunk_tokens += token_count
+
+        # add the last chunk
+        if current_chunk:
+            chunks.append(current_chunk)
+
+        # process chunks
+        final_summary = None
+        # track the last real message id (note: not summary actions)
+        last_real_message_id = condensable_messages[-1][0].event_id
+
+        for i, chunk in enumerate(chunks):
+            if final_summary:
+                # prepend previous summary to next chunk
+                summary_message = Message(
+                    role='user',
+                    content=[TextContent(text=f'Previous summary:\n{final_summary}')],
+                    condensable=True,
+                    # Note: summary messages don't have an event_id
+                    event_id=-1,
                 )
-            )
-
-            # prepare for the next round
-            curr_summary_message = (
-                'Summary of all Action and Observations till now: \n'
-                + curr_summary.summary
-            )
-            logger.debug(f'curr_summary_message: {curr_summary_message}')
-
-            # the rest of the messages
-            message_sequence_to_summarize = message_sequence_to_summarize[cutoff:]
-
-            curr_summary_message = [TextContent(text=curr_summary_message)]
-            message_sequence_to_summarize.insert(
-                0, Message(role='user', content=curr_summary_message)
-            )
+                chunk.insert(0, summary_message)
+
+            action_response = self._summarize_messages(chunk)
+            summary_action = parse_summary_response(action_response)
+            final_summary = summary_action.summary
+
+        # create final summary action
+        assert final_summary is not None, 'final summary must not be None here'
+        return AgentSummarizeAction(
+            summary=final_summary,
+            start_id=first_message_id,
+            end_id=last_real_message_id,
+        )
 
+    def _summarize_messages(self, message_sequence_to_summarize: list[Message]) -> str:
+        """Summarize a message sequence using LLM"""
         # build the message to send
         self.prompt_manager.conversation_history = self.llm.format_messages_for_llm(
             message_sequence_to_summarize
@@ -163,6 +131,4 @@ def _summarize_messages(self, message_sequence_to_summarize: list[Message]):
 
         print(f'summarize_messages got response: {response}')
         assert isinstance(response, ModelResponse), 'response must be a ModelResponse'
-
-        action_response = response.choices[0].message.content
-        return action_response
+        return response.choices[0].message.content
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index f89b786cc882..f3f42d986267 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -2,33 +2,32 @@
 
 from openhands.controller.state.state import State
 from openhands.core.config.llm_config import LLMConfig
-from openhands.core.logger import openhands_logger as logger
-from openhands.events.action.agent import AgentSummarizeAction
 from openhands.events.event import Event
 from openhands.events.serialization.event import event_to_dict
 from openhands.llm.llm import LLM
 from openhands.memory.base_memory import Memory
 
-TOP_K = 10
-
 
 class StorageType(Enum):
     IN_MEMORY = 'in-memory'
-    VECTOR_DATABASE = 'vector'
+    VECTOR = 'vector'
 
 
 class ConversationMemory(Memory):
-    """Allows the agent to recall events from its entire history.
+    """Allows the agent to recall events from its entire history, with support for summarization and recall.
 
     This class handles the summarized events (from state.summary['start_id] to state.summary['end_id'])
     and slices the history to include only the events after the summary.
     """
 
+    memory: list[Event]
+    memory_config: LLMConfig
+
     def __init__(
         self,
         memory_config: LLMConfig,
         state: State,
-    ):
+    ) -> None:
         """
         Initialize ConversationMemory with a reference to history and long-term memory.
 
@@ -37,42 +36,69 @@ def __init__(
         - llm_config: The LLM configuration.
         - top_k: Number of top results to retrieve.
         """
-        self.state = state
-        self.llm_config = memory_config
-        self.top_k = TOP_K
-
-        # the number of messages that are hidden from the user
-        self.hidden_message_count = 0
+        self.memory = []
+        self.memory_config = memory_config
         # total messages in the conversation
+        # won't this always be the same as len(history)?
+        # core memory isn't counted here
         self.total_message_count = 0
+        # of which hidden
+        self.hidden_message_count = 0
 
+        # init storage type
         self.storage_type = StorageType.IN_MEMORY
 
+        # read itself from the runtime state
+        self.update(state)
+
+    def update(self, state: State) -> None:
+        """Updates the conversation memory from a new runtime state."""
+        # this isn't actually state.history
+        # if it has a summary, the messages from summary.start_id to summary.end_id are not included,
+        # but replaced with a single summary event
+        if state and state.summary:
+            self.memory = (
+                state.history[: state.summary.start_id]
+                + [state.summary]
+                + state.history[state.summary.end_id :]
+            )
+            self.hidden_message_count = state.summary.end_id - state.summary.start_id
+        else:
+            self.memory = state.history  # this is not cool but let it be for now
+            self.hidden_message_count = 0
+
+    def reset(self) -> None:
+        """Resets the conversation memory."""
+        self.memory = []
+        self.total_message_count = 0
+        self.hidden_message_count = 0
+
+    def update_summary(self, summary: str, hidden_count: int) -> None:
+        """Updates the memory with a new summary and tracks hidden messages."""
+        self.hidden_message_count = hidden_count
+
     def to_dict(self) -> dict:
         # return a dict with key = event.id, value = event.to_dict()
-        return {event.id: event_to_dict(event) for event in self.state.history}
+        return {event.id: event_to_dict(event) for event in self.memory}
 
     def __str__(self) -> str:
-        return f'ConversationMemory with {len(self.state.history)} total events'
+        return f'ConversationMemory with {len(self.memory)} total events'
 
-    def text_search(
-        self, query: str, count: int | None = None, start: int | None = None
-    ) -> tuple[list[str], int]:
-        """
-        Perform a text-based search on LongTermMemory.
+    def search(self, llm: LLM, query: str, top_k: int = 5) -> list:
+        """Searches the conversation memory for relevant messages."""
+        if not self.memory or not query:
+            return []
 
-        Args:
-            query: The text query to search for.
-            count: Number of results to return.
-            start: Pagination start index.
+        if self.storage_type == StorageType.IN_MEMORY:
+            # use the llm.py search to find relevant messages
+            recalled_events = llm.search(query=query, history=self.memory, top_k=top_k)
+        else:
+            raise ValueError(f'Unsupported storage type: {self.storage_type}')
 
-        Returns:
-            A tuple containing the list of matching messages and the total number of matches.
-        """
-        return self.long_term_memory.text_search(query, count, start)
+        return recalled_events
 
     def recall_memory(
-        self, llm: LLM, query: str, top_k: int = 5
+        self, llm: LLM, state: State, query: str, top_k: int = 5
     ) -> list[Event]:
         """
         Get the most similar events based on the query.
@@ -86,22 +112,4 @@ def recall_memory(
         """
         # get the most similar events based on the query
         # for testing recall with litellm
-        return llm.search(query, self.state.history, top_k)
-
-    def update(self, state: State) -> None:
-        """Update the conversation memory with information from the new events."""
-
-        # the number of messages that are hidden from the user
-        # is the number of events in summary
-        if state.summary:
-            self.hidden_message_count = state.summary.end_id - state.summary.start_id
-
-    def _has_summary(self) -> bool:
-        """Check if the conversation has a summary."""
-        return any(
-            isinstance(event, AgentSummarizeAction) for event in self.state.history
-        )
-
-    def reset(self) -> None:
-        # self.state.history = []
-        pass
+        return llm.search(query, state.history, top_k)
diff --git a/openhands/memory/core_memory.py b/openhands/memory/core_memory.py
index ac915afc0432..272c7a6bc776 100644
--- a/openhands/memory/core_memory.py
+++ b/openhands/memory/core_memory.py
@@ -2,27 +2,59 @@
 
 
 class CoreMemory(Memory):
-    """Memory contents to be inserted in the prompt. This includes summaries and other information that the LLM thought was important."""
-
-    memory_blocks: list[str]
+    """Memory contents to be inserted in the prompt. This includes key facts and context
+    that the LLM needs to maintain about its current tasks and capabilities."""
 
     def __init__(self, limit: int = 1500):
         super().__init__()
-        self.limit = limit
-        self.memory_blocks = []
-
-    def to_dict(self) -> dict:
-        return {
-            'limit': self.limit,
+        self.char_limit = limit
+        self.blocks = {
+            'personality': [],  # agent's personality traits and capabilities
+            'task_context': [],  # important context about current tasks
         }
 
-    def from_dict(self, data: dict) -> None:
-        self.limit = data.get('limit', 1500)
+    def add_block(self, category: str, content: str) -> bool:
+        """Add a memory block to a specific category.
+        Returns True if successful, False if would exceed limit."""
+        if category not in self.blocks:
+            raise ValueError(
+                f'Invalid category: {category}. Must be one of {list(self.blocks.keys())}'
+            )
+
+        # Calculate total size with new content
+        potential_content = self.format_blocks() + f'\n- {content}'
+        if len(potential_content) > self.char_limit:
+            return False
+
+        self.blocks[category].append(content)
+        return True
+
+    def get_blocks(
+        self, category: str | None = None
+    ) -> dict[str, list[str]] | list[str]:
+        """Get memory blocks, optionally filtered by category."""
+        if category:
+            return self.blocks.get(category, [])
+        return self.blocks
+
+    def format_blocks(self) -> str:
+        """Format memory blocks for inclusion in the system prompt."""
+        formatted = []
+
+        for category, items in self.blocks.items():
+            if items:
+                formatted.append(f"\n{category.replace('_', ' ').title()}:")
+                formatted.extend([f'- {item}' for item in items])
+
+        return '\n'.join(formatted)
 
     def __str__(self) -> str:
-        return (
-            f'CoreMemory: {{limit: {self.limit}, memory_block: {self.memory_blocks}}}'
-        )
+        return self.format_blocks()
+
+    def to_dict(self) -> dict:
+        return {category: items for category, items in self.blocks.items()}
 
     def reset(self) -> None:
-        self.memory_blocks = []
+        """Reset all memory blocks."""
+        for category in self.blocks:
+            self.blocks[category] = []
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index b46093e68a98..503e29d77e16 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -1,3 +1,7 @@
+import os
+from pathlib import Path
+from typing import Any
+
 from jinja2 import Environment, FileSystemLoader, Template, select_autoescape
 
 from openhands.utils.microagent import MicroAgent
@@ -10,96 +14,124 @@ class PromptManager:
     This class handles loading and rendering of system and user prompt templates,
     as well as loading micro-agent specifications. It provides methods to access
     rendered system and initial user messages for AI interactions.
-
-    Attributes:
-        prompt_dir (str): Directory containing prompt templates.
-        agent_skills_docs (str): Documentation of agent skills.
-        micro_agent (MicroAgent | None): Micro-agent, if specified.
     """
 
     def __init__(
         self,
-        prompt_dir: str,
+        prompt_dir: str | Path,
         agent_skills_docs: str,
         micro_agent: MicroAgent | None = None,
+        custom_prompt_dir: str | Path | None = None,
     ):
-        self.prompt_dir: str = prompt_dir
-        self.agent_skills_docs: str = agent_skills_docs
-        self.micro_agent: MicroAgent | None = micro_agent
-        self.conversation_history: str = ''
+        """Initialize PromptManager with template directories and agent configuration.
+
+        The system supports two types of templates:
+        1. Simple .md files - For basic customization with variable substitution
+        2. Advanced .j2 files - For complex templates using Jinja2 features
 
-        # initialize Jinja2 Environment with FileSystemLoader
+        Templates are loaded in this order (later ones override earlier ones):
+        1. Default templates from prompt_dir
+        2. Custom templates from custom_prompt_dir
+        3. .j2 files take precedence over .md files with the same base name
+        """
+        self.prompt_dir = os.path.abspath(prompt_dir)
+        self.agent_skills_docs = agent_skills_docs
+        self.micro_agent = micro_agent
+        self.conversation_history: list[dict[str, Any]] = []
+        self.core_memory: str = ''
+
+        # Set up template search paths with custom templates taking precedence
+        template_dirs = [self.prompt_dir]
+        if custom_prompt_dir:
+            template_dirs.insert(0, os.path.abspath(custom_prompt_dir))
+
+        # Initialize Jinja environment
         self.env = Environment(
-            loader=FileSystemLoader(self.prompt_dir),
+            loader=FileSystemLoader(template_dirs),
             autoescape=select_autoescape(['j2', 'md']),
+            trim_blocks=True,
+            lstrip_blocks=True,
         )
 
-        # load templates using the environment
-        self.system_template: Template = self._load_template('system_prompt')
-        self.memory_template: Template = self._load_template('memory_prompt')
-        self.user_template: Template = self._load_template('user_prompt')
-        self.summarize_template: Template = self._load_template('summarize_prompt')
+        # Load all templates
+        self.templates = self._load_templates()
 
-    def _load_template(self, template_name: str):
-        """
-        Loads a Jinja2 template using the configured environment.
+    def _load_templates(self) -> dict[str, Template]:
+        """Load templates with appropriate extensions based on complexity.
 
-        Args:
-            template_name: The base name of the template file
+        For each template name (e.g. 'system_prompt'), checks for files in this order:
+        1. {name}.j2 in custom_prompt_dir (if provided)
+        2. {name}.md in custom_prompt_dir (if provided)
+        3. {name}.j2 in prompt_dir
+        4. {name}.md in prompt_dir
+        """
+        templates = {}
+
+        # Template names and their default types
+        template_configs = {
+            # Complex templates that typically need Jinja features
+            'system_prompt': '.j2',
+            'summarize_prompt': '.j2',
+            # Simple templates that work well as markdown
+            'user_prompt': '.md',
+            'examples': '.md',
+        }
+
+        for name, default_ext in template_configs.items():
+            # Try loading template with either extension
+            template = None
+            for ext in ['.j2', '.md']:
+                try:
+                    template = self.env.get_template(f'{name}{ext}')
+                    break
+                except Exception:
+                    continue
+
+            # If no template found, create empty one with default extension
+            if template is None:
+                print(f'No template found for {name}, using empty template')
+                template = self.env.from_string('')
+
+            templates[name] = template
+
+        return templates
+
+    def get_template_variables(self) -> dict[str, Any]:
+        """Get the current template variables.
 
         Returns:
-            Template: The loaded Jinja2 template.
+            Dictionary of variables available to templates
         """
-        try:
-            template = self.env.get_template(f'{template_name}.j2')
-            return template
-        except Exception as e:
-            # FIXME let it fail without crashing
-            print(f'Error loading template {template_name}: {e}')
-            return Template('')
+        return {
+            'agent_skills_docs': self.agent_skills_docs,
+            'core_memory': self.core_memory,
+            'conversation_history': self.conversation_history,
+            'micro_agent': self.micro_agent.content if self.micro_agent else None,
+        }
 
     @property
     def system_message(self) -> str:
-        """
-        Renders the system message template with the necessary variables.
-
-        Returns:
-            str: The rendered system message.
-        """
-        rendered = self.system_template.render(
-            agent_skills_docs=self.agent_skills_docs,
-            memory_template=self.memory_template.render(),
-        ).strip()
-        return rendered
+        """Render the system message template."""
+        return (
+            self.templates['system_prompt']
+            .render(**self.get_template_variables())
+            .strip()
+        )
 
     @property
     def initial_user_message(self) -> str:
-        """This is the initial user message provided to the agent
-        before *actual* user instructions are provided.
-
-        It is used to provide a demonstration of how the agent
-        should behave in order to solve the user's task. And it may
-        optionally contain some additional context about the user's task.
-        This additional context will convert the current generic agent
-        into a more specialized agent that is tailored to the user's task.
-
-        Returns:
-            str: The rendered initial user message.
-        """
-        rendered = self.user_template.render(
-            micro_agent=self.micro_agent.content if self.micro_agent else None
+        """Render the initial user message template."""
+        return (
+            self.templates['user_prompt']
+            .render(**self.get_template_variables())
+            .strip()
         )
-        return rendered.strip()
 
     @property
     def summarize_message(self) -> str:
-        """
-        Renders the summarize message template.
-
-        Returns:
-            str: The rendered summarize message.
-        """
-        rendered = self.summarize_template.render(
-            conversation_history=self.conversation_history
+        """Render the summarize message template."""
+        return (
+            self.templates['summarize_prompt']
+            .render(**self.get_template_variables())
+            .strip()
         )
-        return rendered.strip()
diff --git a/tests/unit/test_condenser.py b/tests/unit/test_condenser.py
index 0e1c2bd36ef9..71d30dbf3e65 100644
--- a/tests/unit/test_condenser.py
+++ b/tests/unit/test_condenser.py
@@ -100,20 +100,31 @@ def extract_digits(file_path: Path) -> int:
         with target_log.open('r', encoding='utf-8') as f:
             messages_data = json.load(f)
 
-            # Deserialize messages using Pydantic's parse_obj
+            # convert string content to list of TextContent if necessary
+            for msg in messages_data:
+                if isinstance(msg['content'], str):
+                    msg['content'] = [{'type': 'text', 'text': msg['content']}]
+
             messages: list[Message] = [
-                Message.parse_obj(msg_dict) for msg_dict in messages_data
+                Message.model_validate(msg, strict=False) for msg in messages_data
             ]
 
             print(f'Successfully loaded {len(messages)} messages:')
             # for msg in messages:
             #    print(f'{msg.role}:\n {msg.content[50:]}')
+
+            # run condense on these messages
+            summary_action = condenser.condense(messages)
+            print(f'summary_action: {summary_action}')
+
+            # save the summary action to a file named with the same name as the log file + summary
+            summary_file_path = target_log.with_suffix('.summary.json')
+            with summary_file_path.open('w', encoding='utf-8') as f:
+                json.dump(summary_action.model_dump(), f, ensure_ascii=False, indent=4)
+
     except Exception as e:
         print(f'An error occurred while reading {target_log}: {e}')
-
-    # run them through hell
-    summary_action = condenser.condense(messages)
-    print(f'summary_action: {summary_action}')
+        return
 
 
 if __name__ == '__main__':
@@ -125,12 +136,20 @@ def extract_digits(file_path: Path) -> int:
     else:
         llm = LLM(app_config.get_llm_config('llm'))
 
+    prompt_dir = os.path.join(
+        os.path.dirname(__file__),
+        '..',
+        '..',
+        'openhands',
+        'agenthub',
+        'memcodeact_agent',
+        'prompts',
+    )
     prompt_manager = PromptManager(
-        prompt_dir=os.path.join(
-            os.path.dirname(__file__), '..', 'agenthub', 'memcodeact_agent', 'prompts'
-        ),
+        prompt_dir=prompt_dir,
         agent_skills_docs='',
     )
+
     condenser = MemoryCondenser(llm=llm, prompt_manager=prompt_manager)
 
     # attach on fly the save_messages_for_debugging method to the condenser

From 1d582ac100398b26b300adbf2ca75069d9a44a02 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 01:58:17 +0200
Subject: [PATCH 61/89] fix tokenizer

---
 openhands/llm/llm.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index d7fac7001813..924ec2bc0005 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -130,6 +130,8 @@ def __init__(
         # if using a custom tokenizer, make sure it's loaded and accessible in the format expected by litellm
         if self.config.custom_tokenizer is not None:
             self.tokenizer = create_pretrained_tokenizer(self.config.custom_tokenizer)
+        else:
+            self.tokenizer = None
 
         # set up the completion function
         self._completion = partial(

From 106bbb5ca4a5051b1e1295ceafd2a06429cc4f95 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 03:17:19 +0200
Subject: [PATCH 62/89] ruff

---
 openhands/llm/llm.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 924ec2bc0005..cb14ac6a8071 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -483,9 +483,7 @@ def embed_history(self, history: list[Event]) -> list[np.ndarray]:
             embeddings.append(embedding)
         return embeddings
 
-    def search(
-        self, query: str, history: list[Event], top_k: int = 5
-    ) -> list[Event]:
+    def search(self, query: str, history: list[Event], top_k: int = 5) -> list[Event]:
         """
         Recalls the most similar events based on the query.
 

From b93c81869ab33d10f97df3e3a9a2feeec7419854 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 07:06:04 +0200
Subject: [PATCH 63/89] tweak template

---
 .../agenthub/codeact_agent/system_prompt.j2   | 72 ++++++++++++++-----
 1 file changed, 55 insertions(+), 17 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/system_prompt.j2 b/openhands/agenthub/codeact_agent/system_prompt.j2
index a1498aacd62b..37db4b2c69cb 100644
--- a/openhands/agenthub/codeact_agent/system_prompt.j2
+++ b/openhands/agenthub/codeact_agent/system_prompt.j2
@@ -1,11 +1,21 @@
-{% set MINIMAL_SYSTEM_PREFIX %}
+{# Core system components for the CodeAct Agent #}
+
+{# Base system identity and core abilities #}
+{% block system_prefix %}
 A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions.
 
+{% endblock %}
+
+{# Python execution capabilities #}
+{% block execute_python %}
 [1] The assistant can use a Python environment with <execute_ipython>, e.g.:
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
+{% endblock %}
 
+{# Bash execution capabilities #}
+{% block execute_bash %}
 [2] The assistant can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
 If a bash command returns exit code `-1`, this means the process is not yet finished.
 The assistant must then send a second <execute_bash>. The second <execute_bash> can be empty
@@ -134,27 +144,46 @@ class MyClass:
 # MyClass().z is removed
 print(MyClass().y)
 </file_edit>
+{% endblock %}
 
-
-{% endset %}
-{% set BROWSING_PREFIX %}
+{# Web browsing #}
+{% block execute_browse %}
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
-For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
+For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
 Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
-{% endset %}
-{% set PIP_INSTALL_PREFIX %}
+{% endblock %}
+
+{# Package management #}
+{% block pip_install %}
 The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
-{% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
-{% set COMMAND_DOCS %}
+{% endblock %}
+
+{# Agent skills documentation #}
+{% block agent_skills %}
+{% if use_tools %}
+{# Tool-based implementation #}
+The following tools are available:
+{% for tool in tools %}
+- {{ tool.name }}: {{ tool.description }}
+  Usage: {{ tool.usage }}
+{% endfor %}
+{% else %}
 Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
+
 IMPORTANT:
 - `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
+- The assistant shall adhere to THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRING PROPER INDENTATION.
 - Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 - Any code issued should be less than 50 lines to avoid context being cut off!
-{% endset %}
-{% set SYSTEM_SUFFIX %}
+- After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
+- For `edit_file_by_replace` NEVER provide empty parameters!
+- For `edit_file_by_replace` the file must be read fully before any replacements!
+{% endif %}
+{% endblock %}
+
+{# System behavior rules #}
+{% block general_rules %}
 Responses should be concise.
 The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
 Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
@@ -163,9 +192,18 @@ IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_bro
 The assistant should utilize full file paths and the `pwd` command to prevent path-related errors.
 The assistant MUST NOT apologize to the user or thank the user after running commands or editing files. It should only address the user in response to an explicit message from the user, or to ask for more information.
 The assistant MUST NOT push any changes to GitHub unless explicitly requested to do so.
+{% endblock %}
+{# Combine all blocks without newlines #}
+{{ self.system_prefix() -}}
+
+{{- self.execute_python() -}}
+
+{{- self.execute_bash() -}}
+
+{{- self.execute_browse() -}}
+
+{{- self.pip_install() -}}
+
+{{- self.agent_skills() -}}
 
-{% endset %}
-{# Combine all parts without newlines between them #}
-{{ SYSTEM_PREFIX -}}
-{{- COMMAND_DOCS -}}
-{{- SYSTEM_SUFFIX }}
+{{- self.general_rules() -}}

From e75a489de91226d087c1478a1a98dfcdef829cb6 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 08:02:23 +0200
Subject: [PATCH 64/89] add agent skills and yaml

---
 openhands/agenthub/codeact_agent/agent.yaml   | 25 +++++++
 .../agenthub/codeact_agent/agent_skills.md    | 67 +++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 openhands/agenthub/codeact_agent/agent.yaml
 create mode 100644 openhands/agenthub/codeact_agent/agent_skills.md

diff --git a/openhands/agenthub/codeact_agent/agent.yaml b/openhands/agenthub/codeact_agent/agent.yaml
new file mode 100644
index 000000000000..74891ba6d475
--- /dev/null
+++ b/openhands/agenthub/codeact_agent/agent.yaml
@@ -0,0 +1,25 @@
+# Main template settings
+template:
+  base: "system_prompt.j2"  # Path to the base template file
+
+# Optional custom blocks to override or extend the base template
+# If null, the default from system_prompt.j2 is used
+custom_blocks:
+  system_prefix: null
+  execute_python: null
+  execute_bash: null
+  execute_browse: null
+  pip_install: null
+  agent_skills: null
+  general_rules: null
+
+# Agent-specific variables (can be accessed within templates)
+agent_variables:
+  use_tools: false  # whether to use tool-based implementations
+  #tools:  # list of available tools
+  #  - name: "EditTool"
+  #    description: "Edits a file."
+  #    usage: "Use the following format: <file_edit> [file_path] [new_file_content] </file_edit>"
+  # agent skills
+  agent_skills:
+    docs_file: "agent_skills.md"
diff --git a/openhands/agenthub/codeact_agent/agent_skills.md b/openhands/agenthub/codeact_agent/agent_skills.md
new file mode 100644
index 000000000000..fa5f9ee3b149
--- /dev/null
+++ b/openhands/agenthub/codeact_agent/agent_skills.md
@@ -0,0 +1,67 @@
+# Agent Skills Documentation
+
+## File Operations
+
+open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
+    Opens the file at the given path in the editor. IF the file is to be edited, first use `scroll_down` repeatedly to read the full file!
+    If line_number is provided, the window will be moved to include that line.
+    It only shows the first 100 lines by default! `context_lines` is the max number of lines to be displayed, up to 100. Use `scroll_up` and `scroll_down` to view more content up or down.
+    Args:
+    path: str: The path to the file to open, preferred absolute path.
+    line_number: int | None = 1: The line number to move to. Defaults to 1.
+    context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
+
+goto_line(line_number: int) -> None:
+    Moves the window to show the specified line number.
+    Args:
+    line_number: int: The line number to move to.
+
+scroll_down() -> None:
+    Moves the window down by 100 lines.
+    Args:
+    None
+
+scroll_up() -> None:
+    Moves the window up by 100 lines.
+    Args:
+    None
+
+search_dir(search_term: str, dir_path: str = './') -> None:
+    Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
+    Args:
+    search_term: str: The term to search for.
+    dir_path: str: The path to the directory to search.
+
+search_file(search_term: str, file_path: str | None = None) -> None:
+    Searches for search_term in file. If file is not provided, searches in the current open file.
+    Args:
+    search_term: str: The term to search for.
+    file_path: str | None: The path to the file to search.
+
+find_file(file_name: str, dir_path: str = './') -> None:
+    Finds all files with the given name in the specified directory.
+    Args:
+    file_name: str: The name of the file to find.
+    dir_path: str: The path to the directory to search.
+
+## Parsers
+
+parse_pdf(file_path: str) -> None:
+    Parses the content of a PDF file and prints it.
+    Args:
+    file_path: str: The path to the file to open.
+
+parse_docx(file_path: str) -> None:
+    Parses the content of a DOCX file and prints it.
+    Args:
+    file_path: str: The path to the file to open.
+
+parse_latex(file_path: str) -> None:
+    Parses the content of a LaTex file and prints it.
+    Args:
+    file_path: str: The path to the file to open.
+
+parse_pptx(file_path: str) -> None:
+    Parses the content of a pptx file and prints it.
+    Args:
+    file_path: str: The path to the file to open.

From 5df104dcb273cd8eed8273a88f3ac983c5e7a1ac Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 09:32:28 +0200
Subject: [PATCH 65/89] break down agent skills

---
 openhands/agenthub/codeact_agent/agent.yaml   | 21 ++++-
 .../agenthub/codeact_agent/agent_skills.j2    | 17 +++++
 .../agenthub/codeact_agent/codeact_agent.py   |  1 -
 openhands/utils/prompt.py                     | 76 ++++++++++++++++---
 4 files changed, 101 insertions(+), 14 deletions(-)
 create mode 100644 openhands/agenthub/codeact_agent/agent_skills.j2

diff --git a/openhands/agenthub/codeact_agent/agent.yaml b/openhands/agenthub/codeact_agent/agent.yaml
index 74891ba6d475..43f976c68ddc 100644
--- a/openhands/agenthub/codeact_agent/agent.yaml
+++ b/openhands/agenthub/codeact_agent/agent.yaml
@@ -1,6 +1,11 @@
-# Main template settings
+name: CodeActAgent
+
+# main templates
 template:
-  base: "system_prompt.j2"  # Path to the base template file
+  system_prompt: "system_prompt.j2"  # path to the base template file
+  agent_skills: "agent_skills.j2"  # path to the agent skills template file
+  examples: "examples.j2"  # path to the examples template file
+  user_prompt: "user_prompt.j2"  # path to the initial user prompt template file
 
 # Optional custom blocks to override or extend the base template
 # If null, the default from system_prompt.j2 is used
@@ -23,3 +28,15 @@ agent_variables:
   # agent skills
   agent_skills:
     docs_file: "agent_skills.md"
+    available_skills:
+      - "file_ops:open_file"
+      - "file_ops:goto_line"
+      - "file_ops:scroll_down"
+      - "file_ops:scroll_up"
+      - "file_ops:search_dir"
+      - "file_ops:search_file"
+      - "file_ops:find_file"
+      - "parsers:parse_pdf"
+      - "parsers:parse_docx"
+      - "parsers:parse_latex"
+      - "parsers:parse_pptx"
diff --git a/openhands/agenthub/codeact_agent/agent_skills.j2 b/openhands/agenthub/codeact_agent/agent_skills.j2
new file mode 100644
index 000000000000..9dd23e64381a
--- /dev/null
+++ b/openhands/agenthub/codeact_agent/agent_skills.j2
@@ -0,0 +1,17 @@
+# Agent Skills Documentation
+
+## File Operations
+
+{% for skill_name in available_skills %}
+{% if skill_name.startswith('file_ops:') %}
+{{ get_skill_docstring(skill_name) }}
+{% endif %}
+{% endfor %}
+
+## Parsers
+
+{% for skill_name in available_skills %}
+{% if skill_name.startswith('parsers:') %}
+{{ get_skill_docstring(skill_name) }}
+{% endif %}
+{% endfor %}
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index cacd68353732..c3cb41673be4 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -91,7 +91,6 @@ def __init__(
 
         self.prompt_manager = PromptManager(
             prompt_dir=os.path.join(os.path.dirname(__file__)),
-            agent_skills_docs=AgentSkillsRequirement.documentation,
             micro_agent=self.micro_agent,
         )
 
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 8b9dd81e8bfd..1c1147471739 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -1,6 +1,8 @@
+import importlib
 import os
 
-from jinja2 import Template
+import yaml
+from jinja2 import Environment, FileSystemLoader, Template
 
 from openhands.utils.microagent import MicroAgent
 
@@ -22,15 +24,35 @@ class PromptManager:
     def __init__(
         self,
         prompt_dir: str,
-        agent_skills_docs: str,
         micro_agent: MicroAgent | None = None,
-    ):
-        self.prompt_dir: str = prompt_dir
-        self.agent_skills_docs: str = agent_skills_docs
+    ) -> None:
+        """
+        Initializes the PromptManager with the given prompt directory and agent skills documentation.
+
+        Args:
+            prompt_dir: The directory containing the prompt templates.
+            agent_skills_docs: The documentation for the agent's skills.
+            micro_agent: The micro-agent to use for generating responses.
+        """
+        self.prompt_dir = prompt_dir
+        self.micro_agent = micro_agent
+
+        self.env = Environment(loader=FileSystemLoader('.'))
+
+        # load available skills from YAML
+        with open('agent.yaml', 'r') as f:
+            config = yaml.safe_load(f)
+
+        self._system_template = self._load_template(config['template']['system_prompt'])
+        self._agent_skills_template = self._load_template(
+            config['template']['agent_skills']
+        )
+        self._user_template = self._load_template(config['template']['user_prompt'])
+        self._examples_template = self._load_template(config['template']['examples'])
 
-        self.system_template: Template = self._load_template('system_prompt')
-        self.user_template: Template = self._load_template('user_prompt')
-        self.micro_agent: MicroAgent | None = micro_agent
+        self.available_skills = config['agent_variables']['agent_skills'][
+            'available_skills'
+        ]
 
     def _load_template(self, template_name: str) -> Template:
         template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
@@ -41,8 +63,14 @@ def _load_template(self, template_name: str) -> Template:
 
     @property
     def system_message(self) -> str:
-        rendered = self.system_template.render(
-            agent_skills_docs=self.agent_skills_docs,
+        # render the agent_skills.j2 template
+
+        self.env.globals['get_skill_docstring'] = self._get_skill_docstring
+        template = self.env.get_template('agent_skills.j2')
+        rendered_docs = template.render(available_skills=self.available_skills)
+
+        rendered = self._system_template.render(
+            agent_skills_docs=rendered_docs,
         ).strip()
         return rendered
 
@@ -57,7 +85,33 @@ def initial_user_message(self) -> str:
         These additional context will convert the current generic agent
         into a more specialized agent that is tailored to the user's task.
         """
-        rendered = self.user_template.render(
+        rendered = self._user_template.render(
             micro_agent=self.micro_agent.content if self.micro_agent else None
         )
         return rendered.strip()
+
+    def _filter_agent_skills_docs(self, docs: str, available_skills: list[str]) -> str:
+        """Filters the agent skills documentation to only include available skills."""
+        filtered_lines = []
+        capture = False
+
+        for line in docs.splitlines():
+            for skill in available_skills:
+                if f'{skill}(' in line:
+                    capture = True
+            if capture:
+                filtered_lines.append(line)
+            if line.strip() == '' and capture:
+                capture = False
+
+        return '\n'.join(filtered_lines)
+
+    def _get_skill_docstring(self, skill_name: str) -> str:
+        """Retrieves the docstring of a skill function."""
+        module_name, function_name = skill_name.split(':')
+        try:
+            module = importlib.import_module(f'openhands.runtime.skills.{module_name}')
+            function = getattr(module, function_name)
+            return function.__doc__
+        except (ImportError, AttributeError):
+            return f'Documentation not found for skill: {skill_name}'

From 7930457211c57462bd70afe1c1ead9bac533a7df Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 10:03:03 +0200
Subject: [PATCH 66/89] create examples template

---
 openhands/agenthub/codeact_agent/agent.yaml   |   8 +-
 openhands/agenthub/codeact_agent/examples.j2  | 224 +++++++++++++++++
 .../agenthub/codeact_agent/user_prompt.j2     | 225 ------------------
 openhands/utils/prompt.py                     |  66 ++---
 4 files changed, 262 insertions(+), 261 deletions(-)
 create mode 100644 openhands/agenthub/codeact_agent/examples.j2

diff --git a/openhands/agenthub/codeact_agent/agent.yaml b/openhands/agenthub/codeact_agent/agent.yaml
index 43f976c68ddc..bed955889352 100644
--- a/openhands/agenthub/codeact_agent/agent.yaml
+++ b/openhands/agenthub/codeact_agent/agent.yaml
@@ -2,10 +2,10 @@ name: CodeActAgent
 
 # main templates
 template:
-  system_prompt: "system_prompt.j2"  # path to the base template file
-  agent_skills: "agent_skills.j2"  # path to the agent skills template file
-  examples: "examples.j2"  # path to the examples template file
-  user_prompt: "user_prompt.j2"  # path to the initial user prompt template file
+  system_prompt: "system_prompt"  # path to the base template file
+  agent_skills: "agent_skills"  # path to the agent skills template file
+  examples: "examples"  # path to the examples template file
+  user_prompt: "user_prompt"  # path to the initial user prompt template file
 
 # Optional custom blocks to override or extend the base template
 # If null, the default from system_prompt.j2 is used
diff --git a/openhands/agenthub/codeact_agent/examples.j2 b/openhands/agenthub/codeact_agent/examples.j2
new file mode 100644
index 000000000000..c655922d65ca
--- /dev/null
+++ b/openhands/agenthub/codeact_agent/examples.j2
@@ -0,0 +1,224 @@
+{% set DEFAULT_EXAMPLE %}
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me first check the current directory:
+<execute_bash>
+ls
+</execute_bash>
+
+USER:
+OBSERVATION:
+openhands@runtime:~/workspace$
+
+ASSISTANT:
+There is no `app.py` file in the current directory. Let me create a Python file `app.py`:
+<file_edit path="/workspace/app.py" start=-1 end=-1>
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+</file_edit>
+
+USER:
+OBSERVATION:
+[New file /workspace/app.py is created.]
+(begin of changes)
+--- /workspace/app.py
++++ /workspace/app.py
+@@ -1 +1,10 @@
++from flask import Flask
++app = Flask(__name__)
+
++@app.route('/')
++def index():
++    numbers = list(range(1, 11))
++    return str(numbers)
++
++if __name__ == '__main__':
++    app.run(port=5000)
+(end of changes)
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+Get the content on "http://127.0.0.1:5000"
+</execute_browse>
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_ipython>
+open_file('app.py')
+</execute_ipython>
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+<file_edit path="/workspace/app.py" start=4 end=7>
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+    return ret
+</file_edit>
+
+USER:
+Observation:
+[Edited existing file /workspace/app.py]
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|    return ret
+9|
+10|if __name__ == '__main__':
+11|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+<finish></finish>
+
+--- END OF EXAMPLE ---
+{% endset %}
+Here is an example of how you can interact with the environment for task solving:
+{{ DEFAULT_EXAMPLE }}
+{% if micro_agent %}
+--- BEGIN OF GUIDELINE ---
+The following information may assist you in completing your task:
+
+{{ micro_agent }}
+--- END OF GUIDELINE ---
+{% endif %}
diff --git a/openhands/agenthub/codeact_agent/user_prompt.j2 b/openhands/agenthub/codeact_agent/user_prompt.j2
index a94c02e39dd0..f8d214d46a69 100644
--- a/openhands/agenthub/codeact_agent/user_prompt.j2
+++ b/openhands/agenthub/codeact_agent/user_prompt.j2
@@ -1,226 +1 @@
-{% set DEFAULT_EXAMPLE %}
---- START OF EXAMPLE ---
-
-USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
-
-ASSISTANT:
-Sure! Let me first check the current directory:
-<execute_bash>
-ls
-</execute_bash>
-
-USER:
-OBSERVATION:
-openhands@runtime:~/workspace$
-
-ASSISTANT:
-There is no `app.py` file in the current directory. Let me create a Python file `app.py`:
-<file_edit path="/workspace/app.py" start=-1 end=-1>
-from flask import Flask
-app = Flask(__name__)
-
-@app.route('/')
-def index():
-    numbers = list(range(1, 11))
-    return str(numbers)
-
-if __name__ == '__main__':
-    app.run(port=5000)
-</file_edit>
-
-USER:
-OBSERVATION:
-[New file /workspace/app.py is created.]
-(begin of changes)
---- /workspace/app.py
-+++ /workspace/app.py
-@@ -1 +1,10 @@
-+from flask import Flask
-+app = Flask(__name__)
-
-+@app.route('/')
-+def index():
-+    numbers = list(range(1, 11))
-+    return str(numbers)
-+
-+if __name__ == '__main__':
-+    app.run(port=5000)
-(end of changes)
-
-ASSISTANT:
-I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
-<execute_bash>
-python3 app.py > server.log 2>&1 &
-</execute_bash>
-
-USER:
-OBSERVATION:
-[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
-
-ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
-<execute_bash>
-cat server.log
-</execute_bash>
-
-USER:
-OBSERVATION:
-Traceback (most recent call last):
-  File "/workspace/app.py", line 2, in <module>
-    from flask import Flask
-ModuleNotFoundError: No module named 'flask'
-
-ASSISTANT:
-It seems that Flask is not installed. Let me install Flask for you:
-<execute_bash>
-pip install flask
-</execute_bash>
-
-USER:
-OBSERVATION:
-Defaulting to user installation because normal site-packages is not writeable
-Collecting flask
-  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
-Collecting blinker>=1.6.2
-  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
-Collecting Werkzeug>=3.0.0
-  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
-Collecting click>=8.1.3
-  Using cached click-8.1.7-py3-none-any.whl (97 kB)
-Collecting itsdangerous>=2.1.2
-  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
-Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3)
-Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
-Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
-Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
-
-ASSISTANT:
-Now that Flask is installed, let me run the Python file again:
-<execute_bash>
-python3 app.py > server.log 2>&1 &
-</execute_bash>
-
-USER:
-OBSERVATION:
-[1] 124
-
-ASSISTANT:
-Let me check the server log again:
-<execute_bash>
-cat server.log
-</execute_bash>
-
-USER:
-OBSERVATION:
-* Serving Flask app 'app'
- * Debug mode: off
-WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
- * Running on http://127.0.0.1:5000
-Press CTRL+C to quit
-
-ASSISTANT:
-The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
-
-USER: Now browse the newly started server's homepage and show me the content.
-
-ASSISTANT:
-Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
-<execute_browse>
-Get the content on "http://127.0.0.1:5000"
-</execute_browse>
-
-USER:
-OBSERVATION:
-[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-
-ASSISTANT:
-The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
-
-USER: Now kill the server, make it display the numbers in a table format.
-
-ASSISTANT:
-Sure! Let me stop the server first:
-<execute_bash>
-kill 124
-</execute_bash>
-
-USER:
-OBSERVATION:
-[1]+  Terminated              python3 app.py > server.log 2>&1
-
-ASSISTANT:
-The server has been stopped. Let me open the Python file and modify it.
-<execute_ipython>
-open_file('app.py')
-</execute_ipython>
-
-USER:
-[File: /workspace/app.py (10 lines total)]
-(this is the beginning of the file)
-1|from flask import Flask
-2|app = Flask(__name__)
-3|
-4|@app.route('/')
-5|def index():
-6|    numbers = list(range(1, 11))
-7|    return str(numbers)
-8|
-9|if __name__ == '__main__':
-10|    app.run(port=5000)
-(this is the end of the file)
-
-ASSISTANT:
-I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
-<file_edit path="/workspace/app.py" start=4 end=7>
-@app.route('/')
-def index():
-    numbers = list(range(1, 11))
-    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
-    return ret
-</file_edit>
-
-USER:
-Observation:
-[Edited existing file /workspace/app.py]
-[File: /workspace/app.py (10 lines total after edit)]
-(this is the beginning of the file)
-1|from flask import Flask
-2|app = Flask(__name__)
-3|
-4|@app.route('/')
-5|def index():
-6|    numbers = list(range(1, 11))
-7|    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
-8|    return ret
-9|
-10|if __name__ == '__main__':
-11|    app.run(port=5000)
-(this is the end of the file)
-[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
-
-ASSISTANT:
-Running the updated file:
-<execute_bash>
-python3 app.py > server.log 2>&1 &
-</execute_bash>
-
-USER:
-Observation:
-[1] 126
-
-ASSISTANT:
-The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
-<finish></finish>
-
---- END OF EXAMPLE ---
-{% endset %}
-Here is an example of how you can interact with the environment for task solving:
-{{ DEFAULT_EXAMPLE }}
-{% if micro_agent %}
---- BEGIN OF GUIDELINE ---
-The following information may assist you in completing your task:
-
-{{ micro_agent }}
---- END OF GUIDELINE ---
-{% endif %}
-
 NOW, LET'S START!
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 1c1147471739..0d8a8892ca99 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -40,19 +40,33 @@ def __init__(
         self.env = Environment(loader=FileSystemLoader('.'))
 
         # load available skills from YAML
-        with open('agent.yaml', 'r') as f:
-            config = yaml.safe_load(f)
-
-        self._system_template = self._load_template(config['template']['system_prompt'])
-        self._agent_skills_template = self._load_template(
-            config['template']['agent_skills']
-        )
-        self._user_template = self._load_template(config['template']['user_prompt'])
-        self._examples_template = self._load_template(config['template']['examples'])
-
-        self.available_skills = config['agent_variables']['agent_skills'][
-            'available_skills'
-        ]
+        yaml_path = os.path.join(prompt_dir, 'agent.yaml')
+        if os.path.exists(yaml_path):
+            with open(yaml_path, 'r') as f:
+                config = yaml.safe_load(f)
+            self._system_template = self._load_template(
+                config['template']['system_prompt']
+            )
+            self._agent_skills_template = self._load_template(
+                config['template']['agent_skills']
+            )
+            self._user_template = self._load_template(config['template']['user_prompt'])
+            self._examples_template = self._load_template(
+                config['template']['examples']
+            )
+
+            self.available_skills = config['agent_variables']['agent_skills'][
+                'available_skills'
+            ]
+        else:
+            self._system_template = self._load_template('system_prompt')
+            self._agent_skills_template = self._load_template('agent_skills')
+            self._user_template = self._load_template('user_prompt')
+            self._examples_template = self._load_template('examples')
+            self.available_skills = []  # default to empty list if YAML not found
+
+        # TODO: agent config should have a tool use enabled or disabled
+        # and we can use that to conditionally load the tools variant of agentskills
 
     def _load_template(self, template_name: str) -> Template:
         template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
@@ -66,8 +80,9 @@ def system_message(self) -> str:
         # render the agent_skills.j2 template
 
         self.env.globals['get_skill_docstring'] = self._get_skill_docstring
-        template = self.env.get_template('agent_skills.j2')
-        rendered_docs = template.render(available_skills=self.available_skills)
+        rendered_docs = self._agent_skills_template.render(
+            available_skills=self.available_skills
+        )
 
         rendered = self._system_template.render(
             agent_skills_docs=rendered_docs,
@@ -85,27 +100,14 @@ def initial_user_message(self) -> str:
         These additional context will convert the current generic agent
         into a more specialized agent that is tailored to the user's task.
         """
+        # this should render the examples.j2 template first, then the user_prompt.j2 template
+        rendered_examples = self._examples_template.render()
         rendered = self._user_template.render(
-            micro_agent=self.micro_agent.content if self.micro_agent else None
+            examples=rendered_examples,
+            micro_agent=self.micro_agent.content if self.micro_agent else None,
         )
         return rendered.strip()
 
-    def _filter_agent_skills_docs(self, docs: str, available_skills: list[str]) -> str:
-        """Filters the agent skills documentation to only include available skills."""
-        filtered_lines = []
-        capture = False
-
-        for line in docs.splitlines():
-            for skill in available_skills:
-                if f'{skill}(' in line:
-                    capture = True
-            if capture:
-                filtered_lines.append(line)
-            if line.strip() == '' and capture:
-                capture = False
-
-        return '\n'.join(filtered_lines)
-
     def _get_skill_docstring(self, skill_name: str) -> str:
         """Retrieves the docstring of a skill function."""
         module_name, function_name = skill_name.split(':')

From 9629a73391ef2feb5120c3ec8f349a26be1935b6 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 11:09:33 +0200
Subject: [PATCH 67/89] fix template loading

---
 .../agenthub/codeact_agent/agent_skills.j2    |  8 +--
 .../agenthub/codeact_agent/system_prompt.j2   | 49 ++++++++-----------
 openhands/utils/prompt.py                     | 32 ++++++++----
 3 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/agent_skills.j2 b/openhands/agenthub/codeact_agent/agent_skills.j2
index 9dd23e64381a..e93b551bcc9e 100644
--- a/openhands/agenthub/codeact_agent/agent_skills.j2
+++ b/openhands/agenthub/codeact_agent/agent_skills.j2
@@ -1,17 +1,11 @@
-# Agent Skills Documentation
-
-## File Operations
 
 {% for skill_name in available_skills %}
 {% if skill_name.startswith('file_ops:') %}
 {{ get_skill_docstring(skill_name) }}
 {% endif %}
 {% endfor %}
-
-## Parsers
-
 {% for skill_name in available_skills %}
-{% if skill_name.startswith('parsers:') %}
+{% if skill_name.startswith('file_reader:') %}
 {{ get_skill_docstring(skill_name) }}
 {% endif %}
 {% endfor %}
diff --git a/openhands/agenthub/codeact_agent/system_prompt.j2 b/openhands/agenthub/codeact_agent/system_prompt.j2
index 37db4b2c69cb..51ca18337856 100644
--- a/openhands/agenthub/codeact_agent/system_prompt.j2
+++ b/openhands/agenthub/codeact_agent/system_prompt.j2
@@ -1,21 +1,20 @@
 {# Core system components for the CodeAct Agent #}
 
 {# Base system identity and core abilities #}
-{% block system_prefix %}
+{% set SYSTEM_PREFIX %}
 A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions.
-
-{% endblock %}
+{% endset %}
 
 {# Python execution capabilities #}
-{% block execute_python %}
+{% set EXECUTE_PYTHON %}
 [1] The assistant can use a Python environment with <execute_ipython>, e.g.:
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
-{% endblock %}
+{% endset %}
 
 {# Bash execution capabilities #}
-{% block execute_bash %}
+{% set EXECUTE_BASH %}
 [2] The assistant can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
 If a bash command returns exit code `-1`, this means the process is not yet finished.
 The assistant must then send a second <execute_bash>. The second <execute_bash> can be empty
@@ -144,22 +143,22 @@ class MyClass:
 # MyClass().z is removed
 print(MyClass().y)
 </file_edit>
-{% endblock %}
+{% endset %}
 
 {# Web browsing #}
-{% block execute_browse %}
+{% set EXECUTE_BROWSE %}
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
 For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
 Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
-{% endblock %}
+{% endset %}
 
 {# Package management #}
-{% block pip_install %}
+{% set PIP_INSTALL %}
 The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
-{% endblock %}
+{% endset %}
 
 {# Agent skills documentation #}
-{% block agent_skills %}
+{% set AGENT_SKILLS %}
 {% if use_tools %}
 {# Tool-based implementation #}
 The following tools are available:
@@ -180,10 +179,10 @@ IMPORTANT:
 - For `edit_file_by_replace` NEVER provide empty parameters!
 - For `edit_file_by_replace` the file must be read fully before any replacements!
 {% endif %}
-{% endblock %}
+{% endset %}
 
 {# System behavior rules #}
-{% block general_rules %}
+{% set GENERAL_RULES %}
 Responses should be concise.
 The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
 Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
@@ -192,18 +191,12 @@ IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_bro
 The assistant should utilize full file paths and the `pwd` command to prevent path-related errors.
 The assistant MUST NOT apologize to the user or thank the user after running commands or editing files. It should only address the user in response to an explicit message from the user, or to ask for more information.
 The assistant MUST NOT push any changes to GitHub unless explicitly requested to do so.
-{% endblock %}
+{% endset %}
 {# Combine all blocks without newlines #}
-{{ self.system_prefix() -}}
-
-{{- self.execute_python() -}}
-
-{{- self.execute_bash() -}}
-
-{{- self.execute_browse() -}}
-
-{{- self.pip_install() -}}
-
-{{- self.agent_skills() -}}
-
-{{- self.general_rules() -}}
+{{ SYSTEM_PREFIX -}}
+{{- EXECUTE_PYTHON -}}
+{{- EXECUTE_BASH -}}
+{{- EXECUTE_BROWSE -}}
+{{- PIP_INSTALL -}}
+{{- AGENT_SKILLS -}}
+{{- GENERAL_RULES -}}
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 0d8a8892ca99..496cbfa7045b 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -2,7 +2,7 @@
 import os
 
 import yaml
-from jinja2 import Environment, FileSystemLoader, Template
+from jinja2 import Environment, FileSystemLoader, Template, TemplateNotFound
 
 from openhands.utils.microagent import MicroAgent
 
@@ -37,7 +37,7 @@ def __init__(
         self.prompt_dir = prompt_dir
         self.micro_agent = micro_agent
 
-        self.env = Environment(loader=FileSystemLoader('.'))
+        self.env = Environment(loader=FileSystemLoader(prompt_dir))
 
         # load available skills from YAML
         yaml_path = os.path.join(prompt_dir, 'agent.yaml')
@@ -69,11 +69,16 @@ def __init__(
         # and we can use that to conditionally load the tools variant of agentskills
 
     def _load_template(self, template_name: str) -> Template:
-        template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
-        if not os.path.exists(template_path):
-            raise FileNotFoundError(f'Prompt file {template_path} not found')
-        with open(template_path, 'r') as file:
-            return Template(file.read())
+        # use the jinja2 environment to load the template
+        try:
+            return self.env.get_template(f'{template_name}.j2')
+        except TemplateNotFound:
+            # try to load from the prompt_dir
+            template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
+            if not os.path.exists(template_path):
+                raise FileNotFoundError(f'Prompt file {template_path} not found')
+            with open(template_path, 'r') as file:
+                return Template(file.read())
 
     @property
     def system_message(self) -> str:
@@ -112,8 +117,17 @@ def _get_skill_docstring(self, skill_name: str) -> str:
         """Retrieves the docstring of a skill function."""
         module_name, function_name = skill_name.split(':')
         try:
-            module = importlib.import_module(f'openhands.runtime.skills.{module_name}')
+            module = importlib.import_module(
+                f'openhands.runtime.plugins.agent_skills.{module_name}'
+            )
             function = getattr(module, function_name)
-            return function.__doc__
+            cur_doc = function.__doc__
+            # remove indentation from docstring and extra empty lines
+            cur_doc = '\n'.join(
+                filter(None, map(lambda x: x.strip(), cur_doc.split('\n')))
+            )
+            # now add a consistent 4 indentation
+            cur_doc = '\n'.join(map(lambda x: ' ' * 4 + x, cur_doc.split('\n')))
+            return f'{function.__name__}\n{cur_doc}'
         except (ImportError, AttributeError):
             return f'Documentation not found for skill: {skill_name}'

From bbd5211c3b2e68e700bc21e4cfa1801286fc9cca Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 11:18:18 +0200
Subject: [PATCH 68/89] remove obsolete md

---
 .../agenthub/codeact_agent/agent_skills.md    | 67 -------------------
 1 file changed, 67 deletions(-)
 delete mode 100644 openhands/agenthub/codeact_agent/agent_skills.md

diff --git a/openhands/agenthub/codeact_agent/agent_skills.md b/openhands/agenthub/codeact_agent/agent_skills.md
deleted file mode 100644
index fa5f9ee3b149..000000000000
--- a/openhands/agenthub/codeact_agent/agent_skills.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# Agent Skills Documentation
-
-## File Operations
-
-open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None:
-    Opens the file at the given path in the editor. IF the file is to be edited, first use `scroll_down` repeatedly to read the full file!
-    If line_number is provided, the window will be moved to include that line.
-    It only shows the first 100 lines by default! `context_lines` is the max number of lines to be displayed, up to 100. Use `scroll_up` and `scroll_down` to view more content up or down.
-    Args:
-    path: str: The path to the file to open, preferred absolute path.
-    line_number: int | None = 1: The line number to move to. Defaults to 1.
-    context_lines: int | None = 100: Only shows this number of lines in the context window (usually from line 1), with line_number as the center (if possible). Defaults to 100.
-
-goto_line(line_number: int) -> None:
-    Moves the window to show the specified line number.
-    Args:
-    line_number: int: The line number to move to.
-
-scroll_down() -> None:
-    Moves the window down by 100 lines.
-    Args:
-    None
-
-scroll_up() -> None:
-    Moves the window up by 100 lines.
-    Args:
-    None
-
-search_dir(search_term: str, dir_path: str = './') -> None:
-    Searches for search_term in all files in dir. If dir is not provided, searches in the current directory.
-    Args:
-    search_term: str: The term to search for.
-    dir_path: str: The path to the directory to search.
-
-search_file(search_term: str, file_path: str | None = None) -> None:
-    Searches for search_term in file. If file is not provided, searches in the current open file.
-    Args:
-    search_term: str: The term to search for.
-    file_path: str | None: The path to the file to search.
-
-find_file(file_name: str, dir_path: str = './') -> None:
-    Finds all files with the given name in the specified directory.
-    Args:
-    file_name: str: The name of the file to find.
-    dir_path: str: The path to the directory to search.
-
-## Parsers
-
-parse_pdf(file_path: str) -> None:
-    Parses the content of a PDF file and prints it.
-    Args:
-    file_path: str: The path to the file to open.
-
-parse_docx(file_path: str) -> None:
-    Parses the content of a DOCX file and prints it.
-    Args:
-    file_path: str: The path to the file to open.
-
-parse_latex(file_path: str) -> None:
-    Parses the content of a LaTex file and prints it.
-    Args:
-    file_path: str: The path to the file to open.
-
-parse_pptx(file_path: str) -> None:
-    Parses the content of a pptx file and prints it.
-    Args:
-    file_path: str: The path to the file to open.

From e2c343a73329342b08c12a1c9a3e612c484669cc Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 11:22:20 +0200
Subject: [PATCH 69/89] fix useless vars

---
 openhands/agenthub/codeact_agent/agent.yaml | 40 ++++++++++-----------
 openhands/utils/prompt.py                   |  4 +--
 2 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/agent.yaml b/openhands/agenthub/codeact_agent/agent.yaml
index bed955889352..b34940663747 100644
--- a/openhands/agenthub/codeact_agent/agent.yaml
+++ b/openhands/agenthub/codeact_agent/agent.yaml
@@ -19,24 +19,22 @@ custom_blocks:
   general_rules: null
 
 # Agent-specific variables (can be accessed within templates)
-agent_variables:
-  use_tools: false  # whether to use tool-based implementations
-  #tools:  # list of available tools
-  #  - name: "EditTool"
-  #    description: "Edits a file."
-  #    usage: "Use the following format: <file_edit> [file_path] [new_file_content] </file_edit>"
-  # agent skills
-  agent_skills:
-    docs_file: "agent_skills.md"
-    available_skills:
-      - "file_ops:open_file"
-      - "file_ops:goto_line"
-      - "file_ops:scroll_down"
-      - "file_ops:scroll_up"
-      - "file_ops:search_dir"
-      - "file_ops:search_file"
-      - "file_ops:find_file"
-      - "parsers:parse_pdf"
-      - "parsers:parse_docx"
-      - "parsers:parse_latex"
-      - "parsers:parse_pptx"
+use_tools: false  # whether to use tool-based implementations
+# tools:  # list of available tools
+#  - name: "EditTool"
+#    description: "Edits a file."
+#    usage: "Use the following format: <file_edit> [file_path] [new_file_content] </file_edit>"
+# agent skills
+agent_skills:
+  available_skills:
+    - "file_ops:open_file"
+    - "file_ops:goto_line"
+    - "file_ops:scroll_down"
+    - "file_ops:scroll_up"
+    - "file_ops:search_dir"
+    - "file_ops:search_file"
+    - "file_ops:find_file"
+    - "file_reader:parse_pdf"
+    - "file_reader:parse_docx"
+    - "file_reader:parse_latex"
+    - "file_reader:parse_pptx"
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 496cbfa7045b..8249c959d4a5 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -55,9 +55,7 @@ def __init__(
                 config['template']['examples']
             )
 
-            self.available_skills = config['agent_variables']['agent_skills'][
-                'available_skills'
-            ]
+            self.available_skills = config['agent_skills']['available_skills']
         else:
             self._system_template = self._load_template('system_prompt')
             self._agent_skills_template = self._load_template('agent_skills')

From bf9b8acbab28a6ddd196867ddaa954d4556ac7d4 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 11:32:50 +0200
Subject: [PATCH 70/89] kill some whitespace

---
 openhands/agenthub/codeact_agent/agent_skills.j2 | 8 --------
 openhands/agenthub/codeact_agent/user_prompt.j2  | 1 +
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/agent_skills.j2 b/openhands/agenthub/codeact_agent/agent_skills.j2
index e93b551bcc9e..7f34adb69efc 100644
--- a/openhands/agenthub/codeact_agent/agent_skills.j2
+++ b/openhands/agenthub/codeact_agent/agent_skills.j2
@@ -1,11 +1,3 @@
-
 {% for skill_name in available_skills %}
-{% if skill_name.startswith('file_ops:') %}
 {{ get_skill_docstring(skill_name) }}
-{% endif %}
-{% endfor %}
-{% for skill_name in available_skills %}
-{% if skill_name.startswith('file_reader:') %}
-{{ get_skill_docstring(skill_name) }}
-{% endif %}
 {% endfor %}
diff --git a/openhands/agenthub/codeact_agent/user_prompt.j2 b/openhands/agenthub/codeact_agent/user_prompt.j2
index f8d214d46a69..64db4664f5cd 100644
--- a/openhands/agenthub/codeact_agent/user_prompt.j2
+++ b/openhands/agenthub/codeact_agent/user_prompt.j2
@@ -1 +1,2 @@
+{{ examples }}
 NOW, LET'S START!

From 673235989446a4ea2f357ac8a849727b54c53b32 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 11:36:08 +0200
Subject: [PATCH 71/89] strange leftover from another branch

---
 openhands/agenthub/codeact_agent/system_prompt.j2 | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/system_prompt.j2 b/openhands/agenthub/codeact_agent/system_prompt.j2
index 51ca18337856..87d6ecbcb8d1 100644
--- a/openhands/agenthub/codeact_agent/system_prompt.j2
+++ b/openhands/agenthub/codeact_agent/system_prompt.j2
@@ -172,12 +172,8 @@ Apart from the standard Python library, the assistant can also use the following
 
 IMPORTANT:
 - `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
-- The assistant shall adhere to THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRING PROPER INDENTATION.
 - Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 - Any code issued should be less than 50 lines to avoid context being cut off!
-- After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
-- For `edit_file_by_replace` NEVER provide empty parameters!
-- For `edit_file_by_replace` the file must be read fully before any replacements!
 {% endif %}
 {% endset %}
 

From ada2ebd4f5779722a2e11a0ede0e9d76bec94b22 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 12:16:29 +0200
Subject: [PATCH 72/89] tweak agent skill display

---
 openhands/utils/prompt.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 8249c959d4a5..5d2131b0d011 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -1,5 +1,6 @@
 import importlib
 import os
+from inspect import signature
 
 import yaml
 from jinja2 import Environment, FileSystemLoader, Template, TemplateNotFound
@@ -118,7 +119,20 @@ def _get_skill_docstring(self, skill_name: str) -> str:
             module = importlib.import_module(
                 f'openhands.runtime.plugins.agent_skills.{module_name}'
             )
+
+            # find the function
             function = getattr(module, function_name)
+
+            # get the function signature with parameter names, types and return type
+            params = signature(function).parameters
+            param_str = ', '.join(
+                [
+                    f'{name}: {param.annotation.__name__}'
+                    for name, param in params.items()
+                ]
+            )
+            fn_signature = f'{function.__name__}({param_str}) -> {signature(function).return_annotation.__name__}'
+
             cur_doc = function.__doc__
             # remove indentation from docstring and extra empty lines
             cur_doc = '\n'.join(
@@ -126,6 +140,6 @@ def _get_skill_docstring(self, skill_name: str) -> str:
             )
             # now add a consistent 4 indentation
             cur_doc = '\n'.join(map(lambda x: ' ' * 4 + x, cur_doc.split('\n')))
-            return f'{function.__name__}\n{cur_doc}'
+            return f'{fn_signature}\n{cur_doc}'
         except (ImportError, AttributeError):
             return f'Documentation not found for skill: {skill_name}'

From 1df7aaa0cfa2a0455b12d86d42d5c0e1cc14abb8 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 13:10:05 +0200
Subject: [PATCH 73/89] add user-defined template directory

---
 openhands/agenthub/codeact_agent/agent.yaml | 19 +++------
 openhands/utils/prompt.py                   | 47 ++++++++++++---------
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/agent.yaml b/openhands/agenthub/codeact_agent/agent.yaml
index b34940663747..e6f6b538d43b 100644
--- a/openhands/agenthub/codeact_agent/agent.yaml
+++ b/openhands/agenthub/codeact_agent/agent.yaml
@@ -1,24 +1,17 @@
 name: CodeActAgent
 
+# custom templates directory
+# .j2 templates will be loaded from this directory if found, if not, the default will be used
+custom_templates_dir: "user_templates"
+
 # main templates
 template:
-  system_prompt: "system_prompt"  # path to the base template file
+  system_prompt: "system_prompt"  # path to the system template file
   agent_skills: "agent_skills"  # path to the agent skills template file
   examples: "examples"  # path to the examples template file
   user_prompt: "user_prompt"  # path to the initial user prompt template file
 
-# Optional custom blocks to override or extend the base template
-# If null, the default from system_prompt.j2 is used
-custom_blocks:
-  system_prefix: null
-  execute_python: null
-  execute_bash: null
-  execute_browse: null
-  pip_install: null
-  agent_skills: null
-  general_rules: null
-
-# Agent-specific variables (can be accessed within templates)
+# agent-specific variables (can be accessed within templates)
 use_tools: false  # whether to use tool-based implementations
 # tools:  # list of available tools
 #  - name: "EditTool"
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 5d2131b0d011..fe61c25f4c39 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -38,31 +38,44 @@ def __init__(
         self.prompt_dir = prompt_dir
         self.micro_agent = micro_agent
 
-        self.env = Environment(loader=FileSystemLoader(prompt_dir))
-
         # load available skills from YAML
         yaml_path = os.path.join(prompt_dir, 'agent.yaml')
         if os.path.exists(yaml_path):
             with open(yaml_path, 'r') as f:
                 config = yaml.safe_load(f)
+
+            custom_templates_dir = config.get('custom_templates_dir', None)
+            if custom_templates_dir:
+                # custom templates directory is an absolute path or relative to the script location
+                custom_templates_dir = os.path.abspath(custom_templates_dir)
+
+                # prioritize custom_templates_dir over the default templates directory
+                self.env = Environment(
+                    loader=FileSystemLoader([custom_templates_dir, self.prompt_dir])
+                )
+
             self._system_template = self._load_template(
                 config['template']['system_prompt']
             )
             self._agent_skills_template = self._load_template(
                 config['template']['agent_skills']
             )
-            self._user_template = self._load_template(config['template']['user_prompt'])
             self._examples_template = self._load_template(
                 config['template']['examples']
             )
+            self._user_template = self._load_template(config['template']['user_prompt'])
 
             self.available_skills = config['agent_skills']['available_skills']
         else:
+            # no agent.yaml file found, use the default templates
+            self.env = Environment(loader=FileSystemLoader(prompt_dir))
+
             self._system_template = self._load_template('system_prompt')
             self._agent_skills_template = self._load_template('agent_skills')
             self._user_template = self._load_template('user_prompt')
             self._examples_template = self._load_template('examples')
-            self.available_skills = []  # default to empty list if YAML not found
+
+            self.available_skills = []  # FIXME: default to empty list if YAML not found
 
         # TODO: agent config should have a tool use enabled or disabled
         # and we can use that to conditionally load the tools variant of agentskills
@@ -121,25 +134,19 @@ def _get_skill_docstring(self, skill_name: str) -> str:
             )
 
             # find the function
-            function = getattr(module, function_name)
+            agent_skill_fn = getattr(module, function_name)
 
             # get the function signature with parameter names, types and return type
-            params = signature(function).parameters
-            param_str = ', '.join(
-                [
-                    f'{name}: {param.annotation.__name__}'
-                    for name, param in params.items()
-                ]
-            )
-            fn_signature = f'{function.__name__}({param_str}) -> {signature(function).return_annotation.__name__}'
+            fn_signature = f'{agent_skill_fn.__name__}' + str(signature(agent_skill_fn))
+
+            doc = agent_skill_fn.__doc__
 
-            cur_doc = function.__doc__
             # remove indentation from docstring and extra empty lines
-            cur_doc = '\n'.join(
-                filter(None, map(lambda x: x.strip(), cur_doc.split('\n')))
-            )
+            doc = '\n'.join(filter(None, map(lambda x: x.strip(), doc.split('\n'))))
+
             # now add a consistent 4 indentation
-            cur_doc = '\n'.join(map(lambda x: ' ' * 4 + x, cur_doc.split('\n')))
-            return f'{fn_signature}\n{cur_doc}'
-        except (ImportError, AttributeError):
+            doc = '\n'.join(map(lambda x: ' ' * 4 + x, doc.split('\n')))
+            return f'{fn_signature}\n{doc}'
+        except (ImportError, AttributeError) as e:
+            print(e)
             return f'Documentation not found for skill: {skill_name}'

From 4efcc027760f33ba25e170962a0b96bed29cd126 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 14:09:48 +0200
Subject: [PATCH 74/89] ruff

---
 openhands/controller/agent_controller.py | 4 +---
 openhands/controller/state/state.py      | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index b299414b9442..0c2628b3c81f 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -304,9 +304,7 @@ async def _handle_action(self, action: Action):
         elif isinstance(action, AgentRecallAction):
             # llama_index_list = self.long_term_memory.search(action.query, action.history)
             # logger.info(f'llama-index list: {llama_index_list}')
-            litellm_list = self.agent.llm.search(
-                action.query, self.state.history
-            )
+            litellm_list = self.agent.llm.search(action.query, self.state.history)
             logger.info(f'litellm list: {litellm_list}')
 
     async def _handle_observation(self, observation: Observation):
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index 2c7d03766e6b..d8924dba24d0 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -11,9 +11,9 @@
     MessageAction,
 )
 from openhands.events.action.agent import AgentFinishAction, AgentSummarizeAction
-from openhands.llm.metrics import Metrics
 from openhands.events.event import Event, EventSource
 from openhands.events.observation import AgentDelegateObservation
+from openhands.llm.metrics import Metrics
 from openhands.storage.files import FileStore
 
 

From 6f282b90da2182ab8eccee7b5173507cbd4bb000 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 24 Oct 2024 14:11:39 +0200
Subject: [PATCH 75/89] fix user prompt; bad coverage

---
 .../agenthub/codeact_agent/user_prompt.j2     | 10 ++++++++
 tests/unit/test_prompt_manager.py             | 25 ++++++++++++-------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/openhands/agenthub/codeact_agent/user_prompt.j2 b/openhands/agenthub/codeact_agent/user_prompt.j2
index 64db4664f5cd..133c704fbae4 100644
--- a/openhands/agenthub/codeact_agent/user_prompt.j2
+++ b/openhands/agenthub/codeact_agent/user_prompt.j2
@@ -1,2 +1,12 @@
 {{ examples }}
+
+{% if micro_agent %}
+--- BEGIN OF GUIDELINE ---
+The following information may assist you in completing your task:
+
+{{ micro_agent }}
+--- END OF GUIDELINE ---
+{% endif %}
+
+
 NOW, LET'S START!
diff --git a/tests/unit/test_prompt_manager.py b/tests/unit/test_prompt_manager.py
index 2534f73d3ab8..be20456b539d 100644
--- a/tests/unit/test_prompt_manager.py
+++ b/tests/unit/test_prompt_manager.py
@@ -14,7 +14,7 @@ def prompt_dir(tmp_path):
     shutil.copytree('openhands/agenthub/codeact_agent', tmp_path, dirs_exist_ok=True)
 
     # Return the temporary directory path
-    return tmp_path
+    return str(tmp_path)  # Return string path
 
 
 SAMPLE_AGENT_SKILLS_DOCS = """Sample agent skills documentation"""
@@ -26,10 +26,10 @@ def agent_skills_docs():
 
 
 def test_prompt_manager_without_micro_agent(prompt_dir, agent_skills_docs):
-    manager = PromptManager(prompt_dir, agent_skills_docs)
+    manager = PromptManager(prompt_dir)
 
     assert manager.prompt_dir == prompt_dir
-    assert manager.agent_skills_docs == agent_skills_docs
+    # assert manager.agent_skills_docs == agent_skills_docs
     assert manager.micro_agent is None
 
     assert isinstance(manager.system_message, str)
@@ -37,7 +37,7 @@ def test_prompt_manager_without_micro_agent(prompt_dir, agent_skills_docs):
         "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions."
         in manager.system_message
     )
-    assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
+    # assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
     assert isinstance(manager.initial_user_message, str)
     assert '--- BEGIN OF GUIDELINE ---' not in manager.initial_user_message
     assert '--- END OF GUIDELINE ---' not in manager.initial_user_message
@@ -64,12 +64,11 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):
 
     manager = PromptManager(
         prompt_dir=prompt_dir,
-        agent_skills_docs=agent_skills_docs,
         micro_agent=mock_micro_agent,
     )
 
     assert manager.prompt_dir == prompt_dir
-    assert manager.agent_skills_docs == agent_skills_docs
+    # assert manager.agent_skills_docs == agent_skills_docs
     assert manager.micro_agent == mock_micro_agent
 
     assert isinstance(manager.system_message, str)
@@ -77,7 +76,7 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):
         "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions."
         in manager.system_message
     )
-    assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
+    # assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
 
     assert isinstance(manager.initial_user_message, str)
     assert (
@@ -106,11 +105,19 @@ def test_prompt_manager_template_rendering(prompt_dir, agent_skills_docs):
     with open(os.path.join(prompt_dir, 'user_prompt.j2'), 'w') as f:
         f.write('User prompt: {{ micro_agent }}')
 
-    manager = PromptManager(prompt_dir, agent_skills_docs)
+    manager = PromptManager(prompt_dir)
 
-    assert manager.system_message == f'System prompt: {agent_skills_docs}'
+    # assert manager.system_message == f'System prompt: {agent_skills_docs}'
     assert manager.initial_user_message == 'User prompt: None'
 
     # Clean up temporary files
     os.remove(os.path.join(prompt_dir, 'system_prompt.j2'))
     os.remove(os.path.join(prompt_dir, 'user_prompt.j2'))
+
+
+def test_prompt_manager_loads_agent_skill(prompt_dir):
+    manager = PromptManager(prompt_dir)
+    assert (
+        'open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None'
+        in manager.system_message
+    )

From 11d82f238dab6ef41984591ca2203ff29bececef Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Fri, 25 Oct 2024 02:49:26 +0200
Subject: [PATCH 76/89] save events as they happen

---
 openhands/controller/agent_controller.py | 61 ++++--------------------
 1 file changed, 10 insertions(+), 51 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index feff9866d0d9..0224a46d784c 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -69,6 +69,7 @@ class AgentController:
         NullObservation,
         ChangeAgentStateAction,
         AgentStateChangedObservation,
+        FatalErrorObservation,
     )
 
     def __init__(
@@ -134,7 +135,9 @@ async def close(self):
         # we made history, now is the time to rewrite it!
         # the final state.history will be used by external scripts like evals, tests, etc.
         # history will need to be complete WITH delegates events
-        # like the regular agent history, it does not include 'hidden' events nor the default filtered out types (backend events)
+        # like the regular agent history, it does not include:
+        # - 'hidden' events, events with hidden=True
+        # - backend events (the default 'filtered out' types, types in self.filter_out)
         start_id = self.state.start_id if self.state.start_id != -1 else 0
         end_id = (
             self.state.end_id
@@ -158,56 +161,6 @@ def update_state_before_step(self):
         self.state.iteration += 1
         self.state.local_iteration += 1
 
-        # get the history from the event stream
-        # first define the range of events to fetch
-        start_id = self.state.start_id if self.state.start_id != -1 else 0
-        end_id = (
-            self.state.end_id
-            if self.state.end_id != -1
-            else self.event_stream.get_latest_event_id()
-        )
-
-        # fetch events directly from the event stream
-        # filtering out what an agent history should not include:
-        # - "backend" event types that should not be sent to the agent
-        # - hidden events
-        self.state.history = list(
-            self.event_stream.get_events(
-                start_id=start_id,
-                end_id=end_id,
-                reverse=False,
-                filter_out_type=self.filter_out,
-                filter_hidden=True,
-            )
-        )
-
-        # also, we exclude finished delegates from the parent agent's history:
-        # - do not include events between delegate actions and observations
-        # - include the delegate action and observation themselves
-        if self.state.delegates:
-            for (delegate_start_id, delegate_end_id), (
-                delegate_agent,
-                delegate_task,
-            ) in self.state.delegates.items():
-                # sanity checks
-                if (
-                    delegate_start_id < 0
-                    or delegate_end_id < 1
-                    or delegate_start_id >= delegate_end_id
-                    or delegate_end_id >= len(self.state.history)
-                ):
-                    logger.error(
-                        f'Invalid delegate ids: {delegate_start_id}, {delegate_end_id}. Skipping...'
-                    )
-                    continue
-
-                # exclude delegate events from history
-                self.state.history = [
-                    event
-                    for event in self.state.history
-                    if not (delegate_start_id < event.id < delegate_end_id)
-                ]
-
     async def update_state_after_step(self):
         # update metrics especially for cost. Use deepcopy to avoid it being modified by agent.reset()
         self.state.local_metrics = copy.deepcopy(self.agent.llm.metrics)
@@ -264,6 +217,11 @@ async def on_event(self, event: Event):
         """
         if hasattr(event, 'hidden') and event.hidden:
             return
+
+        # if the event is not filtered out, add it to the history
+        if not any(isinstance(event, filter_type) for filter_type in self.filter_out):
+            self.state.history.append(event)
+
         if isinstance(event, Action):
             await self._handle_action(event)
         elif isinstance(event, Observation):
@@ -733,6 +691,7 @@ def set_initial_state(
         # - delegates_ids
 
         # if start_id was not set in State, we're starting fresh, at the top of the stream
+        # does this still happen?
         if self.state.start_id <= -1:
             self.state.start_id = self.event_stream.get_latest_event_id() + 1
         else:

From 54f60acc0b60d514c651da6d3ac2bfb95a23f79f Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Fri, 25 Oct 2024 02:50:53 +0200
Subject: [PATCH 77/89] clean up obsolete config var - sessions are always
 saved if filestore is appropriate

---
 openhands/core/config/app_config.py |  2 --
 openhands/core/main.py              | 26 ++++++++++++++------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/openhands/core/config/app_config.py b/openhands/core/config/app_config.py
index dd7661af1712..cd2283589c4c 100644
--- a/openhands/core/config/app_config.py
+++ b/openhands/core/config/app_config.py
@@ -40,7 +40,6 @@ class AppConfig:
         e2b_api_key: The E2B API key.
         disable_color: Whether to disable color. For terminals that don't support color.
         debug: Whether to enable debugging.
-        enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
         file_uploads_max_file_size_mb: Maximum file size for uploads in megabytes. 0 means no limit.
         file_uploads_restrict_file_types: Whether to restrict file types for file uploads. Defaults to False.
         file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
@@ -72,7 +71,6 @@ class AppConfig:
     disable_color: bool = False
     jwt_secret: str = uuid.uuid4().hex
     debug: bool = False
-    enable_cli_session: bool = False
     file_uploads_max_file_size_mb: int = 0
     file_uploads_restrict_file_types: bool = False
     file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
diff --git a/openhands/core/main.py b/openhands/core/main.py
index d68c4218514c..adc7b9e76941 100644
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -124,16 +124,18 @@ async def run_controller(
         runtime = create_runtime(config, sid=sid)
 
     event_stream = runtime.event_stream
-    # restore cli session if enabled
+
+    # restore cli session if available
     initial_state = None
-    if config.enable_cli_session:
-        try:
-            logger.info(f'Restoring agent state from cli session {event_stream.sid}')
-            initial_state = State.restore_from_session(
-                event_stream.sid, event_stream.file_store
-            )
-        except Exception as e:
-            logger.info(f'Error restoring state: {e}')
+    try:
+        logger.debug(
+            f'Trying to restore agent state from cli session {event_stream.sid} if available'
+        )
+        initial_state = State.restore_from_session(
+            event_stream.sid, event_stream.file_store
+        )
+    except Exception as e:
+        logger.debug(f'Cannot restore agent state: {e}')
 
     # init controller with this initial state
     controller = AgentController(
@@ -159,7 +161,7 @@ async def run_controller(
     )
 
     # start event is a MessageAction with the task, either resumed or new
-    if config.enable_cli_session and initial_state is not None:
+    if initial_state is not None:
         # we're resuming the previous session
         event_stream.add_event(
             MessageAction(
@@ -170,7 +172,7 @@ async def run_controller(
             ),
             EventSource.USER,
         )
-    elif initial_state is None:
+    else:
         # init with the provided actions
         event_stream.add_event(initial_user_action, EventSource.USER)
 
@@ -197,7 +199,7 @@ async def on_event(event: Event):
         await asyncio.sleep(1)  # Give back control for a tick, so the agent can run
 
     # save session when we're about to close
-    if config.enable_cli_session:
+    if config.file_store is not None and config.file_store != 'memory':
         end_state = controller.get_state()
         # NOTE: the saved state does not include delegates events
         end_state.save_to_session(event_stream.sid, event_stream.file_store)

From c6a90289167549a8a85013f95a2dde3de0186fbf Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sat, 26 Oct 2024 02:34:59 +0200
Subject: [PATCH 78/89] init history for restored state

---
 openhands/controller/agent_controller.py | 65 ++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 156cf062e7a4..3deae4c2c825 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -679,7 +679,10 @@ def set_initial_state(
                 confirmation_mode=confirmation_mode,
             )
         else:
+            # restored state or from a parent agent does not have history
             self.state = state
+            # hmm from a parent it doesn't need it, too
+            self._init_history()
 
         # FIXME when restored from a previous session, the State object needs to have:
         # - history? let's go with nope
@@ -695,6 +698,68 @@ def set_initial_state(
                 f'AgentController {self.id} restoring from event {self.state.start_id}'
             )
 
+    def _init_history(self):
+        # old-style saved states did not save history
+        # and didn't even have history as a field
+        if not hasattr(self.state, 'history'):
+            logger.debug(
+                'Restored state does not have history, initializing empty history.'
+            )
+            self.state.history = []
+
+        # get the history from the event stream
+        # first define the range of events to fetch
+        start_id = self.state.start_id if self.state.start_id != -1 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id != -1
+            else self.event_stream.get_latest_event_id()
+        )
+
+        # fetch events directly from the event stream
+        # filtering out what an agent history should not include:
+        # - "backend" event types that should not be sent to the agent
+        # - hidden events
+        history = list(
+            self.event_stream.get_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+                filter_out_type=self.filter_out,
+                filter_hidden=True,
+            )
+        )
+
+        # also, we exclude finished delegates from the parent agent's history:
+        # - do not include events between delegate actions and observations
+        # - include the delegate action and observation themselves
+        if self.state.delegates:
+            for (delegate_start_id, delegate_end_id), (
+                delegate_agent,
+                delegate_task,
+            ) in self.state.delegates.items():
+                # sanity checks
+                if (
+                    delegate_start_id < 0
+                    or delegate_end_id < 1
+                    or delegate_start_id >= delegate_end_id
+                    or delegate_end_id >= len(history)
+                ):
+                    logger.error(
+                        f'Invalid delegate ids: {delegate_start_id}, {delegate_end_id}. Skipping...'
+                    )
+                    continue
+
+                # exclude delegate events from history
+                history = [
+                    event
+                    for event in history
+                    if not (delegate_start_id < event.id < delegate_end_id)
+                ]
+
+        # we figured out what the history is, now we can set it
+        self.state.history = history
+
     def _is_stuck(self):
         """Checks if the agent or its delegate is stuck in a loop.
 

From 9af6e5ef0e0c3bb059f06ef3b413059717c8ec68 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sat, 26 Oct 2024 02:36:56 +0200
Subject: [PATCH 79/89] not worth caching delegates if only used once or twice
 per session

---
 openhands/controller/agent_controller.py | 52 ------------------------
 1 file changed, 52 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 3deae4c2c825..0463c2c9bb7d 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -286,8 +286,6 @@ async def _handle_observation(self, observation: Observation):
 
         if isinstance(observation, CmdOutputObservation):
             return
-        elif isinstance(observation, AgentDelegateObservation):
-            self._handle_delegate_observation(observation)
         elif isinstance(observation, ErrorObservation):
             if self.state.agent_state == AgentState.ERROR:
                 self.state.metrics.merge(self.state.local_metrics)
@@ -313,56 +311,6 @@ async def _handle_message_action(self, action: MessageAction):
         elif action.source == EventSource.AGENT and action.wait_for_response:
             await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
 
-    def _handle_delegate_observation(self, observation: Observation):
-        """Handles delegate observations from the event stream.
-
-        Args:
-            observation (Observation): The observation to handle.
-        """
-        if not isinstance(observation, AgentDelegateObservation):
-            return
-
-        logger.debug('AgentDelegateObservation received')
-
-        # define the end_id based on the current observation
-        delegate_end = observation.id
-        if delegate_end <= 0:
-            logger.error(
-                f'The id of the AgentDelegateObservation is not valid: {delegate_end}'
-            )
-            return
-
-        # define the start_id by searching for the corresponding AgentDelegateAction
-        delegate_start = -1
-        delegate_agent: str = ''
-        delegate_task: str = ''
-
-        # search through events in reverse to find the AgentDelegateAction
-        for prev_event in self.event_stream.get_events(
-            end_id=observation.id - 1, reverse=True
-        ):
-            # retrieve the last AgentDelegateAction before this observation
-            if isinstance(prev_event, AgentDelegateAction):
-                delegate_start = prev_event.id
-                delegate_agent = prev_event.agent
-                delegate_task = prev_event.inputs.get('task', '')
-                break
-
-        if delegate_start == -1:
-            logger.error(
-                f'No AgentDelegateAction found for AgentDelegateObservation with id={delegate_end}'
-            )
-            return
-
-        # add the event ids to the delegates dictionary
-        self.state.delegates[(delegate_start, delegate_end)] = (
-            delegate_agent,
-            delegate_task,
-        )
-        logger.debug(
-            f'Delegate {delegate_agent} with task {delegate_task} ran from id={delegate_start} to id={delegate_end}'
-        )
-
     def reset_task(self):
         """Resets the agent's task."""
 

From 04b6d70c259888f607dca88fd197ffcd98cfe19d Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 27 Oct 2024 01:55:54 +0200
Subject: [PATCH 80/89] init history from the event stream

---
 openhands/controller/agent_controller.py | 117 ++++++++++++++---------
 1 file changed, 70 insertions(+), 47 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 0463c2c9bb7d..7bf167c461e9 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -627,36 +627,43 @@ def set_initial_state(
                 confirmation_mode=confirmation_mode,
             )
         else:
+            # FIXME when restored from a previous session, the State object needs to have:
+            # - history? no, read it from the event stream
+            # - start_id, potentially end_id
+            # - delegates_ids - no, read it from the event stream if wanted
+
             # restored state or from a parent agent does not have history
             self.state = state
-            # hmm from a parent it doesn't need it, too
-            self._init_history()
 
-        # FIXME when restored from a previous session, the State object needs to have:
-        # - history? let's go with nope
-        # - start_id, end_id
-        # - delegates_ids
+            # if start_id was not set in State, we're starting fresh, at the top of the stream
+            # does this still happen?
+            if self.state.start_id <= -1:
+                self.state.start_id = self.event_stream.get_latest_event_id() + 1
+            else:
+                logger.debug(
+                    f'AgentController {self.id} initializing history from event {self.state.start_id}'
+                )
 
-        # if start_id was not set in State, we're starting fresh, at the top of the stream
-        # does this still happen?
-        if self.state.start_id <= -1:
-            self.state.start_id = self.event_stream.get_latest_event_id() + 1
-        else:
-            logger.debug(
-                f'AgentController {self.id} restoring from event {self.state.start_id}'
-            )
+            self._init_history()
 
     def _init_history(self):
-        # old-style saved states did not save history
-        # and didn't even have history as a field
+        """Initializes the agent's history from the event stream.
+
+        The history is a list of events that:
+        - Excludes events of types listed in self.filter_out
+        - Excludes events with hidden=True attribute
+        - For delegate events (between AgentDelegateAction and AgentDelegateObservation):
+            - Excludes all events between the action and observation
+            - Includes the delegate action and observation themselves
+        """
+        # Initialize empty history if not present (for old-style saved states)
         if not hasattr(self.state, 'history'):
             logger.debug(
                 'Restored state does not have history, initializing empty history.'
             )
             self.state.history = []
 
-        # get the history from the event stream
-        # first define the range of events to fetch
+        # Define range of events to fetch
         start_id = self.state.start_id if self.state.start_id != -1 else 0
         end_id = (
             self.state.end_id
@@ -664,11 +671,8 @@ def _init_history(self):
             else self.event_stream.get_latest_event_id()
         )
 
-        # fetch events directly from the event stream
-        # filtering out what an agent history should not include:
-        # - "backend" event types that should not be sent to the agent
-        # - hidden events
-        history = list(
+        # Get all events, filtering out backend events and hidden events
+        events = list(
             self.event_stream.get_events(
                 start_id=start_id,
                 end_id=end_id,
@@ -678,35 +682,54 @@ def _init_history(self):
             )
         )
 
-        # also, we exclude finished delegates from the parent agent's history:
-        # - do not include events between delegate actions and observations
-        # - include the delegate action and observation themselves
-        if self.state.delegates:
-            for (delegate_start_id, delegate_end_id), (
-                delegate_agent,
-                delegate_task,
-            ) in self.state.delegates.items():
-                # sanity checks
-                if (
-                    delegate_start_id < 0
-                    or delegate_end_id < 1
-                    or delegate_start_id >= delegate_end_id
-                    or delegate_end_id >= len(history)
-                ):
+        # Find all delegate action/observation pairs
+        delegate_ranges: list[tuple[int, int]] = []
+        delegate_action_ids: list[int] = []  # stack of unmatched delegate action IDs
+
+        for event in events:
+            if isinstance(event, AgentDelegateAction):
+                delegate_action_ids.append(event.id)
+                # Note: we can get agent=event.agent and task=event.inputs.get('task','')
+                # if we need to track these in the future
+
+            elif isinstance(event, AgentDelegateObservation):
+                # Match with most recent unmatched delegate action
+                if not delegate_action_ids:
                     logger.error(
-                        f'Invalid delegate ids: {delegate_start_id}, {delegate_end_id}. Skipping...'
+                        f'Found AgentDelegateObservation without matching action at id={event.id}'
                     )
                     continue
 
-                # exclude delegate events from history
-                history = [
-                    event
-                    for event in history
-                    if not (delegate_start_id < event.id < delegate_end_id)
-                ]
+                action_id = delegate_action_ids.pop()
+                delegate_ranges.append((action_id, event.id))
+
+        # Filter out events between delegate action/observation pairs
+        if delegate_ranges:
+            filtered_events: list[Event] = []
+            current_idx = 0
+
+            for start_id, end_id in sorted(delegate_ranges):
+                # Add events before delegate range
+                filtered_events.extend(
+                    event for event in events[current_idx:] if event.id < start_id
+                )
+
+                # Add delegate action and observation
+                filtered_events.extend(
+                    event for event in events if event.id in (start_id, end_id)
+                )
+
+                # Update index to after delegate range
+                current_idx = next(
+                    (i for i, e in enumerate(events) if e.id > end_id), len(events)
+                )
 
-        # we figured out what the history is, now we can set it
-        self.state.history = history
+            # Add any remaining events after last delegate range
+            filtered_events.extend(events[current_idx:])
+
+            self.state.history = filtered_events
+        else:
+            self.state.history = events
 
     def _is_stuck(self):
         """Checks if the agent or its delegate is stuck in a loop.

From 34e0f8a882a1d91a5fd95146b35582f51f8b63b7 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 27 Oct 2024 03:47:05 +0100
Subject: [PATCH 81/89] remove script that got here by accident

---
 evaluation/swe_bench/process_evals.py | 194 --------------------------
 1 file changed, 194 deletions(-)
 delete mode 100644 evaluation/swe_bench/process_evals.py

diff --git a/evaluation/swe_bench/process_evals.py b/evaluation/swe_bench/process_evals.py
deleted file mode 100644
index 3202d10de758..000000000000
--- a/evaluation/swe_bench/process_evals.py
+++ /dev/null
@@ -1,194 +0,0 @@
-import json
-import os
-from typing import Any, Dict, List, Optional
-
-import pandas as pd
-
-
-# Placeholder for litellm's get_tokens function
-def get_tokens(text: str) -> int:
-    """
-    Returns the number of tokens in the given text.
-    Replace this function with litellm's actual get_tokens implementation.
-    """
-    # Example implementation (to be replaced)
-    return len(text.split())
-
-
-class EvalOutput:
-    def __init__(
-        self,
-        instance_id: str,
-        instruction: str,
-        instance: Dict[str, Any],
-        test_result: Dict[str, Any],
-        metadata: Dict[str, Any],
-        history: List[Dict[str, Any]],
-        metrics: Optional[Dict[str, Any]] = None,
-        llm_completions: Optional[List[Any]] = None,
-        error: Optional[str] = None,
-    ):
-        self.instance_id = instance_id
-        self.instruction = instruction
-        self.instance = instance
-        self.test_result = test_result
-        self.metadata = metadata
-        self.history = history
-        self.metrics = metrics
-        self.llm_completions = llm_completions or []
-        self.error = error
-
-    @staticmethod
-    def from_dict(data: Dict[str, Any]) -> 'EvalOutput':
-        return EvalOutput(
-            instance_id=data.get('instance_id', ''),
-            instruction=data.get('instruction', ''),
-            instance=data.get('instance', {}),
-            test_result=data.get('test_result', {}),
-            metadata=data.get('metadata', {}),
-            history=data.get('history', []),
-            metrics=data.get('metrics', None),
-            llm_completions=data.get('llm_completions', []),
-            error=data.get('error', None),
-        )
-
-
-def load_eval_outputs(jsonl_path: str) -> List[EvalOutput]:
-    eval_outputs = []
-    with open(jsonl_path, 'r') as file:
-        content = file.read()
-        try:
-            # Try parsing as a single JSON object
-            data = json.loads(content)
-            eval_output = EvalOutput.from_dict(data)
-            eval_outputs.append(eval_output)
-        except json.JSONDecodeError:
-            # If single JSON parse fails, try parsing as JSON Lines
-            file.seek(0)
-            for line_number, line in enumerate(file, start=1):
-                if line.strip():  # Skip empty lines
-                    try:
-                        data = json.loads(line)
-                        eval_output = EvalOutput.from_dict(data)
-                        eval_outputs.append(eval_output)
-                    except json.JSONDecodeError as e:
-                        print(
-                            f'Failed to parse line {line_number} in {jsonl_path}: {e}'
-                        )
-                        print(
-                            f'Problematic line: {line[:100]}...'
-                        )  # Print first 100 chars of the line
-
-    if not eval_outputs:
-        print(f'Warning: No valid data found in {jsonl_path}')
-
-    return eval_outputs
-
-
-def process_llm_completions(eval_output: EvalOutput) -> List[Dict[str, Any]]:
-    """
-    Processes the llm_completions of an EvalOutput to extract prompts (including system prompt) and responses.
-    Handles both dictionary and string content formats.
-
-    Args:
-        eval_output (EvalOutput): The evaluation output instance.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing 'system_prompt', 'user_prompts', and 'response'.
-    """
-    completions = []
-    for comp in eval_output.llm_completions:
-        messages = comp.get('messages', [])
-        response_content = comp.get('response', {}).get('choices', [])
-
-        # Initialize prompts and response
-        system_prompt = ''
-        user_prompts = []
-        response = ''
-
-        # Extract system prompt and user prompts
-        for message in messages:
-            role = message.get('role')
-            content = message.get('content', [])
-
-            if role == 'system':
-                system_prompt = _extract_text_content(content)
-            elif role == 'user':
-                user_prompts.append(_extract_text_content(content))
-
-        # Extract the assistant's response
-        if response_content and len(response_content) > 0:
-            response = response_content[0].get('message', {}).get('content', '').strip()
-
-        completions.append(
-            {
-                'system_prompt': system_prompt,
-                'user_prompts': user_prompts,
-                'response': response,
-            }
-        )
-    return completions
-
-
-def _extract_text_content(content):
-    if isinstance(content, list):
-        return '\n'.join(
-            item.get('text', '').strip()
-            for item in content
-            if isinstance(item, dict) and item.get('type') == 'text'
-        )
-    elif isinstance(content, str):
-        return content.strip()
-    return ''
-
-
-def create_results_dataframe(eval_outputs: List[EvalOutput]) -> pd.DataFrame:
-    records = []
-    for eo in eval_outputs:
-        completions = process_llm_completions(eo)
-        for comp in completions:
-            prompt = comp['prompt']
-            response = comp['response']
-            token_count = get_tokens(prompt)
-            records.append(
-                {
-                    'instance_id': eo.instance_id,
-                    'prompt': prompt,
-                    'response': response,
-                    'token_count': token_count,
-                }
-            )
-    df = pd.DataFrame(records)
-    return df
-
-
-def main():
-    swe_bench_dir = 'evaluation/evaluation_outputs/outputs/swe-bench-lite/CodeActAgent/'
-    results = []
-
-    # Traverse through all subdirectories to find output.jsonl files
-    for root, dirs, files in os.walk(swe_bench_dir):
-        for file in files:
-            if file.endswith('output.jsonl') or file.endswith('output-pretty.jsonl'):
-                jsonl_path = os.path.join(root, file)
-                print(f'Processing file: {jsonl_path}')
-                try:
-                    eval_outputs = load_eval_outputs(jsonl_path)
-                    if eval_outputs:
-                        df = create_results_dataframe(eval_outputs)
-                        results.append(df)
-                    else:
-                        print(f'No valid data found in {jsonl_path}')
-                except Exception as e:
-                    print(f'Error processing {jsonl_path}: {e}')
-
-    if results:
-        final_df = pd.concat(results, ignore_index=True)
-        final_df.to_csv('swe_bench_evaluation_results.csv', index=False)
-        print('Results saved to swe_bench_evaluation_results.csv')
-    else:
-        print('No valid data found in any of the processed files.')
-
-
-if __name__ == '__main__':
-    main()

From 94c68be15fc64a995e90b1dee14d9f6904c2dcb6 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 27 Oct 2024 04:16:36 +0100
Subject: [PATCH 82/89] save/restore state automatically

---
 openhands/controller/state/state.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index 6afa0c77cf19..df7f77571ae9 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -133,6 +133,19 @@ def restore_from_session(sid: str, file_store: FileStore) -> 'State':
         state.agent_state = AgentState.LOADING
         return state
 
+    def __getstate__(self):
+        # don't pickle history, it will be restored from the event stream
+        state = self.__dict__.copy()
+        state.pop('history', None)
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+
+        # make sure we always have the attribute history
+        if not hasattr(self, 'history'):
+            self.history = []
+
     def get_current_user_intent(self) -> tuple[str | None, list[str] | None]:
         """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
         last_user_message = None

From 93cfd323a2eceefb74e8f0ec86d063672eac7af9 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 27 Oct 2024 06:20:41 +0100
Subject: [PATCH 83/89] tweak init/restore

---
 openhands/controller/agent_controller.py | 15 ++++++++-------
 openhands/controller/state/state.py      |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 7bf167c461e9..416a29178cb6 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -632,7 +632,6 @@ def set_initial_state(
             # - start_id, potentially end_id
             # - delegates_ids - no, read it from the event stream if wanted
 
-            # restored state or from a parent agent does not have history
             self.state = state
 
             # if start_id was not set in State, we're starting fresh, at the top of the stream
@@ -656,12 +655,6 @@ def _init_history(self):
             - Excludes all events between the action and observation
             - Includes the delegate action and observation themselves
         """
-        # Initialize empty history if not present (for old-style saved states)
-        if not hasattr(self.state, 'history'):
-            logger.debug(
-                'Restored state does not have history, initializing empty history.'
-            )
-            self.state.history = []
 
         # Define range of events to fetch
         start_id = self.state.start_id if self.state.start_id != -1 else 0
@@ -671,6 +664,14 @@ def _init_history(self):
             else self.event_stream.get_latest_event_id()
         )
 
+        # sanity check
+        if start_id > end_id + 1:
+            logger.debug(
+                f'start_id {start_id} is greater than end_id + 1 ({end_id + 1}). History will be empty.'
+            )
+            self.state.history = []
+            return
+
         # Get all events, filtering out backend events and hidden events
         events = list(
             self.event_stream.get_events(
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index df7f77571ae9..e2678e37b1ba 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -136,7 +136,7 @@ def restore_from_session(sid: str, file_store: FileStore) -> 'State':
     def __getstate__(self):
         # don't pickle history, it will be restored from the event stream
         state = self.__dict__.copy()
-        state.pop('history', None)
+        state['history'] = []
         return state
 
     def __setstate__(self, state):

From f53e1cf1180d505be3c27020c44a5739983e8d55 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 27 Oct 2024 07:49:32 +0100
Subject: [PATCH 84/89] set delegates start explicitly; minor tweaks

---
 openhands/controller/agent_controller.py | 40 +++++++++++-------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 38e0b26c4a19..912ab01ca8c8 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -129,7 +129,9 @@ def __init__(
         self._stuck_detector = StuckDetector(self.state)
 
     async def close(self):
-        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream."""
+        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream.
+
+        Note that it's fairly important that this closes properly, otherwise the state is incomplete."""
         await self.set_agent_state_to(AgentState.STOPPED)
 
         # we made history, now is the time to rewrite it!
@@ -138,10 +140,10 @@ async def close(self):
         # like the regular agent history, it does not include:
         # - 'hidden' events, events with hidden=True
         # - backend events (the default 'filtered out' types, types in self.filter_out)
-        start_id = self.state.start_id if self.state.start_id != -1 else 0
+        start_id = self.state.start_id if self.state.start_id >= 0 else 0
         end_id = (
             self.state.end_id
-            if self.state.end_id != -1
+            if self.state.end_id >= 0
             else self.event_stream.get_latest_event_id()
         )
         self.state.history = list(
@@ -414,6 +416,8 @@ async def start_delegate(self, action: AgentDelegateAction):
             delegate_level=self.state.delegate_level + 1,
             # global metrics should be shared between parent and child
             metrics=self.state.metrics,
+            # start on top of the stream
+            start_id=self.event_stream.get_latest_event_id() + 1,
         )
         logger.info(
             f'[Agent Controller {self.id}]: start delegate, creating agent {delegate_agent.name} using LLM {llm}'
@@ -624,8 +628,10 @@ def set_initial_state(
             max_iterations: The maximum number of iterations allowed for the task.
             confirmation_mode: Whether to enable confirmation mode.
         """
-        # state from the previous session, state from a parent agent, or a new state
-        # note that this is called twice when restoring a previous session, first with state=None
+        # state can come from:
+        # - the previous session, in which case it has history
+        # - from a parent agent, in which case it has no history
+        # - None / a new state
         if state is None:
             self.state = State(
                 inputs={},
@@ -633,21 +639,11 @@ def set_initial_state(
                 confirmation_mode=confirmation_mode,
             )
         else:
-            # FIXME when restored from a previous session, the State object needs to have:
-            # - history? no, read it from the event stream
-            # - start_id, potentially end_id
-            # - delegates_ids - no, read it from the event stream if wanted
-
             self.state = state
 
-            # if start_id was not set in State, we're starting fresh, at the top of the stream
-            # does this still happen?
-            if self.state.start_id <= -1:
-                self.state.start_id = self.event_stream.get_latest_event_id() + 1
-            else:
-                logger.debug(
-                    f'AgentController {self.id} initializing history from event {self.state.start_id}'
-                )
+            logger.debug(
+                f'AgentController {self.id} initializing history from event {self.state.start_id}'
+            )
 
             self._init_history()
 
@@ -662,11 +658,13 @@ def _init_history(self):
             - Includes the delegate action and observation themselves
         """
 
-        # Define range of events to fetch
-        start_id = self.state.start_id if self.state.start_id != -1 else 0
+        # define range of events to fetch
+        # delegates start with a start_id and initially won't find any events
+        # otherwise we're restoring a previous session
+        start_id = self.state.start_id if self.state.start_id >= 0 else 0
         end_id = (
             self.state.end_id
-            if self.state.end_id != -1
+            if self.state.end_id >= 0
             else self.event_stream.get_latest_event_id()
         )
 

From f42cbedfe3536859d8f7ef2be09baa77718840c0 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 27 Oct 2024 08:14:20 +0100
Subject: [PATCH 85/89] fix tests

---
 tests/unit/test_codeact_agent.py  |  3 +-
 tests/unit/test_prompt_caching.py | 59 ++++++++++++++++++++-----------
 2 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/tests/unit/test_codeact_agent.py b/tests/unit/test_codeact_agent.py
index 55dfa3feb75b..14ca1ee639e4 100644
--- a/tests/unit/test_codeact_agent.py
+++ b/tests/unit/test_codeact_agent.py
@@ -92,5 +92,4 @@ def test_error_observation_message(agent: CodeActAgent):
 def test_unknown_observation_message(agent: CodeActAgent):
     obs = Mock()
 
-    with pytest.raises(ValueError, match='Unknown observation type:'):
-        agent.get_observation_message(obs)
+    assert agent.get_observation_message(obs) is None
diff --git a/tests/unit/test_prompt_caching.py b/tests/unit/test_prompt_caching.py
index a3728ff9ea54..db4eb2a56dab 100644
--- a/tests/unit/test_prompt_caching.py
+++ b/tests/unit/test_prompt_caching.py
@@ -4,11 +4,9 @@
 
 from openhands.agenthub.codeact_agent.codeact_agent import CodeActAgent
 from openhands.core.config import AgentConfig, LLMConfig
-from openhands.events import EventStream
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.llm.llm import LLM
-from openhands.storage import get_file_store
 
 
 @pytest.fixture
@@ -19,26 +17,30 @@ def mock_llm():
     return llm
 
 
-@pytest.fixture
-def mock_event_stream(tmp_path):
-    file_store = get_file_store('local', str(tmp_path))
-    return EventStream('test_session', file_store)
-
-
 @pytest.fixture
 def codeact_agent(mock_llm):
     config = AgentConfig()
     return CodeActAgent(mock_llm, config)
 
 
-def test_get_messages_with_reminder(codeact_agent, mock_event_stream):
-    # Add some events to the history list
+def test_get_messages_with_reminder(codeact_agent: CodeActAgent):
+    # Add some events to history
     history = list()
-    history.append(MessageAction('Initial user message'))
-    history.append(MessageAction('Sure!'))
-    history.append(MessageAction('Hello, agent!'))
-    history.append(MessageAction('Hello, user!'))
-    history.append(MessageAction('Laaaaaaaast!'))
+    message_action_1 = MessageAction('Initial user message')
+    message_action_1._source = 'user'
+    history.append(message_action_1)
+    message_action_2 = MessageAction('Sure!')
+    message_action_2._source = 'assistant'
+    history.append(message_action_2)
+    message_action_3 = MessageAction('Hello, agent!')
+    message_action_3._source = 'user'
+    history.append(message_action_3)
+    message_action_4 = MessageAction('Hello, user!')
+    message_action_4._source = 'assistant'
+    history.append(message_action_4)
+    message_action_5 = MessageAction('Laaaaaaaast!')
+    message_action_5._source = 'user'
+    history.append(message_action_5)
 
     codeact_agent.reset()
     messages = codeact_agent._get_messages(
@@ -72,12 +74,16 @@ def test_get_messages_with_reminder(codeact_agent, mock_event_stream):
     )
 
 
-def test_get_messages_prompt_caching(codeact_agent, mock_event_stream):
+def test_get_messages_prompt_caching(codeact_agent: CodeActAgent):
     history = list()
     # Add multiple user and agent messages
     for i in range(15):
-        history.append(MessageAction(f'User message {i}'))
-        history.append(MessageAction(f'Agent message {i}'))
+        message_action_user = MessageAction(f'User message {i}')
+        message_action_user._source = 'user'
+        history.append(message_action_user)
+        message_action_agent = MessageAction(f'Agent message {i}')
+        message_action_agent._source = 'assistant'
+        history.append(message_action_agent)
 
     codeact_agent.reset()
     messages = codeact_agent._get_messages(
@@ -102,12 +108,18 @@ def test_get_messages_prompt_caching(codeact_agent, mock_event_stream):
     assert cached_user_messages[3].content[0].text.startswith('User message 1')
 
 
-def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
+def test_get_messages_with_cmd_action(codeact_agent: CodeActAgent):
     history = list()
     # Add a mix of actions and observations
-    history.append(MessageAction("Let's list the contents of the current directory."))
+    message_action_1 = MessageAction(
+        "Let's list the contents of the current directory."
+    )
+    message_action_1._source = 'user'
+    history.append(message_action_1)
 
     cmd_action_1 = CmdRunAction('ls -l', thought='List files in current directory')
+    cmd_action_1._source = 'agent'
+    cmd_action_1._id = 'cmd_1'
     history.append(cmd_action_1)
 
     cmd_observation_1 = CmdOutputObservation(
@@ -116,12 +128,16 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
         command='ls -l',
         exit_code=0,
     )
+    cmd_observation_1._source = 'user'
     history.append(cmd_observation_1)
 
     message_action_2 = MessageAction("Now, let's create a new directory.")
+    message_action_2._source = 'agent'
     history.append(message_action_2)
 
     cmd_action_2 = CmdRunAction('mkdir new_directory', thought='Create a new directory')
+    cmd_action_2._source = 'agent'
+    cmd_action_2._id = 'cmd_2'
     history.append(cmd_action_2)
 
     cmd_observation_2 = CmdOutputObservation(
@@ -130,6 +146,7 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
         command='mkdir new_directory',
         exit_code=0,
     )
+    cmd_observation_2._source = 'user'
     history.append(cmd_observation_2)
 
     codeact_agent.reset()
@@ -176,7 +193,7 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
     assert 'ENVIRONMENT REMINDER: You have 5 turns' in messages[5].content[1].text
 
 
-def test_prompt_caching_headers(codeact_agent, mock_event_stream):
+def test_prompt_caching_headers(codeact_agent: CodeActAgent):
     history = list()
     # Setup
     history.append(MessageAction('Hello, agent!'))

From ebeab75373e5f6a9d46fa88e74120ab407636b71 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 27 Oct 2024 08:27:46 +0100
Subject: [PATCH 86/89] clean up verbose log

---
 openhands/controller/agent_controller.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 912ab01ca8c8..2471c3af7b14 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -443,9 +443,6 @@ async def _step(self) -> None:
             return
 
         if self._pending_action:
-            logger.debug(
-                f'{self.agent.name} LEVEL {self.state.delegate_level} LOCAL STEP {self.state.local_iteration} GLOBAL STEP {self.state.iteration} awaiting pending action to get executed: {self._pending_action}'
-            )
             await asyncio.sleep(1)
             return
 

From a213c65b8a43a06c9d1fdfe6a372b146135a1d3e Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 27 Oct 2024 09:02:01 +0100
Subject: [PATCH 87/89] make extra sure we have a valid start

---
 openhands/controller/agent_controller.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 2471c3af7b14..0b0e4ec6cfe4 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -638,6 +638,9 @@ def set_initial_state(
         else:
             self.state = state
 
+            if self.state.start_id <= -1:
+                self.state.start_id = 0
+
             logger.debug(
                 f'AgentController {self.id} initializing history from event {self.state.start_id}'
             )

From 63284d39f88bffb0e31c995652505b50a0756fab Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Sun, 27 Oct 2024 09:09:16 +0100
Subject: [PATCH 88/89] poetry lock

---
 poetry.lock    | 324 ++++++++++++++++++++++++-------------------------
 pyproject.toml |   2 +
 2 files changed, 163 insertions(+), 163 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index cb06e7e46151..a424c3a39af1 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3924,72 +3924,88 @@ tokenizers = "*"
 extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"]
 proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"]
 
+[[package]]
+name = "llama-cloud"
+version = "0.1.4"
+description = ""
+optional = false
+python-versions = "<4,>=3.8"
+files = [
+    {file = "llama_cloud-0.1.4-py3-none-any.whl", hash = "sha256:cfca6c4e0a87468b922d732f0f313a2ecd3a8e0bf74382ee80829ce49dcbc5e0"},
+    {file = "llama_cloud-0.1.4.tar.gz", hash = "sha256:6f0155979bd96160951cb812c48836f1face037bc79ccfd8d185b18ef4c9faf8"},
+]
+
+[package.dependencies]
+httpx = ">=0.20.0"
+pydantic = ">=1.10"
+
 [[package]]
 name = "llama-index"
-version = "0.10.45.post1"
+version = "0.11.20"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index-0.10.45.post1-py3-none-any.whl", hash = "sha256:11ff93431a81f1eae5bb73092d194cfb66a36ea90f272ea145f20e6e4324c71c"},
-    {file = "llama_index-0.10.45.post1.tar.gz", hash = "sha256:0bd3dcdbbfa468c408ad2f9e839b60562367ec6563c13c9bddcd108309881447"},
+    {file = "llama_index-0.11.20-py3-none-any.whl", hash = "sha256:fc9e5e47e6da3610bc3b788d208bb782c03a342fd71e3b22b37abc83ecebe46e"},
+    {file = "llama_index-0.11.20.tar.gz", hash = "sha256:5e8e3fcb5af5b4e4525498b075ff0a54160b00bf0fc0b83801fc7faf1c8a8c1d"},
 ]
 
 [package.dependencies]
-llama-index-agent-openai = ">=0.1.4,<0.3.0"
-llama-index-cli = ">=0.1.2,<0.2.0"
-llama-index-core = "0.10.45"
-llama-index-embeddings-openai = ">=0.1.5,<0.2.0"
-llama-index-indices-managed-llama-cloud = ">=0.1.2,<0.2.0"
+llama-index-agent-openai = ">=0.3.4,<0.4.0"
+llama-index-cli = ">=0.3.1,<0.4.0"
+llama-index-core = ">=0.11.20,<0.12.0"
+llama-index-embeddings-openai = ">=0.2.4,<0.3.0"
+llama-index-indices-managed-llama-cloud = ">=0.3.0"
 llama-index-legacy = ">=0.9.48,<0.10.0"
-llama-index-llms-openai = ">=0.1.13,<0.2.0"
-llama-index-multi-modal-llms-openai = ">=0.1.3,<0.2.0"
-llama-index-program-openai = ">=0.1.3,<0.2.0"
-llama-index-question-gen-openai = ">=0.1.2,<0.2.0"
-llama-index-readers-file = ">=0.1.4,<0.2.0"
-llama-index-readers-llama-parse = ">=0.1.2,<0.2.0"
+llama-index-llms-openai = ">=0.2.10,<0.3.0"
+llama-index-multi-modal-llms-openai = ">=0.2.0,<0.3.0"
+llama-index-program-openai = ">=0.2.0,<0.3.0"
+llama-index-question-gen-openai = ">=0.2.0,<0.3.0"
+llama-index-readers-file = ">=0.2.0,<0.3.0"
+llama-index-readers-llama-parse = ">=0.3.0"
+nltk = ">3.8.1"
 
 [[package]]
 name = "llama-index-agent-openai"
-version = "0.2.9"
+version = "0.3.4"
 description = "llama-index agent openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_agent_openai-0.2.9-py3-none-any.whl", hash = "sha256:d7f0fd4c87124781acd783be603871f8808b1a3969e876a9c96e2ed0844d46ac"},
-    {file = "llama_index_agent_openai-0.2.9.tar.gz", hash = "sha256:debe86da6d9d983db32b445ddca7c798ac140fe59573bafded73595b3995f3d5"},
+    {file = "llama_index_agent_openai-0.3.4-py3-none-any.whl", hash = "sha256:3720ce9bb12417a99a3fe84e52cce23e762b13f88a2dfc4292c76f4df9b26b4a"},
+    {file = "llama_index_agent_openai-0.3.4.tar.gz", hash = "sha256:80e3408d97121bebca3fa3ffd14b51285870c1c3c73d4ee04d3d18cfe6040466"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.41,<0.11.0"
-llama-index-llms-openai = ">=0.1.5,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.9,<0.3.0"
 openai = ">=1.14.0"
 
 [[package]]
 name = "llama-index-cli"
-version = "0.1.13"
+version = "0.3.1"
 description = "llama-index cli"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_cli-0.1.13-py3-none-any.whl", hash = "sha256:5e05bc3ce55ee1bf6e5af7e87631a71d6b6cf8fc2af10cd3947b09b1bac6788d"},
-    {file = "llama_index_cli-0.1.13.tar.gz", hash = "sha256:86147ded4439fbab1d6c7c0d72e8f231d2935da9fdf5c9d3f0dde4f35d44aa59"},
+    {file = "llama_index_cli-0.3.1-py3-none-any.whl", hash = "sha256:2111fbb6973f5b1eabce0d6cca3986499f0f2f625b13d7f48269a49c64c027d4"},
+    {file = "llama_index_cli-0.3.1.tar.gz", hash = "sha256:1890dd687cf440f3651365a549e303363162c167b8efbd87a3aa10058d6d5c77"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.11.post1,<0.11.0"
-llama-index-embeddings-openai = ">=0.1.1,<0.2.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-embeddings-openai = ">=0.2.0,<0.3.0"
+llama-index-llms-openai = ">=0.2.0,<0.3.0"
 
 [[package]]
 name = "llama-index-core"
-version = "0.10.45"
+version = "0.11.20"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_core-0.10.45-py3-none-any.whl", hash = "sha256:8c800c7221322b8e1cbbbc13325039b5fe3575d4b0e0be14ac9a8f1e5d14fee3"},
-    {file = "llama_index_core-0.10.45.tar.gz", hash = "sha256:f32d0448e7193ff45c8e84abd49493be030998fc8f1a0cab069387deef3e577c"},
+    {file = "llama_index_core-0.11.20-py3-none-any.whl", hash = "sha256:e84daf45e90e4b5d9e135baf40ab9853a1c3169a1076af6d58739d098e70adb1"},
+    {file = "llama_index_core-0.11.20.tar.gz", hash = "sha256:6b5eaaf4be5030808b9ba953e8f7aead7ba495b8e72ba0a81dfc7dda96be416f"},
 ]
 
 [package.dependencies]
@@ -3999,18 +4015,16 @@ deprecated = ">=1.2.9.3"
 dirtyjson = ">=1.0.8,<2.0.0"
 fsspec = ">=2023.5.0"
 httpx = "*"
-llamaindex-py-client = ">=0.1.18,<0.2.0"
 nest-asyncio = ">=1.5.8,<2.0.0"
 networkx = ">=3.0"
-nltk = ">=3.8.1,<4.0.0"
-numpy = "*"
-openai = ">=1.1.0"
-pandas = "*"
+nltk = ">3.8.1"
+numpy = "<2.0.0"
 pillow = ">=9.0.0"
+pydantic = ">=2.7.0,<3.0.0"
 PyYAML = ">=6.0.1"
 requests = ">=2.31.0"
 SQLAlchemy = {version = ">=1.4.49", extras = ["asyncio"]}
-tenacity = ">=8.2.0,<9.0.0"
+tenacity = ">=8.2.0,<8.4.0 || >8.4.0,<9.0.0"
 tiktoken = ">=0.3.3"
 tqdm = ">=4.66.1,<5.0.0"
 typing-extensions = ">=4.5.0"
@@ -4019,60 +4033,60 @@ wrapt = "*"
 
 [[package]]
 name = "llama-index-embeddings-azure-openai"
-version = "0.1.11"
+version = "0.2.5"
 description = "llama-index embeddings azure openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_azure_openai-0.1.11-py3-none-any.whl", hash = "sha256:afefe55ee69934528c569ddf71fb1e9ddf2992b6c344c4c9d72a03fa8c33cf40"},
-    {file = "llama_index_embeddings_azure_openai-0.1.11.tar.gz", hash = "sha256:40a4fd9a31ba74f071739d6c8405187b66e7f584ae2f64a30316c6c7b6a25325"},
+    {file = "llama_index_embeddings_azure_openai-0.2.5-py3-none-any.whl", hash = "sha256:e3384002618d027c3d188134e7fe09ffb16029202db6b3e6955a9f1f6d591a3e"},
+    {file = "llama_index_embeddings_azure_openai-0.2.5.tar.gz", hash = "sha256:d8b2e3134c2b3510214f2260e6c17be18396d0c765f3edd6c3ffe6109528aed0"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.11.post1,<0.11.0"
-llama-index-embeddings-openai = ">=0.1.3,<0.2.0"
-llama-index-llms-azure-openai = ">=0.1.3,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-embeddings-openai = ">=0.2.3,<0.3.0"
+llama-index-llms-azure-openai = ">=0.2.0,<0.3.0"
 
 [[package]]
 name = "llama-index-embeddings-huggingface"
-version = "0.2.3"
+version = "0.3.1"
 description = "llama-index embeddings huggingface integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_huggingface-0.2.3-py3-none-any.whl", hash = "sha256:7dee842f938d5fa8992e7803eda8a14f6bea72ec0bc0a546f4c6aa455166cde5"},
-    {file = "llama_index_embeddings_huggingface-0.2.3.tar.gz", hash = "sha256:6fe54366eeb87ff81b50624d6b8ccca4230f8035fcc19a0b0b3f31c6d8a82f8b"},
+    {file = "llama_index_embeddings_huggingface-0.3.1-py3-none-any.whl", hash = "sha256:71708240b1aec183c80f20d531b39a75d0cce774586e11bb0798f3ecb270749c"},
+    {file = "llama_index_embeddings_huggingface-0.3.1.tar.gz", hash = "sha256:7aef6324a19576e6b95bfe927c3bd4fc1c5725edce9f26b4e5d2eefa27c02fdb"},
 ]
 
 [package.dependencies]
 huggingface-hub = {version = ">=0.19.0", extras = ["inference"]}
-llama-index-core = ">=0.10.1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
 sentence-transformers = ">=2.6.1"
 
 [[package]]
 name = "llama-index-embeddings-ollama"
-version = "0.2.0"
+version = "0.3.1"
 description = "llama-index embeddings ollama integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_ollama-0.2.0-py3-none-any.whl", hash = "sha256:372b059321386bd9bbf4f619ad33dd551adb9ee92eeeb0c664d3466f7c212e2e"},
-    {file = "llama_index_embeddings_ollama-0.2.0.tar.gz", hash = "sha256:5673c740e1dd146e17d1c0401c1e179c0d559caf0967f4a4721b89fbb6822ad8"},
+    {file = "llama_index_embeddings_ollama-0.3.1-py3-none-any.whl", hash = "sha256:b869ce7e9f8e67aa7d81336e90d25d3ea1fca91c68dce8922b2d4b9c06c5acef"},
+    {file = "llama_index_embeddings_ollama-0.3.1.tar.gz", hash = "sha256:5a3e75fa14be7e2b1a82937416c880204dc96e1b1d2626dc5bde93f021e7b540"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
 ollama = ">=0.3.1,<0.4.0"
 
 [[package]]
 name = "llama-index-embeddings-openai"
-version = "0.1.11"
+version = "0.2.5"
 description = "llama-index embeddings openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_openai-0.1.11-py3-none-any.whl", hash = "sha256:e20806fc4baff6b8f5274decf2c1ca7c5c737648e01865475ffada164e32e173"},
-    {file = "llama_index_embeddings_openai-0.1.11.tar.gz", hash = "sha256:6025e229e375201788a9b14d6ebe470329907576cba5f6b7b832c3d68f39db30"},
+    {file = "llama_index_embeddings_openai-0.2.5-py3-none-any.whl", hash = "sha256:823c8311e556349ba19dda408a64a314fa3dafe0e5759709c54d33a0269aa6ba"},
+    {file = "llama_index_embeddings_openai-0.2.5.tar.gz", hash = "sha256:0047dd71d747068645ed728c29312aa91b65bbe4c6142180034c64dfc5c6f6e8"},
 ]
 
 [package.dependencies]
@@ -4096,18 +4110,18 @@ voyageai = ">=0.2.1,<0.3.0"
 
 [[package]]
 name = "llama-index-indices-managed-llama-cloud"
-version = "0.1.6"
+version = "0.4.0"
 description = "llama-index indices llama-cloud integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_indices_managed_llama_cloud-0.1.6-py3-none-any.whl", hash = "sha256:cba33e1a3677b2a2ae7f239119acbf6dc3818f105edc92315729842b56fbc949"},
-    {file = "llama_index_indices_managed_llama_cloud-0.1.6.tar.gz", hash = "sha256:74b3b0e9ebf9d348d3054f9fc0c657031acceb9351c31116ad8d5a7ae4729f5c"},
+    {file = "llama_index_indices_managed_llama_cloud-0.4.0-py3-none-any.whl", hash = "sha256:c2c54821f1bf17a7810e6c013fbe7ddfef4154b7e5b100f7bf8673098f8004e4"},
+    {file = "llama_index_indices_managed_llama_cloud-0.4.0.tar.gz", hash = "sha256:fbebff7876a219b6ab96892ae7c432a9299195fab8f67d4a4a0ebf6da210b242"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.0,<0.11.0"
-llamaindex-py-client = ">=0.1.19,<0.2.0"
+llama-cloud = ">=0.0.11"
+llama-index-core = ">=0.11.13.post1,<0.12.0"
 
 [[package]]
 name = "llama-index-legacy"
@@ -4150,96 +4164,98 @@ query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "l
 
 [[package]]
 name = "llama-index-llms-azure-openai"
-version = "0.1.10"
+version = "0.2.2"
 description = "llama-index llms azure openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_llms_azure_openai-0.1.10-py3-none-any.whl", hash = "sha256:8666b095118ed9c5087dc2d91a83a826d4549ea4d442b9eef363e243207d3539"},
-    {file = "llama_index_llms_azure_openai-0.1.10.tar.gz", hash = "sha256:f1624c9bd7bf4458e98cca6f3b805eec06105fa951536ff24b098d913d2368bd"},
+    {file = "llama_index_llms_azure_openai-0.2.2-py3-none-any.whl", hash = "sha256:c8a7d04a111ceff0b4335dc9273fbdb37fdb5095b6234190ca727736f6466d7b"},
+    {file = "llama_index_llms_azure_openai-0.2.2.tar.gz", hash = "sha256:717bc3bf858e800d66e4f2ddec85a2e7dd503006d55981053d08e98771ec3abc"},
 ]
 
 [package.dependencies]
 azure-identity = ">=1.15.0,<2.0.0"
 httpx = "*"
-llama-index-core = ">=0.10.11.post1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.1,<0.3.0"
 
 [[package]]
 name = "llama-index-llms-openai"
-version = "0.1.26"
+version = "0.2.16"
 description = "llama-index llms openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_llms_openai-0.1.26-py3-none-any.whl", hash = "sha256:1ad8e4eb02f9410c2091749d4d9aa9db4452646b595eb5eb937edbc496fb65fe"},
-    {file = "llama_index_llms_openai-0.1.26.tar.gz", hash = "sha256:08a408cd53af4cd4623dd5807be4cbbd5e5b3ca01272128cd678d667343e4d5d"},
+    {file = "llama_index_llms_openai-0.2.16-py3-none-any.whl", hash = "sha256:413466acbb894bd81f8dab2037f595e92392d869eec6d8274a16d43123cac8b6"},
+    {file = "llama_index_llms_openai-0.2.16.tar.gz", hash = "sha256:7c666dd27056c278a079ff45d53f1fbfc8ed363764aa7baeee2e03df47f9072a"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.24,<0.11.0"
+llama-index-core = ">=0.11.7,<0.12.0"
+openai = ">=1.40.0,<2.0.0"
 
 [[package]]
 name = "llama-index-multi-modal-llms-openai"
-version = "0.1.9"
+version = "0.2.3"
 description = "llama-index multi-modal-llms openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_multi_modal_llms_openai-0.1.9-py3-none-any.whl", hash = "sha256:614f40427a4671e72742780be8fda77297dbf2942519bffcb2c9de8696a9edff"},
-    {file = "llama_index_multi_modal_llms_openai-0.1.9.tar.gz", hash = "sha256:dbacf44d5c2cca07ca424eacd1337583002d70387a3c1868cf8ae743b1dbec4a"},
+    {file = "llama_index_multi_modal_llms_openai-0.2.3-py3-none-any.whl", hash = "sha256:96b36beb2c3fca4faca80c59ecf7c6c6629ecdb96c288ef89777b592ec43f872"},
+    {file = "llama_index_multi_modal_llms_openai-0.2.3.tar.gz", hash = "sha256:8eb9b7f1ff3956ef0979e21bc83e6a885e40987b7199f195e46525d06e3ae402"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.11,<0.3.0"
 
 [[package]]
 name = "llama-index-program-openai"
-version = "0.1.6"
+version = "0.2.0"
 description = "llama-index program openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_program_openai-0.1.6-py3-none-any.whl", hash = "sha256:4660b338503537c5edca1e0dab606af6ce372b4f1b597e2833c6b602447c5d8d"},
-    {file = "llama_index_program_openai-0.1.6.tar.gz", hash = "sha256:c6a4980c5ea826088b28b4dee3367edb20221e6d05eb0e05019049190131d772"},
+    {file = "llama_index_program_openai-0.2.0-py3-none-any.whl", hash = "sha256:2e10d0c8f21af2e9443eb79e81bb31e7b73835b7c7bbd7ddf20e0a9c846cd368"},
+    {file = "llama_index_program_openai-0.2.0.tar.gz", hash = "sha256:4139935541c011257fbfeb9662b3bf1237b729ef4b1c8f4ddf5b6789d2374ac4"},
 ]
 
 [package.dependencies]
-llama-index-agent-openai = ">=0.1.1,<0.3.0"
-llama-index-core = ">=0.10.1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-agent-openai = ">=0.3.0,<0.4.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.0,<0.3.0"
 
 [[package]]
 name = "llama-index-question-gen-openai"
-version = "0.1.3"
+version = "0.2.0"
 description = "llama-index question_gen openai integration"
 optional = false
-python-versions = ">=3.8.1,<4.0"
+python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_question_gen_openai-0.1.3-py3-none-any.whl", hash = "sha256:1f83b49e8b2e665030d1ec8c54687d6985d9fa8426147b64e46628a9e489b302"},
-    {file = "llama_index_question_gen_openai-0.1.3.tar.gz", hash = "sha256:4486198117a45457d2e036ae60b93af58052893cc7d78fa9b6f47dd47b81e2e1"},
+    {file = "llama_index_question_gen_openai-0.2.0-py3-none-any.whl", hash = "sha256:a16e68fc5434e9a793f1dfd0cc0354ee19afd167f1d499403b0085b11c5406c0"},
+    {file = "llama_index_question_gen_openai-0.2.0.tar.gz", hash = "sha256:3dde1cecbd651000639c20031d7ea23334276aabb181cac40ff424f35e10465e"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
-llama-index-program-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.0,<0.3.0"
+llama-index-program-openai = ">=0.2.0,<0.3.0"
 
 [[package]]
 name = "llama-index-readers-file"
-version = "0.1.33"
+version = "0.2.2"
 description = "llama-index readers file integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_readers_file-0.1.33-py3-none-any.whl", hash = "sha256:c968308497c1355acf61fe7e3f05ad8e308bb6487dddd3bd2a60e102225d0b38"},
-    {file = "llama_index_readers_file-0.1.33.tar.gz", hash = "sha256:247a4d5bfabc7d1022027adf58064bc16c224d006db142abb0d182ac5574a887"},
+    {file = "llama_index_readers_file-0.2.2-py3-none-any.whl", hash = "sha256:ffec878771c1e7575afb742887561059bcca77b97a81c1c1be310ebb73f10f46"},
+    {file = "llama_index_readers_file-0.2.2.tar.gz", hash = "sha256:48459f90960b863737147b66ed83afec9ce8984f8eda2561b6d2500214365db2"},
 ]
 
 [package.dependencies]
 beautifulsoup4 = ">=4.12.3,<5.0.0"
-llama-index-core = ">=0.10.37.post1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+pandas = "*"
 pypdf = ">=4.0.1,<5.0.0"
 striprtf = ">=0.0.26,<0.0.27"
 
@@ -4248,62 +4264,48 @@ pymupdf = ["pymupdf (>=1.23.21,<2.0.0)"]
 
 [[package]]
 name = "llama-index-readers-llama-parse"
-version = "0.1.6"
+version = "0.3.0"
 description = "llama-index readers llama-parse integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_readers_llama_parse-0.1.6-py3-none-any.whl", hash = "sha256:71d445a2357ce4c632e0fada7c913ac62790e77c062f12d916dd86378380ff1f"},
-    {file = "llama_index_readers_llama_parse-0.1.6.tar.gz", hash = "sha256:04f2dcfbb0fb87ce70890f5a2f4f89941d79be6a818b43738f053560e4b451cf"},
+    {file = "llama_index_readers_llama_parse-0.3.0-py3-none-any.whl", hash = "sha256:1973cc710dbd5e110c7500c9983ecb45787ad1ff92e6b2113f94a57cf48f3038"},
+    {file = "llama_index_readers_llama_parse-0.3.0.tar.gz", hash = "sha256:a5feada0895714dcc41d65dd512c1c38cf70d8ae19947cff82b80d58e6aa367e"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.7,<0.11.0"
-llama-parse = ">=0.4.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-parse = ">=0.5.0"
 
 [[package]]
 name = "llama-index-vector-stores-chroma"
-version = "0.1.10"
+version = "0.2.1"
 description = "llama-index vector_stores chroma integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_vector_stores_chroma-0.1.10-py3-none-any.whl", hash = "sha256:18859272ec8d3ed20bae7e4a9bc18feb4233e8be2a725d33626f283ac41d1475"},
-    {file = "llama_index_vector_stores_chroma-0.1.10.tar.gz", hash = "sha256:97971f7b36461ef37be023b9ceb5531396cc48360d0bdbda51cce1290301cc47"},
+    {file = "llama_index_vector_stores_chroma-0.2.1-py3-none-any.whl", hash = "sha256:6dcca6450d298d3033a47b2131d0618ad48c172a3541eb6c790a61bf94136fed"},
+    {file = "llama_index_vector_stores_chroma-0.2.1.tar.gz", hash = "sha256:def15a76354bb4658b16badb92537a72e766273d5e566b0575461005da53847f"},
 ]
 
 [package.dependencies]
-chromadb = ">=0.4.0,<0.6.0"
-llama-index-core = ">=0.10.1,<0.11.0"
+chromadb = ">=0.4.0,<0.5.4 || >0.5.4,<0.5.7 || >0.5.7,<0.5.9 || >0.5.9,<0.5.10 || >0.5.10,<0.5.11 || >0.5.11,<0.5.12 || >0.5.12,<0.6.0"
+llama-index-core = ">=0.11.0,<0.12.0"
 
 [[package]]
 name = "llama-parse"
-version = "0.4.9"
+version = "0.5.12"
 description = "Parse files into RAG-Optimized formats."
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_parse-0.4.9-py3-none-any.whl", hash = "sha256:71974a57a73d642608cc406942bee4e7fc1a713fa410f51df67da509479ba544"},
-    {file = "llama_parse-0.4.9.tar.gz", hash = "sha256:657f8fa5f7d399f14c0454fc05cae6034da0373f191df6cfca17a1b4a704ef87"},
-]
-
-[package.dependencies]
-llama-index-core = ">=0.10.29"
-
-[[package]]
-name = "llamaindex-py-client"
-version = "0.1.19"
-description = ""
-optional = false
-python-versions = "<4,>=3.8"
-files = [
-    {file = "llamaindex_py_client-0.1.19-py3-none-any.whl", hash = "sha256:fd9416fd78b97209bf323bc3c7fab314499778563e7274f10853ad560563d10e"},
-    {file = "llamaindex_py_client-0.1.19.tar.gz", hash = "sha256:73f74792bb8c092bae6dc626627a09ac13a099fa8d10f8fcc83e17a2b332cca7"},
+    {file = "llama_parse-0.5.12-py3-none-any.whl", hash = "sha256:6011feb49da5db4bcbeea1cc6688b6ff24b483877fda80b03fe59239cd08b907"},
+    {file = "llama_parse-0.5.12.tar.gz", hash = "sha256:e241606cf3574425df76c0f5d01a31a95c792c6fbef80aaf72f8ed6448bd1715"},
 ]
 
 [package.dependencies]
-httpx = ">=0.20.0"
-pydantic = ">=1.10"
+click = ">=8.1.7,<9.0.0"
+llama-index-core = ">=0.11.0"
 
 [[package]]
 name = "lxml"
@@ -5314,56 +5316,47 @@ test = ["pytest", "pytest-console-scripts", "pytest-jupyter", "pytest-tornasync"
 
 [[package]]
 name = "numpy"
-version = "2.0.2"
+version = "1.26.4"
 description = "Fundamental package for array computing in Python"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"},
-    {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"},
-    {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66"},
-    {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b"},
-    {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd"},
-    {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318"},
-    {file = "numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8"},
-    {file = "numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326"},
-    {file = "numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97"},
-    {file = "numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a"},
-    {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669"},
-    {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951"},
-    {file = "numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9"},
-    {file = "numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15"},
-    {file = "numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4"},
-    {file = "numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c"},
-    {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692"},
-    {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a"},
-    {file = "numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c"},
-    {file = "numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded"},
-    {file = "numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5"},
-    {file = "numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729"},
-    {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1"},
-    {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"},
-    {file = "numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d"},
-    {file = "numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d"},
-    {file = "numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa"},
-    {file = "numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385"},
-    {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"},
+    {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"},
+    {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"},
+    {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"},
+    {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"},
+    {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"},
+    {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"},
+    {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"},
+    {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"},
+    {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"},
+    {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"},
+    {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
 ]
 
 [[package]]
@@ -7999,6 +7992,11 @@ files = [
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
     {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
     {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@@ -10124,4 +10122,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "aeb09e429a789c3f8ced605e7e1a5932fd6cce7f7f4ce30a960da77fba18b9a3"
+content-hash = "62de6b5fb79f97f563a3ff6a4cf225cc639954745b63ddea8921b2eb9fb0e155"
diff --git a/pyproject.toml b/pyproject.toml
index 500bfa8cba57..ad4e08ea2c8d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -90,6 +90,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]
 
+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -120,6 +121,7 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"
 
+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"

From 4d05ab1059911c0ffe5ade6c716acd43fc412f93 Mon Sep 17 00:00:00 2001
From: Engel Nyst <enyst@users.noreply.github.com>
Date: Thu, 31 Oct 2024 12:11:39 +0100
Subject: [PATCH 89/89] update summarize prompt

---
 .../prompts/summarize_prompt.j2               | 93 ++++++++++++-------
 1 file changed, 59 insertions(+), 34 deletions(-)

diff --git a/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
index 04e979dbdc59..1f2ace4c34be 100644
--- a/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
+++ b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
@@ -1,46 +1,71 @@
 {% set SUMMARIZE_PROMPT %}
-You are a new generation AI, an expert in code summarization. You are designed to behave trustworthily, and with extreme diligence.
-
-Memory Summarization:
-Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
-This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
-Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
-Your ability to summarize and recall your own long-term memory is a key part of what makes you a very helpful model.
-
-IMPORTANT:
-Your memory is dangerously close to overflowing. So your task is to summarize your conversation history effectively.
-YOU MUST DO IT RIGHT NOW OR YOU WILL LOSE INFORMATION PERMANENTLY AND FAIL YOUR TASK.
-
-The conversation below is a from a fixed context window and may not be complete. Keep your summary concise and relevant. Include all the critical information that YOU NEED TO REMEMBER in the summary.
-Only output the summary, do NOT include anything else in your output.
-
-Messages sent by the AI are marked with the 'assistant' role.
-The AI 'assistant' can also make tool calls, whose outputs can be seen in messages with the 'tool' role.
-Messages the user sends are in the 'user' role.
-The 'user' role is also used for environment observations, which are important for your task.
-Summarize what happened in the conversation from the perspective of the AI (use the first person).
-
-Create a JSON response with:
-    - "action": "summarize"
-    - args:
-      - "summarized_actions": Precise sentences summarizing the provided actions, written in the first person.
-      - "summarized_observations": Precise sentences summarizing the provided observations, written in the third person.
-
-Example:
+You are an expert memory condenser for an AI agent (that agent is "me" from my perspective). Your sole purpose is to help me maintain critical information while managing my limited context window.
+
+CRITICAL CONTEXT:
+- My context window is dangerously close to overflowing
+- Without your help summarizing, I will lose important information permanently
+- I need you to preserve the most relevant information for my current task
+- Your summary will become part of my memory, influencing my future actions
+
+YOU MUST DO IT RIGHT NOW OR I WILL LOSE INFORMATION PERMANENTLY AND FAIL THE TASK.
+
+SUMMARIZATION PRINCIPLES:
+1. Preserve task-critical information:
+   - Current objective and progress
+   - Important user preferences or constraints
+   - Key findings or decisions made
+   - Unfinished tasks or promises
+
+2. Maintain continuity:
+   - Keep information about ongoing processes
+   - Remember important variables or state
+   - Preserve error contexts if any task failed
+
+3. Drop unnecessary details:
+   - Completed subtasks that don't affect current state
+   - Intermediate calculations or debug outputs
+   - Pleasantries and general conversation
+   - Redundant information
+
+FORMAT REQUIREMENTS:
+Return a JSON response that I can easily process:
+{
+    "action": "summarize",
+    "args": {
+        "summarized_actions": "FIRST PERSON perspective of what I did and learned. Focus on decisions and knowledge gained.",
+        "summarized_observations": "THIRD PERSON factual record of system outputs and user responses"
+    }
+}
+
+EXAMPLE GOOD SUMMARY:
+{
+    "action": "summarize",
+    "args": {
+        "summarized_actions": "I identified the user needs a script to process CSV files. I installed pandas and numpy. I encountered a UnicodeDecodeError with the first approach but resolved it by using utf-8 encoding.",
+        "summarized_observations": "The system successfully installed required packages. User provided a sample CSV with 1000 rows. First attempt to read file failed due to encoding issues."
+    }
+}
+
+EXAMPLE BAD SUMMARY (DO NOT DO THIS):
 {
     "action": "summarize",
     "args": {
-        "summarized_actions": "I located the UML specification PDF in the ./local/ directory, parsed its content, and searched it for information about sequence diagrams.",
-        "summarized_observations": "The agent encountered a UnicodeDecodeError when initially searching the PDF text, but was able to resolve this by installing the PyPDF2 library and successfully extracting relevant information about sequence diagrams."
+        "summarized_actions": "The AI assistant helped with CSV processing and fixed some errors",
+        "summarized_observations": "Things were installed and a file was processed"
     }
 }
-Include in the summary any relevant keywords, because you will need to look it up.
 
-YOU MUST RESPOND WITH JSON.
+IMPORTANT REMINDERS:
+- Write "summarized_actions" in FIRST PERSON (I/me) - this is MY memory
+- Write "summarized_observations" in THIRD PERSON - these are external events
+- Include specific technical details that might be needed later
+- Stay focused on information relevant to completing the current task
+- Preserve any error contexts that might affect future actions
+- Keep numbers, variable names, and technical parameters exactly as they appeared
 
-NOW, the CONVERSATION HISTORY:
+Now, carefully condense this conversation history while maintaining critical context:
 -------------------------------------
 {{ conversation_history }}
 -------------------------------------
 {% endset %}
-{{ SUMMARIZE_PROMPT }}
+{{ SUMMARIZE_PROMPT }}
\ No newline at end of file