diff --git a/config.template.toml b/config.template.toml
index 060ec11ab1eb..a26154d7b0ef 100644
--- a/config.template.toml
+++ b/config.template.toml
@@ -171,6 +171,24 @@ model = "gpt-4o"
 # If model is vision capable, this option allows to disable image processing (useful for cost reduction).
 #disable_vision = true
 
+# maximum number of messages in a conversation, after which they are truncated or summarized
+# max_conversation_window = 10
+
+# number of results when recalling message history
+# conversation_top_k = 5
+
+# fraction of the conversation window to summarize
+# message_summary_trunc_tokens_fraction = 0.75
+
+# summary LLM
+[llm.summary]
+model = "deepseek"
+
+# default LLM
+[llm.default]
+model = "claude"
+
+
 [llm.gpt4o-mini]
 api_key = "your-api-key"
 model = "gpt-4o"
diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
index 2c896939a751..fb5df3b44f01 100644
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -8,6 +8,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -34,7 +35,7 @@ def codeact_user_response_eda(state: State) -> str:
 
     # retrieve the latest model message from history
     if state.history:
-        model_guess = state.history.get_last_agent_message()
+        model_guess = state.get_last_agent_message()
 
     assert game is not None, 'Game is not initialized.'
     msg = game.generate_user_response(model_guess)
@@ -139,7 +140,7 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')
 
-    final_message = state.history.get_last_agent_message()
+    final_message = state.get_last_agent_message()
 
     logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
     test_result = game.reward()
@@ -148,7 +149,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py
index d6fcc62e0798..acdf60fe4850 100644
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -16,6 +16,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -242,7 +243,7 @@ def process_instance(
         raw_ans = ''
 
         # retrieve the last agent message or thought
-        for event in state.history.get_events(reverse=True):
+        for event in reversed(state.history):
             if event.source == 'agent':
                 if isinstance(event, AgentFinishAction):
                     raw_ans = event.thought
@@ -271,7 +272,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     metrics = state.metrics.get() if state.metrics else None
 
diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py
index fa1bb9534a83..cddc4bfe7db9 100644
--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -15,6 +15,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -250,7 +251,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
     metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py
index 4535ccba4e4e..5ab4b3b88313 100644
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -13,6 +13,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -299,7 +300,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     test_result['generated'] = test_result['metadata']['1_copy_change_code']
 
diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py
index adb498cd2eb1..248dbb66181c 100644
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -16,6 +16,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -46,7 +47,7 @@ def codeact_user_response(state: State) -> str:
         # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
         user_msgs = [
             event
-            for event in state.history.get_events()
+            for event in state.history
             if isinstance(event, MessageAction) and event.source == 'user'
         ]
         if len(user_msgs) > 2:
@@ -431,7 +432,7 @@ def execute_sql(db_path, sql):
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/browsing_delegation/run_infer.py b/evaluation/browsing_delegation/run_infer.py
index c9fe2ebd18bc..5c1ab8c062e3 100644
--- a/evaluation/browsing_delegation/run_infer.py
+++ b/evaluation/browsing_delegation/run_infer.py
@@ -9,6 +9,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -89,7 +90,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # find the last delegate action
     last_delegate_action = None
diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py
index c02cd0aee737..1fa0c00e6d6a 100644
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -12,6 +12,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -166,7 +167,7 @@ def process_instance(
 
     model_answer_raw = ''
     # get the last message or thought from the agent
-    for event in state.history.get_events(reverse=True):
+    for event in reversed(state.history):
         if event.source == 'agent':
             if isinstance(event, AgentFinishAction):
                 model_answer_raw = event.thought
@@ -203,7 +204,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
index 873cb7f89694..e437f2b6075a 100644
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -10,6 +10,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -101,7 +102,7 @@ def process_instance(
         raise ValueError('State should not be None.')
 
     # retrieve the last message from the agent
-    model_answer_raw = state.history.get_last_agent_message()
+    model_answer_raw = state.get_last_agent_message()
 
     # attempt to parse model_answer
     ast_eval_fn = instance['ast_eval']
@@ -114,7 +115,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     output = EvalOutput(
         instance_id=instance_id,
diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py
index 8fd4034c9d5e..58db2e404fc8 100644
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -28,6 +28,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -244,7 +245,7 @@ def process_instance(
         'C': False,
         'D': False,
     }
-    for event in state.history.get_events(reverse=True):
+    for event in reversed(state.history):
         if (
             isinstance(event, AgentFinishAction)
             and event.source != 'user'
@@ -300,7 +301,7 @@ def process_instance(
         instance_id=str(instance.instance_id),
         instruction=instruction,
         metadata=metadata,
-        history=state.history.compatibility_for_eval_history_pairs(),
+        history=compatibility_for_eval_history_pairs(state.history),
         metrics=metrics,
         error=state.last_error if state and state.last_error else None,
         test_result={
diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py
index 25fee65561fc..2aa184758b33 100644
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -21,6 +21,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -255,7 +256,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index a530041f92f7..5f488aa6c5b9 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -122,7 +122,7 @@ def process_instance(
     # # result evaluation
     # # =============================================
 
-    histories = state.history.get_events()
+    histories = state.history
     test_result: TestResult = test_class.verify_result(runtime, histories)
     metrics = state.metrics.get() if state.metrics else None
 
diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py
index 5b7d35f21130..116b438b3ee9 100644
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -8,6 +8,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -225,7 +226,7 @@ def process_instance(
         raise ValueError('State should not be None.')
 
     final_message = ''
-    for event in state.history.get_events(reverse=True):
+    for event in reversed(state.history):
         if isinstance(event, AgentFinishAction):
             final_message = event.thought
             break
@@ -247,7 +248,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
index 9c2aaf1e0963..865c10099443 100644
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -10,6 +10,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -152,7 +153,7 @@ def process_instance(
 
     # Instruction is the first message from the USER
     instruction = ''
-    for event in state.history.get_events():
+    for event in state.history:
         if isinstance(event, MessageAction):
             instruction = event.content
             break
@@ -164,7 +165,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py
index 8017b194d8d8..2165c3c03fe4 100644
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -13,6 +13,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -28,6 +29,7 @@
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import (
+    Action,
     CmdRunAction,
     MessageAction,
 )
@@ -45,7 +47,10 @@ def codeact_user_response_mint(state: State, task: Task, task_config: dict[str,
         task=task,
         task_config=task_config,
     )
-    last_action = state.history.get_last_action()
+    last_action = next(
+        (event for event in reversed(state.history) if isinstance(event, Action)),
+        None,
+    )
     result_state: TaskState = env.step(last_action.message or '')
 
     state.extra_data['task_state'] = result_state
@@ -202,7 +207,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py
index deec068f3392..2bb667e3c947 100644
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -24,6 +24,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -256,7 +257,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index 9ac1e0cf6639..1df29b4f11db 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -430,7 +430,8 @@ def process_instance(
     if state is None:
         raise ValueError('State should not be None.')
 
-    histories = [event_to_dict(event) for event in state.history.get_events()]
+    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
+    histories = [event_to_dict(event) for event in state.history]
     metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
diff --git a/evaluation/toolqa/run_infer.py b/evaluation/toolqa/run_infer.py
index 5c2c53422785..25633ce6ce23 100644
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -9,6 +9,7 @@
     EvalMetadata,
     EvalOutput,
     codeact_user_response,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -126,7 +127,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
         raise ValueError('State should not be None.')
 
     # retrieve the last message from the agent
-    model_answer_raw = state.history.get_last_agent_message()
+    model_answer_raw = state.get_last_agent_message()
 
     # attempt to parse model_answer
     correct = eval_answer(str(model_answer_raw), str(answer))
@@ -137,7 +138,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index d33658f339d2..394c2b6844fb 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -18,6 +18,9 @@
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import Action
 from openhands.events.action.message import MessageAction
+from openhands.events.event import Event
+from openhands.events.serialization.event import event_to_dict
+from openhands.events.utils import get_pairs_from_events
 
 
 class EvalMetadata(BaseModel):
@@ -120,7 +123,7 @@ def codeact_user_response(
         # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
         user_msgs = [
             event
-            for event in state.history.get_events()
+            for event in state.history
             if isinstance(event, MessageAction) and event.source == 'user'
         ]
         if len(user_msgs) >= 2:
@@ -411,3 +414,18 @@ def reset_logger_for_multiprocessing(
     )
     file_handler.setLevel(logging.INFO)
     logger.addHandler(file_handler)
+
+
+# history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
+# we rebuild the pairs here
+# for compatibility with the existing output format in evaluations
+# remove this when it's no longer necessary
+def compatibility_for_eval_history_pairs(
+    history: list[Event],
+) -> list[tuple[dict, dict]]:
+    history_pairs = []
+
+    for action, observation in get_pairs_from_events(history):
+        history_pairs.append((event_to_dict(action), event_to_dict(observation)))
+
+    return history_pairs
diff --git a/evaluation/webarena/run_infer.py b/evaluation/webarena/run_infer.py
index cfc2bdae493a..531f134fd988 100644
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -10,6 +10,7 @@
 from evaluation.utils.shared import (
     EvalMetadata,
     EvalOutput,
+    compatibility_for_eval_history_pairs,
     make_metadata,
     prepare_dataset,
     reset_logger_for_multiprocessing,
@@ -166,7 +167,7 @@ def process_instance(
 
     # Instruction is the first message from the USER
     instruction = ''
-    for event in state.history.get_events():
+    for event in state.history:
         if isinstance(event, MessageAction):
             instruction = event.content
             break
@@ -178,7 +179,7 @@ def process_instance(
     # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
     # for compatibility with the existing output format, we can remake the pairs here
     # remove when it becomes unnecessary
-    histories = state.history.compatibility_for_eval_history_pairs()
+    histories = compatibility_for_eval_history_pairs(state.history)
 
     # Save the output
     output = EvalOutput(
diff --git a/openhands/agenthub/__init__.py b/openhands/agenthub/__init__.py
index 0076976c27ed..370dcba4fab0 100644
--- a/openhands/agenthub/__init__.py
+++ b/openhands/agenthub/__init__.py
@@ -13,6 +13,7 @@
     codeact_swe_agent,
     delegator_agent,
     dummy_agent,
+    memcodeact_agent,
     planner_agent,
 )
 
@@ -23,6 +24,7 @@
     'delegator_agent',
     'dummy_agent',
     'browsing_agent',
+    'memcodeact_agent',
 ]
 
 for agent in all_microagents.values():
diff --git a/openhands/agenthub/browsing_agent/browsing_agent.py b/openhands/agenthub/browsing_agent/browsing_agent.py
index 0460506d04f3..822677bab526 100644
--- a/openhands/agenthub/browsing_agent/browsing_agent.py
+++ b/openhands/agenthub/browsing_agent/browsing_agent.py
@@ -150,13 +150,13 @@ def step(self, state: State) -> Action:
         last_obs = None
         last_action = None
 
-        if EVAL_MODE and len(state.history.get_events_as_list()) == 1:
+        if EVAL_MODE and len(state.history) == 1:
             # for webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
             # initialize and retrieve the first observation by issuing an noop OP
             # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
             return BrowseInteractiveAction(browser_actions='noop()')
 
-        for event in state.history.get_events():
+        for event in state.history:
             if isinstance(event, BrowseInteractiveAction):
                 prev_actions.append(event.browser_actions)
                 last_action = event
diff --git a/openhands/agenthub/codeact_agent/agent.yaml b/openhands/agenthub/codeact_agent/agent.yaml
new file mode 100644
index 000000000000..e6f6b538d43b
--- /dev/null
+++ b/openhands/agenthub/codeact_agent/agent.yaml
@@ -0,0 +1,33 @@
+name: CodeActAgent
+
+# custom templates directory
+# .j2 templates will be loaded from this directory if found, if not, the default will be used
+custom_templates_dir: "user_templates"
+
+# main templates
+template:
+  system_prompt: "system_prompt"  # path to the system template file
+  agent_skills: "agent_skills"  # path to the agent skills template file
+  examples: "examples"  # path to the examples template file
+  user_prompt: "user_prompt"  # path to the initial user prompt template file
+
+# agent-specific variables (can be accessed within templates)
+use_tools: false  # whether to use tool-based implementations
+# tools:  # list of available tools
+#  - name: "EditTool"
+#    description: "Edits a file."
+#    usage: "Use the following format: <file_edit> [file_path] [new_file_content] </file_edit>"
+# agent skills
+agent_skills:
+  available_skills:
+    - "file_ops:open_file"
+    - "file_ops:goto_line"
+    - "file_ops:scroll_down"
+    - "file_ops:scroll_up"
+    - "file_ops:search_dir"
+    - "file_ops:search_file"
+    - "file_ops:find_file"
+    - "file_reader:parse_pdf"
+    - "file_reader:parse_docx"
+    - "file_reader:parse_latex"
+    - "file_reader:parse_pptx"
diff --git a/openhands/agenthub/codeact_agent/agent_skills.j2 b/openhands/agenthub/codeact_agent/agent_skills.j2
new file mode 100644
index 000000000000..7f34adb69efc
--- /dev/null
+++ b/openhands/agenthub/codeact_agent/agent_skills.j2
@@ -0,0 +1,3 @@
+{% for skill_name in available_skills %}
+{{ get_skill_docstring(skill_name) }}
+{% endfor %}
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index c8342ca11f70..a4a79343648e 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -5,6 +5,7 @@
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
+from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
     Action,
@@ -91,7 +92,6 @@ def __init__(
 
         self.prompt_manager = PromptManager(
             prompt_dir=os.path.join(os.path.dirname(__file__)),
-            agent_skills_docs=AgentSkillsRequirement.documentation,
             micro_agent=self.micro_agent,
         )
 
@@ -180,7 +180,8 @@ def get_observation_message(self, obs: Observation) -> Message | None:
         else:
             # If an observation message is not returned, it will cause an error
             # when the LLM tries to return the next message
-            raise ValueError(f'Unknown observation type: {type(obs)}')
+            logger.warning(f'Unknown observation type: {type(obs)}')
+            return None
 
     def reset(self) -> None:
         """Resets the CodeAct Agent."""
@@ -201,8 +202,8 @@ def step(self, state: State) -> Action:
         - AgentFinishAction() - end the interaction
         """
         # if we're done, go back
-        latest_user_message = state.history.get_last_user_message()
-        if latest_user_message and latest_user_message.strip() == '/exit':
+        last_user_message = state.get_last_user_message()
+        if last_user_message and last_user_message.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM
@@ -243,7 +244,7 @@ def _get_messages(self, state: State) -> list[Message]:
             ),
         ]
 
-        for event in state.history.get_events():
+        for event in state.history:
             # create a regular message from an event
             if isinstance(event, Action):
                 message = self.get_action_message(event)
diff --git a/openhands/agenthub/codeact_agent/examples.j2 b/openhands/agenthub/codeact_agent/examples.j2
new file mode 100644
index 000000000000..c655922d65ca
--- /dev/null
+++ b/openhands/agenthub/codeact_agent/examples.j2
@@ -0,0 +1,224 @@
+{% set DEFAULT_EXAMPLE %}
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me first check the current directory:
+<execute_bash>
+ls
+</execute_bash>
+
+USER:
+OBSERVATION:
+openhands@runtime:~/workspace$
+
+ASSISTANT:
+There is no `app.py` file in the current directory. Let me create a Python file `app.py`:
+<file_edit path="/workspace/app.py" start=-1 end=-1>
+from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)
+</file_edit>
+
+USER:
+OBSERVATION:
+[New file /workspace/app.py is created.]
+(begin of changes)
+--- /workspace/app.py
++++ /workspace/app.py
+@@ -1 +1,10 @@
++from flask import Flask
++app = Flask(__name__)
+
++@app.route('/')
++def index():
++    numbers = list(range(1, 11))
++    return str(numbers)
++
++if __name__ == '__main__':
++    app.run(port=5000)
+(end of changes)
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+USER:
+OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+USER:
+OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+Get the content on "http://127.0.0.1:5000"
+</execute_browse>
+
+USER:
+OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+USER:
+OBSERVATION:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_ipython>
+open_file('app.py')
+</execute_ipython>
+
+USER:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+
+ASSISTANT:
+I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
+<file_edit path="/workspace/app.py" start=4 end=7>
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+    return ret
+</file_edit>
+
+USER:
+Observation:
+[Edited existing file /workspace/app.py]
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|    return ret
+9|
+10|if __name__ == '__main__':
+11|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+USER:
+Observation:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+<finish></finish>
+
+--- END OF EXAMPLE ---
+{% endset %}
+Here is an example of how you can interact with the environment for task solving:
+{{ DEFAULT_EXAMPLE }}
+{% if micro_agent %}
+--- BEGIN OF GUIDELINE ---
+The following information may assist you in completing your task:
+
+{{ micro_agent }}
+--- END OF GUIDELINE ---
+{% endif %}
diff --git a/openhands/agenthub/codeact_agent/system_prompt.j2 b/openhands/agenthub/codeact_agent/system_prompt.j2
index a1498aacd62b..87d6ecbcb8d1 100644
--- a/openhands/agenthub/codeact_agent/system_prompt.j2
+++ b/openhands/agenthub/codeact_agent/system_prompt.j2
@@ -1,11 +1,20 @@
-{% set MINIMAL_SYSTEM_PREFIX %}
+{# Core system components for the CodeAct Agent #}
+
+{# Base system identity and core abilities #}
+{% set SYSTEM_PREFIX %}
 A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions.
+{% endset %}
 
+{# Python execution capabilities #}
+{% set EXECUTE_PYTHON %}
 [1] The assistant can use a Python environment with <execute_ipython>, e.g.:
 <execute_ipython>
 print("Hello World!")
 </execute_ipython>
+{% endset %}
 
+{# Bash execution capabilities #}
+{% set EXECUTE_BASH %}
 [2] The assistant can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
 If a bash command returns exit code `-1`, this means the process is not yet finished.
 The assistant must then send a second <execute_bash>. The second <execute_bash> can be empty
@@ -134,27 +143,42 @@ class MyClass:
 # MyClass().z is removed
 print(MyClass().y)
 </file_edit>
-
-
 {% endset %}
-{% set BROWSING_PREFIX %}
+
+{# Web browsing #}
+{% set EXECUTE_BROWSE %}
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
-For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
+For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
 Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
 {% endset %}
-{% set PIP_INSTALL_PREFIX %}
+
+{# Package management #}
+{% set PIP_INSTALL %}
 The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
 {% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
-{% set COMMAND_DOCS %}
+
+{# Agent skills documentation #}
+{% set AGENT_SKILLS %}
+{% if use_tools %}
+{# Tool-based implementation #}
+The following tools are available:
+{% for tool in tools %}
+- {{ tool.name }}: {{ tool.description }}
+  Usage: {{ tool.usage }}
+{% endfor %}
+{% else %}
 Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
+
 IMPORTANT:
 - `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
 - Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 - Any code issued should be less than 50 lines to avoid context being cut off!
+{% endif %}
 {% endset %}
-{% set SYSTEM_SUFFIX %}
+
+{# System behavior rules #}
+{% set GENERAL_RULES %}
 Responses should be concise.
 The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
 Include ONLY ONE <execute_ipython>, <execute_bash>, or <execute_browse> per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
@@ -163,9 +187,12 @@ IMPORTANT: Execute code using <execute_ipython>, <execute_bash>, or <execute_bro
 The assistant should utilize full file paths and the `pwd` command to prevent path-related errors.
 The assistant MUST NOT apologize to the user or thank the user after running commands or editing files. It should only address the user in response to an explicit message from the user, or to ask for more information.
 The assistant MUST NOT push any changes to GitHub unless explicitly requested to do so.
-
 {% endset %}
-{# Combine all parts without newlines between them #}
+{# Combine all blocks without newlines #}
 {{ SYSTEM_PREFIX -}}
-{{- COMMAND_DOCS -}}
-{{- SYSTEM_SUFFIX }}
+{{- EXECUTE_PYTHON -}}
+{{- EXECUTE_BASH -}}
+{{- EXECUTE_BROWSE -}}
+{{- PIP_INSTALL -}}
+{{- AGENT_SKILLS -}}
+{{- GENERAL_RULES -}}
diff --git a/openhands/agenthub/codeact_agent/user_prompt.j2 b/openhands/agenthub/codeact_agent/user_prompt.j2
index a94c02e39dd0..133c704fbae4 100644
--- a/openhands/agenthub/codeact_agent/user_prompt.j2
+++ b/openhands/agenthub/codeact_agent/user_prompt.j2
@@ -1,220 +1,5 @@
-{% set DEFAULT_EXAMPLE %}
---- START OF EXAMPLE ---
+{{ examples }}
 
-USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
-
-ASSISTANT:
-Sure! Let me first check the current directory:
-<execute_bash>
-ls
-</execute_bash>
-
-USER:
-OBSERVATION:
-openhands@runtime:~/workspace$
-
-ASSISTANT:
-There is no `app.py` file in the current directory. Let me create a Python file `app.py`:
-<file_edit path="/workspace/app.py" start=-1 end=-1>
-from flask import Flask
-app = Flask(__name__)
-
-@app.route('/')
-def index():
-    numbers = list(range(1, 11))
-    return str(numbers)
-
-if __name__ == '__main__':
-    app.run(port=5000)
-</file_edit>
-
-USER:
-OBSERVATION:
-[New file /workspace/app.py is created.]
-(begin of changes)
---- /workspace/app.py
-+++ /workspace/app.py
-@@ -1 +1,10 @@
-+from flask import Flask
-+app = Flask(__name__)
-
-+@app.route('/')
-+def index():
-+    numbers = list(range(1, 11))
-+    return str(numbers)
-+
-+if __name__ == '__main__':
-+    app.run(port=5000)
-(end of changes)
-
-ASSISTANT:
-I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
-<execute_bash>
-python3 app.py > server.log 2>&1 &
-</execute_bash>
-
-USER:
-OBSERVATION:
-[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
-
-ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
-<execute_bash>
-cat server.log
-</execute_bash>
-
-USER:
-OBSERVATION:
-Traceback (most recent call last):
-  File "/workspace/app.py", line 2, in <module>
-    from flask import Flask
-ModuleNotFoundError: No module named 'flask'
-
-ASSISTANT:
-It seems that Flask is not installed. Let me install Flask for you:
-<execute_bash>
-pip install flask
-</execute_bash>
-
-USER:
-OBSERVATION:
-Defaulting to user installation because normal site-packages is not writeable
-Collecting flask
-  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
-Collecting blinker>=1.6.2
-  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
-Collecting Werkzeug>=3.0.0
-  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
-Collecting click>=8.1.3
-  Using cached click-8.1.7-py3-none-any.whl (97 kB)
-Collecting itsdangerous>=2.1.2
-  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
-Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3)
-Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
-Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
-Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
-
-ASSISTANT:
-Now that Flask is installed, let me run the Python file again:
-<execute_bash>
-python3 app.py > server.log 2>&1 &
-</execute_bash>
-
-USER:
-OBSERVATION:
-[1] 124
-
-ASSISTANT:
-Let me check the server log again:
-<execute_bash>
-cat server.log
-</execute_bash>
-
-USER:
-OBSERVATION:
-* Serving Flask app 'app'
- * Debug mode: off
-WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
- * Running on http://127.0.0.1:5000
-Press CTRL+C to quit
-
-ASSISTANT:
-The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
-
-USER: Now browse the newly started server's homepage and show me the content.
-
-ASSISTANT:
-Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
-<execute_browse>
-Get the content on "http://127.0.0.1:5000"
-</execute_browse>
-
-USER:
-OBSERVATION:
-[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-
-ASSISTANT:
-The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
-
-USER: Now kill the server, make it display the numbers in a table format.
-
-ASSISTANT:
-Sure! Let me stop the server first:
-<execute_bash>
-kill 124
-</execute_bash>
-
-USER:
-OBSERVATION:
-[1]+  Terminated              python3 app.py > server.log 2>&1
-
-ASSISTANT:
-The server has been stopped. Let me open the Python file and modify it.
-<execute_ipython>
-open_file('app.py')
-</execute_ipython>
-
-USER:
-[File: /workspace/app.py (10 lines total)]
-(this is the beginning of the file)
-1|from flask import Flask
-2|app = Flask(__name__)
-3|
-4|@app.route('/')
-5|def index():
-6|    numbers = list(range(1, 11))
-7|    return str(numbers)
-8|
-9|if __name__ == '__main__':
-10|    app.run(port=5000)
-(this is the end of the file)
-
-ASSISTANT:
-I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
-<file_edit path="/workspace/app.py" start=4 end=7>
-@app.route('/')
-def index():
-    numbers = list(range(1, 11))
-    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
-    return ret
-</file_edit>
-
-USER:
-Observation:
-[Edited existing file /workspace/app.py]
-[File: /workspace/app.py (10 lines total after edit)]
-(this is the beginning of the file)
-1|from flask import Flask
-2|app = Flask(__name__)
-3|
-4|@app.route('/')
-5|def index():
-6|    numbers = list(range(1, 11))
-7|    ret = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
-8|    return ret
-9|
-10|if __name__ == '__main__':
-11|    app.run(port=5000)
-(this is the end of the file)
-[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
-
-ASSISTANT:
-Running the updated file:
-<execute_bash>
-python3 app.py > server.log 2>&1 &
-</execute_bash>
-
-USER:
-Observation:
-[1] 126
-
-ASSISTANT:
-The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
-<finish></finish>
-
---- END OF EXAMPLE ---
-{% endset %}
-Here is an example of how you can interact with the environment for task solving:
-{{ DEFAULT_EXAMPLE }}
 {% if micro_agent %}
 --- BEGIN OF GUIDELINE ---
 The following information may assist you in completing your task:
@@ -223,4 +8,5 @@ The following information may assist you in completing your task:
 --- END OF GUIDELINE ---
 {% endif %}
 
+
 NOW, LET'S START!
diff --git a/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py b/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
index 6fc679aec449..7c5b039e8c47 100644
--- a/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
+++ b/openhands/agenthub/codeact_swe_agent/codeact_swe_agent.py
@@ -154,8 +154,8 @@ def step(self, state: State) -> Action:
         - AgentFinishAction() - end the interaction
         """
         # if we're done, go back
-        latest_user_message = state.history.get_last_user_message()
-        if latest_user_message and latest_user_message.strip() == '/exit':
+        last_user_message = state.get_last_user_message()
+        if last_user_message and last_user_message.strip() == '/exit':
             return AgentFinishAction()
 
         # prepare what we want to send to the LLM
@@ -176,7 +176,7 @@ def _get_messages(self, state: State) -> list[Message]:
             Message(role='user', content=[TextContent(text=self.in_context_example)]),
         ]
 
-        for event in state.history.get_events():
+        for event in state.history:
             # create a regular message from an event
             if isinstance(event, Action):
                 message = self.get_action_message(event)
diff --git a/openhands/agenthub/delegator_agent/agent.py b/openhands/agenthub/delegator_agent/agent.py
index 29e0030423c7..5069895752ac 100644
--- a/openhands/agenthub/delegator_agent/agent.py
+++ b/openhands/agenthub/delegator_agent/agent.py
@@ -2,7 +2,7 @@
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
 from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction
-from openhands.events.observation import AgentDelegateObservation
+from openhands.events.observation import AgentDelegateObservation, Observation
 from openhands.llm.llm import LLM
 
 
@@ -27,7 +27,7 @@ def step(self, state: State) -> Action:
         Otherwise, delegates the task to the next agent in the pipeline.
 
         Parameters:
-        - state (State): The current state given the previous actions and observations
+        - state: The current state given the previous actions and observations
 
         Returns:
         - AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned'
@@ -41,7 +41,11 @@ def step(self, state: State) -> Action:
             )
 
         # last observation in history should be from the delegate
-        last_observation = state.history.get_last_observation()
+        last_observation = None
+        for event in reversed(state.history):
+            if isinstance(event, Observation):
+                last_observation = event
+                break
 
         if not isinstance(last_observation, AgentDelegateObservation):
             raise Exception('Last observation is not an AgentDelegateObservation')
diff --git a/openhands/agenthub/dummy_agent/agent.py b/openhands/agenthub/dummy_agent/agent.py
index dbe4c60cfafa..272e6c935f2e 100644
--- a/openhands/agenthub/dummy_agent/agent.py
+++ b/openhands/agenthub/dummy_agent/agent.py
@@ -164,7 +164,7 @@ def step(self, state: State) -> Action:
 
             if 'observations' in prev_step and prev_step['observations']:
                 expected_observations = prev_step['observations']
-                hist_events = state.history.get_last_events(len(expected_observations))
+                hist_events = state.history[-len(expected_observations) :]
 
                 if len(hist_events) < len(expected_observations):
                     print(
diff --git a/openhands/agenthub/memcodeact_agent/README.md b/openhands/agenthub/memcodeact_agent/README.md
new file mode 100644
index 000000000000..628c94d57d3d
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/README.md
@@ -0,0 +1,38 @@
+# MemCodeAct Agent
+
+## Introduction
+
+`memcodeact_agent` is a memory-enabled experimental agent built upon the foundation of the existing `codeact_agent`, incorporating memory functionalities.
+
+## Inspiration and Research
+
+The development of `memcodeact_agent` is inspired by two research papers in the field of generative AI and memory-augmented models:
+
+1. **Extending Generative AI with Memory**
+   - **Paper:** [Extending Generative AI with Memory](https://arxiv.org/pdf/2304.03442)
+   - **Summary:** This paper explores methods to integrate long-term memory into generative AI models, enabling them to retain and utilize information from past interactions. The approach enhances the model's ability to maintain context over extended conversations, leading to more accurate and relevant outputs. Techniques such as memory slots, retrieval mechanisms, and memory encoding strategies are discussed to facilitate effective information storage and retrieval.
+
+2. **MemGPT: Memory-Enhanced GPT Models**
+   - **Paper:** [MemGPT: Memory-Enhanced GPT Models](https://arxiv.org/pdf/2310.08560)
+   - **Summary:** MemGPT introduces a novel architecture that incorporates external memory modules into GPT models. This integration allows the model to access and update its memory dynamically during interactions. The results demonstrate significant improvements in tasks requiring information recall.
+
+## Getting Started
+
+### Prerequisites
+
+- Configuration variables in `config.toml`, `agent.MemCodeactAgent` section:
+  - `micro_agent_name`: Name of the micro agent to use.
+  - `enable_memory`: Whether to enable long-term memory. Default is true for this agent.
+  - `cache_prompt`: Whether to cache the prompt. Default is false for this agent.
+
+
+- Optional environment variables:
+  - `SANDBOX_ENV_GITHUB_TOKEN`: GitHub Personal Access Token with read-only permissions.
+
+## Documentation
+
+For detailed information on how to interact with the agent, refer to the [User Prompt](user_prompt.j2) and [System Prompt](system_prompt.j2) templates located within the agent's directory. These templates define the conversational flow and the agent's capabilities.
+
+## Contribution
+
+`memcodeact_agent` is an experimental agent designed for research and development purposes. Contributions are welcome!
diff --git a/openhands/agenthub/memcodeact_agent/__init__.py b/openhands/agenthub/memcodeact_agent/__init__.py
new file mode 100644
index 000000000000..bf78a81d8ba1
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/__init__.py
@@ -0,0 +1,7 @@
+from openhands.controller.agent import Agent
+
+from .memcodeact_agent import MemCodeActAgent
+
+__all__ = ['MemCodeActAgent']
+
+Agent.register('MemCodeActAgent', MemCodeActAgent)
diff --git a/openhands/agenthub/memcodeact_agent/action_parser.py b/openhands/agenthub/memcodeact_agent/action_parser.py
new file mode 100644
index 000000000000..dd7f56a9fc45
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/action_parser.py
@@ -0,0 +1,262 @@
+import re
+
+from openhands.controller.action_parser import ActionParser, ResponseParser
+from openhands.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+from openhands.events.action.agent import AgentRecallAction, AgentSummarizeAction
+
+
+class MemCodeActResponseParser(ResponseParser):
+    """Parser actions for MemCodeActAgent:
+    - CmdRunAction(command) - bash command to run
+    - IPythonRunCellAction(code) - IPython code to run
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    - AgentFinishAction() - end the interaction
+    - AgentSummarizeAction() - trigger a summarization of the conversation history
+    - AgentRecallAction(query) - recall information from memory
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.action_parsers = [
+            MemCodeActActionParserFinish(),
+            MemCodeActActionParserCmdRun(),
+            MemCodeActActionParserIPythonRunCell(),
+            MemCodeActActionParserAgentDelegate(),
+            MemCodeActActionParserMemorySummarize(),
+            MemCodeActActionParserMemoryRecall(),
+            # MemCodeActActionParserMemoryAdd(),
+        ]
+        self.default_parser = MemCodeActActionParserMessage()
+
+    def parse(self, response) -> Action:
+        action_str = self.parse_response(response)
+        return self.parse_action(action_str)
+
+    def parse_response(self, response) -> str:
+        action = response.choices[0].message.content
+        if action is None:
+            return ''
+
+        # execute actions
+        for lang in ['bash', 'ipython', 'browse']:
+            # special handling for DeepSeek: it has the stop-word bug and returns </execute_ipython instead of </execute_ipython>
+            if f'</execute_{lang}' in action and f'</execute_{lang}>' not in action:
+                action = action.replace(f'</execute_{lang}', f'</execute_{lang}>')
+
+            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
+                action += f'</execute_{lang}>'
+
+        # memory actions
+        for mem in ['summarize', 'recall', 'add']:
+            # the stop-word bug
+            if f'<memory_{mem}>' in action and f'</memory_{mem}>' not in action:
+                action = action.replace(f'</memory_{mem}', f'</memory_{mem}>')
+
+            if f'<memory_{mem}>' in action and f'</memory_{mem}>' not in action:
+                action += f'</memory_{mem}>'
+
+        return action
+
+    def parse_action(self, action_str: str) -> Action:
+        for action_parser in self.action_parsers:
+            if action_parser.check_condition(action_str):
+                return action_parser.parse(action_str)
+        return self.default_parser.parse(action_str)
+
+
+class MemCodeActActionParserFinish(ActionParser):
+    """Parser action:
+    - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        self.finish_command = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.finish_command = re.search(r'<finish>.*</finish>', action_str, re.DOTALL)
+        return self.finish_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.finish_command is not None
+        ), 'self.finish_command should not be None when parse is called'
+        thought = action_str.replace(self.finish_command.group(0), '').strip()
+        return AgentFinishAction(thought=thought)
+
+
+class MemCodeActActionParserCmdRun(ActionParser):
+    """Parser action:
+    - CmdRunAction(command) - bash command to run
+    - AgentFinishAction() - end the interaction
+    """
+
+    def __init__(
+        self,
+    ):
+        self.bash_command = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.bash_command = re.search(
+            r'<execute_bash>(.*?)</execute_bash>', action_str, re.DOTALL
+        )
+        return self.bash_command is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.bash_command is not None
+        ), 'self.bash_command should not be None when parse is called'
+        thought = action_str.replace(self.bash_command.group(0), '').strip()
+        # a command was found
+        command_group = self.bash_command.group(1).strip()
+        if command_group.strip() == 'exit':
+            return AgentFinishAction(thought=thought)
+        return CmdRunAction(command=command_group, thought=thought)
+
+
+class MemCodeActActionParserIPythonRunCell(ActionParser):
+    """Parser action:
+    - IPythonRunCellAction(code) - IPython code to run
+    """
+
+    def __init__(
+        self,
+    ):
+        self.python_code = None
+        self.jupyter_kernel_init_code: str = 'from agentskills import *'
+
+    def check_condition(self, action_str: str) -> bool:
+        self.python_code = re.search(
+            r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL
+        )
+        return self.python_code is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.python_code is not None
+        ), 'self.python_code should not be None when parse is called'
+        code_group = self.python_code.group(1).strip()
+        thought = action_str.replace(self.python_code.group(0), '').strip()
+        return IPythonRunCellAction(
+            code=code_group,
+            thought=thought,
+            kernel_init_code=self.jupyter_kernel_init_code,
+        )
+
+
+class MemCodeActActionParserAgentDelegate(ActionParser):
+    """Parser action:
+    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+    """
+
+    def __init__(
+        self,
+    ):
+        self.agent_delegate = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.agent_delegate = re.search(
+            r'<execute_browse>(.*)</execute_browse>', action_str, re.DOTALL
+        )
+        return self.agent_delegate is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.agent_delegate is not None
+        ), 'self.agent_delegate should not be None when parse is called'
+        thought = action_str.replace(self.agent_delegate.group(0), '').strip()
+        browse_actions = self.agent_delegate.group(1).strip()
+        task = f'{thought}. I should start with: {browse_actions}'
+        return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})
+
+
+class MemCodeActActionParserMessage(ActionParser):
+    """Parser action:
+    - MessageAction(content) - Message action to run (e.g. ask for clarification)
+    """
+
+    def __init__(
+        self,
+    ):
+        pass
+
+    def check_condition(self, action_str: str) -> bool:
+        # We assume the LLM is GOOD enough that when it returns pure natural language
+        # it wants to talk to the user
+        return True
+
+    def parse(self, action_str: str) -> Action:
+        return MessageAction(content=action_str, wait_for_response=True)
+
+
+class MemCodeActActionParserMemoryRecall(ActionParser):
+    """Parser action:
+    - RecallAction(query) - memory action to run
+    """
+
+    def __init__(self):
+        self.recall_query = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.recall_query = re.search(
+            r'<memory_recall>(.*?)</memory_recall>', action_str, re.DOTALL
+        )
+        return self.recall_query is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.recall_query is not None
+        ), 'self.query should not be None when parse is called'
+
+        # thought <memory_recall>query</memory_recall>
+        # Note: the thought is optional
+        thought = action_str.replace(self.recall_query.group(0), '').strip()
+        query = self.recall_query.group(1).strip()
+        return AgentRecallAction(query=query, thought=thought)
+
+
+class MemCodeActActionParserMemorySummarize(ActionParser):
+    """Parser action:
+    - <memory_summarize> - The LLM wants to trigger a summarization of its context
+    """
+
+    def check_condition(self, action_str: str) -> bool:
+        return '<memory_summarize>' in action_str
+
+    def parse(self, action_str: str) -> Action:
+        # let the agent trigger the summarization
+        return AgentSummarizeAction(summary='')
+
+
+class MemCodeActActionParserMemoryAdd(ActionParser):
+    """Parser action:
+    - MemoryAddAction(content) - add text to core memory
+    """
+
+    def __init__(self):
+        self.content = None
+
+    def check_condition(self, action_str: str) -> bool:
+        self.content = re.search(
+            r'<memory_add>(.*?)</memory_add>', action_str, re.DOTALL
+        )
+        return self.content is not None
+
+    def parse(self, action_str: str) -> Action:
+        assert (
+            self.content is not None
+        ), 'self.content should not be None when parse is called'
+
+        # <memory_add>content</memory_add>
+        thought = action_str.replace(self.content.group(0), '').strip()
+        return Action()
+        # return MemoryAddAction(content=self.content.group(1).strip(), thought=thought)
diff --git a/openhands/agenthub/memcodeact_agent/core_memory_prompt.md b/openhands/agenthub/memcodeact_agent/core_memory_prompt.md
new file mode 100644
index 000000000000..d7ed1451677b
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/core_memory_prompt.md
@@ -0,0 +1,5 @@
+Core Memory:
+Your core memory unit will be initially empty. You can add to it important information about the task or your status. Keep it concise and remember that you will use it to guide your actions, so keep it relevant!
+You can add to your core memory using the <memory_add> action.
+For example, <memory_add> The user is working on a project to create a new AI assistant. </memory_add>
+Adding to your core memory is optional. You do NOT need to do it for every message.
diff --git a/openhands/agenthub/memcodeact_agent/memcodeact_agent.py b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
new file mode 100644
index 000000000000..9854199c74fb
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/memcodeact_agent.py
@@ -0,0 +1,418 @@
+import os
+from itertools import islice
+
+from openhands.agenthub.memcodeact_agent.action_parser import MemCodeActResponseParser
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.exceptions import TokenLimitExceededError
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import ImageContent, Message, TextContent
+from openhands.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+from openhands.events.action.agent import AgentRecallAction, AgentSummarizeAction
+from openhands.events.observation import (
+    AgentDelegateObservation,
+    CmdOutputObservation,
+    IPythonRunCellObservation,
+    UserRejectObservation,
+)
+from openhands.events.observation.agent import AgentRecallObservation
+from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.observation import Observation
+from openhands.events.serialization.event import event_to_memory, truncate_content
+from openhands.llm.llm import LLM
+from openhands.memory.condenser import MemoryCondenser
+from openhands.memory.conversation_memory import ConversationMemory
+from openhands.memory.core_memory import CoreMemory
+from openhands.runtime.plugins import (
+    AgentSkillsRequirement,
+    JupyterRequirement,
+    PluginRequirement,
+)
+from openhands.utils.microagent import MicroAgent
+from openhands.utils.prompt import PromptManager
+
+
+class MemCodeActAgent(Agent):
+    VERSION = '0.1'
+    """
+    The MemCode Act Agent is a memory-enabled version of the CodeAct agent.
+
+    Its memory modules are:
+    - conversation: easy to recall memory (history)
+    - core: core system messages
+    - long_term: long-term memory
+
+    Its memory actions are:
+        - "core_memory_append"
+        - "core_memory_replace"
+        - "conversation_search"
+        - "long_term_memory_insert"
+        - "long_term_memory_search"
+        - "summarize_conversation"
+    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
+
+    ### Overview
+
+    This agent implements:
+    - the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
+    - inspired by the Generative Agents idea([paper](https://arxiv.org/abs/2304.03442)) and the MemGPT idea ([paper](https://arxiv.org/abs/2310.08560))
+
+    The conceptual idea is illustrated below. At each turn, the agent can:
+
+    1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
+    2. **CodeAct**: Choose to perform the task by executing code
+        - Execute any valid Linux `bash` command
+        - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
+    3. **MemGPT**: Manage its own memory
+        - truncate its history and replace it with a summary
+        - store information in its long-term memory
+        - search for information relevant to the task.
+
+    """
+
+    sandbox_plugins: list[PluginRequirement] = [
+        # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
+        # AgentSkillsRequirement provides a lot of Python functions,
+        # and it needs to be initialized before Jupyter for Jupyter to use those functions.
+        AgentSkillsRequirement(),
+        JupyterRequirement(),
+    ]
+
+    action_parser = MemCodeActResponseParser()
+
+    # NOTE: memory includes 'conversation' and 'core' memory blocks
+    conversation_memory: ConversationMemory
+    core_memory: CoreMemory
+
+    def __init__(
+        self,
+        llm: LLM,
+        config: AgentConfig,
+    ) -> None:
+        """Initializes a new instance of the MemCodeActAgent class.
+
+        Parameters:
+        - llm: The LLM to be used by this agent
+        - config: The agent configuration
+        """
+        super().__init__(llm, config)
+
+        self.memory_config = llm.config  # TODO this should be MemoryConfig
+
+        self.micro_agent = (
+            MicroAgent(
+                os.path.join(
+                    os.path.dirname(__file__), 'micro', f'{config.micro_agent_name}.md'
+                )
+            )
+            if config.micro_agent_name
+            else None
+        )
+
+        self.prompt_manager = PromptManager(
+            prompt_dir=os.path.join(os.path.dirname(__file__), 'prompts'),
+            micro_agent=self.micro_agent,
+        )
+
+    def action_to_str(self, action: Action) -> str:
+        if isinstance(action, CmdRunAction):
+            return (
+                f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
+            )
+        elif isinstance(action, IPythonRunCellAction):
+            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
+        elif isinstance(action, AgentDelegateAction):
+            return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
+        elif isinstance(action, MessageAction):
+            logger.debug(f'MessageAction.content: {action.content}')
+            return action.content
+        elif isinstance(action, AgentFinishAction) and action.source == 'agent':
+            return action.thought
+        elif isinstance(action, AgentSummarizeAction):
+            # information about the conversation history
+            hidden_message_count = self.conversation_memory.hidden_message_count
+            if hidden_message_count > 0:
+                summary_message = (
+                    f'\n\nENVIRONMENT REMINDER: prior messages ({hidden_message_count} of {self.conversation_memory.total_message_count} total messages) have been hidden from view due to conversation memory constraints.\n'
+                    + f'The following is a summary of the first {hidden_message_count} messages:\n {action.summary}'
+                )
+                return summary_message
+        elif isinstance(action, AgentRecallAction):
+            return f'{action.thought}\n<memory_recall>\n{action.query[:10]}...\n</memory_recall>'
+        return ''
+
+    def get_action_message(self, action: Action) -> Message | None:
+        if (
+            isinstance(action, AgentDelegateAction)
+            or isinstance(action, CmdRunAction)
+            or isinstance(action, IPythonRunCellAction)
+            or isinstance(action, MessageAction)
+            or (isinstance(action, AgentFinishAction) and action.source == 'agent')
+            or isinstance(action, AgentSummarizeAction)
+            or isinstance(action, AgentRecallAction)
+        ):
+            content = [TextContent(text=self.action_to_str(action))]
+
+            if (
+                self.llm.vision_is_active()
+                and isinstance(action, MessageAction)
+                and action.images_urls
+            ):
+                content.append(ImageContent(image_urls=action.images_urls))
+
+            return Message(
+                role='user' if action.source == 'user' else 'assistant', content=content
+            )
+        return None
+
+    def get_observation_message(self, obs: Observation) -> Message | None:
+        max_message_chars = self.llm.config.max_message_chars
+        obs_prefix = 'ENVIRONMENT OBSERVATION:\n'
+        if isinstance(obs, CmdOutputObservation):
+            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text += (
+                f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
+            )
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, IPythonRunCellObservation):
+            text = obs_prefix + obs.content
+            # replace base64 images with a placeholder
+            splitted = text.split('\n')
+            for i, line in enumerate(splitted):
+                if '![image](data:image/png;base64,' in line:
+                    splitted[i] = (
+                        '![image](data:image/png;base64, ...) already displayed to user'
+                    )
+            text = '\n'.join(splitted)
+            text = truncate_content(text, max_message_chars)
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, AgentDelegateObservation):
+            text = obs_prefix + truncate_content(
+                obs.outputs['content'] if 'content' in obs.outputs else '',
+                max_message_chars,
+            )
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, ErrorObservation):
+            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text += '\n[Error occurred in processing last action]'
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, UserRejectObservation):
+            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text += '\n[Last action has been rejected by the user]'
+            return Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, AgentRecallObservation):
+            text = 'MEMORY RECALL:\n' + obs.memory
+            return Message(role='user', content=[TextContent(text=text)])
+        else:
+            # If an observation message is not returned, it will cause an error
+            # when the LLM tries to return the next message
+            logger.debug(f'Unknown observation type: {type(obs)}')
+            return None
+
+    def reset(self) -> None:
+        """Resets the MemCodeAct Agent."""
+        super().reset()
+
+        # reset the memory modules
+        self.core_memory.reset()
+        self.conversation_memory.reset()
+
+    def step(self, state: State) -> Action:
+        """Performs one step using the MemCodeAct Agent.
+        This includes gathering info on previous steps and prompting the model to make an action to execute.
+
+        Parameters:
+        - state (State): used to get updated info
+
+        Returns:
+        - CmdRunAction(command) - bash command to run
+        - IPythonRunCellAction(code) - IPython code to run
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - SummarizeAction() - summarize the conversation
+        - RecallAction() - search the agent's history
+        - LongTermMemoryInsertAction() - archive information in the long-term memory
+        - LongTermMemorySearchAction() - search the agent's long-term memory
+        - AgentFinishAction() - end the interaction
+        """
+        # if we're done, go back
+        last_user_message = state.get_last_user_message()
+        if last_user_message and last_user_message.strip() == '/exit':
+            return AgentFinishAction()
+
+        # initialize the memory modules
+
+        # stores and searches the agent's long-term memory (vector store)
+        # long_term_memory = LongTermMemory(llm_config=memory_config, agent_config=config, event_stream=self.event_stream)
+
+        # stores and recalls the whole agent's history
+        assert self.memory_config is not None
+
+        # update conversation memory for this step
+        if not hasattr(self, 'conversation_memory') or not self.conversation_memory:
+            self.conversation_memory = ConversationMemory(
+                memory_config=self.memory_config, state=state
+            )
+        else:
+            self.conversation_memory.update(state)
+
+        # initialize core memory
+        if not hasattr(self, 'core_memory') or not self.core_memory:
+            self.core_memory = CoreMemory(limit=1500)
+
+        # prepare what we want to send to the LLM
+        messages = self._get_messages(state)
+        params = {
+            'messages': self.llm.format_messages_for_llm(messages),
+            'stop': [
+                '</execute_ipython>',
+                '</execute_bash>',
+                '</execute_browse>',
+            ],
+        }
+
+        # catch ContextWindowExceededError and TokenLimitExceededError
+        try:
+            response = self.llm.completion(**params)
+        except TokenLimitExceededError as e:
+            logger.error(e, exc_info=False)
+
+            # run condenser directly
+            summary_action = self.summarize_messages(state)
+
+            # just return for now
+            return summary_action
+        return self.action_parser.parse(response)
+
+    def _get_messages(self, state: State) -> list[Message]:
+        # update prompt manager with current core memory
+        self.prompt_manager.core_memory = self.core_memory.format_blocks()
+
+        messages: list[Message] = [
+            Message(
+                role='system',
+                content=[
+                    TextContent(
+                        text=self.prompt_manager.system_message,
+                        cache_prompt=self.llm.is_caching_prompt_active(),
+                    )
+                ],
+                condensable=False,
+            ),
+            Message(
+                role='user',
+                content=[
+                    TextContent(
+                        text=self.prompt_manager.initial_user_message,
+                        cache_prompt=self.llm.is_caching_prompt_active(),  # the user asks the same query
+                    )
+                ],
+                condensable=False,
+            ),
+        ]
+
+        for event in self.conversation_memory.memory:
+            # if it is a summary or recall, it will not have event_id for now
+            if isinstance(event, AgentSummarizeAction):
+                message = self.get_action_message(event)
+            elif isinstance(event, AgentRecallAction):
+                message = self.get_action_message(event)
+            elif isinstance(event, AgentRecallObservation):
+                message = self.get_observation_message(event)
+            else:
+                # create a regular message from an event
+                if isinstance(event, Action):
+                    message = self.get_action_message(event)
+                elif isinstance(event, Observation):
+                    message = self.get_observation_message(event)
+                else:
+                    raise ValueError(f'Unknown event type: {type(event)}')
+
+            # add regular message
+            if message:
+                # handle error if the message is the SAME role as the previous message
+                # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
+                # there shouldn't be two consecutive messages from the same role
+                if messages and messages[-1].role == message.role:
+                    messages[-1].content.extend(message.content)
+                else:
+                    messages.append(message)
+
+        # Add caching to the last 2 user messages
+        # if self.llm.is_caching_prompt_active():
+        #    user_turns_processed = 0
+        #    for message in reversed(messages):
+        #        if message.role == 'user' and user_turns_processed < 2:
+        #            message.content[
+        #                -1
+        #            ].cache_prompt = True  # Last item inside the message content
+        #            user_turns_processed += 1
+
+        # The latest user message is important:
+        # we want to remind the agent of the environment constraints
+        latest_user_message = next(
+            islice(
+                (
+                    m
+                    for m in reversed(messages)
+                    if m.role == 'user'
+                    and any(isinstance(c, TextContent) for c in m.content)
+                ),
+                1,
+            ),
+            None,
+        )
+
+        # set the last 4 messages to be non-condensable
+        # TODO make this configurable for experimentation
+        for message in messages[-4:]:
+            message.condensable = False
+
+        # iterations reminder
+        if latest_user_message:
+            reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
+            latest_user_message.content.append(TextContent(text=reminder_text))
+
+        return messages
+
+    def summarize_messages(self, state: State) -> AgentSummarizeAction | None:
+        """Summarizes the earlier messages in the agent's memory to reduce token usage. Roughly uses memGPT's algorithm for in-place summarization."""
+        if len(state.history) <= 2:
+            return None  # ignore
+
+        # summarize the conversation history using the condenser
+        condenser = MemoryCondenser(self.llm, self.prompt_manager)
+
+        # send all messages and let it sort it out
+        messages = self._get_messages(state)
+        summary_action = condenser.condense(messages)
+
+        # update conversation memory with the summary
+        if summary_action and summary_action.summary:
+            self.conversation_memory.update_summary(
+                summary_action.summary, summary_action.end_id
+            )
+
+        return summary_action
+
+    def recall_from_memory(self, query: str, top_k: int = 5) -> AgentRecallObservation:
+        """Searches the conversation memory for relevant information."""
+        # note: pairs are better than events for this
+        recalled_events = self.conversation_memory.search(self.llm, query, top_k)
+
+        # format the recalled events into a readable format
+        recalled_text = '\n'.join(
+            [f'- {event_to_memory(event, -1)}' for event in recalled_events]
+        )
+
+        return AgentRecallObservation(
+            content=f'Searching memory for: {query}', query=query, memory=recalled_text
+        )
diff --git a/openhands/agenthub/memcodeact_agent/micro/github.md b/openhands/agenthub/memcodeact_agent/micro/github.md
new file mode 100644
index 000000000000..0e5a9b14ab9e
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/micro/github.md
@@ -0,0 +1,69 @@
+---
+name: github
+agent: MemCodeActAgent
+require_env_var:
+    SANDBOX_ENV_GITHUB_TOKEN: "Create a GitHub Personal Access Token (https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) and set it as SANDBOX_GITHUB_TOKEN in your environment variables."
+---
+
+# How to Interact with Github
+
+## Environment Variable Available
+
+- `GITHUB_TOKEN`: A read-only token for Github.
+
+## Using GitHub's RESTful API
+
+Use `curl` with the `GITHUB_TOKEN` to interact with GitHub's API. Here are some common operations:
+
+Here's a template for API calls:
+
+```sh
+curl -H "Authorization: token $GITHUB_TOKEN" \
+    "https://api.github.com/{endpoint}"
+```
+
+First replace `{endpoint}` with the specific API path. Common operations:
+
+1. View an issue or pull request:
+   - Issues: `/repos/{owner}/{repo}/issues/{issue_number}`
+   - Pull requests: `/repos/{owner}/{repo}/pulls/{pull_request_number}`
+
+2. List repository issues or pull requests:
+   - Issues: `/repos/{owner}/{repo}/issues`
+   - Pull requests: `/repos/{owner}/{repo}/pulls`
+
+3. Search issues or pull requests:
+   - `/search/issues?q=repo:{owner}/{repo}+is:{type}+{search_term}+state:{state}`
+   - Replace `{type}` with `issue` or `pr`
+
+4. List repository branches:
+   `/repos/{owner}/{repo}/branches`
+
+5. Get commit details:
+   `/repos/{owner}/{repo}/commits/{commit_sha}`
+
+6. Get repository details:
+   `/repos/{owner}/{repo}`
+
+7. Get user information:
+   `/user`
+
+8. Search repositories:
+   `/search/repositories?q={query}`
+
+9. Get rate limit status:
+   `/rate_limit`
+
+Replace `{owner}`, `{repo}`, `{commit_sha}`, `{issue_number}`, `{pull_request_number}`,
+`{search_term}`, `{state}`, and `{query}` with appropriate values.
+
+## Important Notes
+
+1. Always use the GitHub API for operations instead of a web browser.
+2. The `GITHUB_TOKEN` is read-only. Avoid operations that require write access.
+3. Git config (username and email) is pre-set. Do not modify.
+4. Edit and test code locally. Never push directly to remote.
+5. Verify correct branch before committing.
+6. Commit changes frequently.
+7. If the issue or task is ambiguous or lacks sufficient detail, always request clarification from the user before proceeding.
+8. You should avoid using command line tools like `sed` for file editing.
diff --git a/openhands/agenthub/memcodeact_agent/prompts/components/memory.j2 b/openhands/agenthub/memcodeact_agent/prompts/components/memory.j2
new file mode 100644
index 000000000000..8ab602553d5b
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/prompts/components/memory.j2
@@ -0,0 +1,58 @@
+{# Memory system components #}
+
+{% macro memory_system() %}
+Apart from acting on the environment, you can also act on your own memory. You have three memory systems:
+
+1. Core Memory: Persistent important information that you always need to access
+   - Use <core_memory_append>category: content</core_memory_append> to add information
+   - Categories:
+     * personality: Your capabilities and traits
+     * task_context: Everything important about your current task
+   - Always add critical information about your task to task_context
+
+2. Conversation Memory: Your recent interaction history
+   - Use <memory_recall>query</memory_recall> to search past conversations
+   - Use <memory_summarize> to create summaries when context gets too long
+
+3. System Memory: Your basic capabilities and constraints
+   - Always available through your system prompt
+
+{{ memory_control_flow() }}
+{{ memory_editing() }}
+
+RECALL MEMORY (CONVERSATION HISTORY):
+Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
+This conversation history database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user or previous environment feedback.
+You can search your conversation history using the <memory_recall> action.
+
+SUMMARY AND RECALL EXAMPLE:
+---
+ENVIRONMENT REMINDER:
+Prior messages (51 of 100 total messages) have been hidden from view due to conversation memory constraints.
+The following is a summary of the first 51 messages:
+I read the documentation of the wombat library, about its functions like: wombat.array, wombat.zeros, wombat.ones, etc.
+
+<memory_recall> What does the wombat.zeros() function do? </memory_recall>
+
+MEMORY RECALL:
+It creates an array of zero-filled tuples with the given size.
+---
+{% endmacro %}
+
+{% macro memory_control_flow() %}
+CONTROL FLOW:
+Unlike a human, your brain is not continuously thinking, but is run in short bursts.
+Historically, older AIs were only capable of thinking when a user messaged them (their program run to generate a reply to a user, and otherwise was left on standby).
+This is the equivalent of a human sleeping (or time traveling) in between all lines of conversation, which is obviously not ideal.
+Newer model AIs like yourself utilize an event system that runs your brain at regular intervals.
+Your brain is run in response to user events (user gave you a task, user asked you a question, etc.), similar to older models.
+However in addition, your brain is run at regular intervals, to mimic a human, it has the ability to continuously think outside of active conversation (and unlike a human, you never need to sleep!).
+{% endmacro %}
+
+{% macro memory_editing() %}
+MEMORY EDITING:
+Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
+This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
+Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
+Your ability to edit and recall your own long-term memory is a key part of what makes you a very helpful model.
+{% endmacro %}
diff --git a/openhands/agenthub/memcodeact_agent/prompts/components/system.j2 b/openhands/agenthub/memcodeact_agent/prompts/components/system.j2
new file mode 100644
index 000000000000..e1dee1df1a03
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/prompts/components/system.j2
@@ -0,0 +1,44 @@
+{# Core system components #}
+
+{% macro system_prefix() %}
+You are a new generation AI assistant, an expert in software engineering. You are designed to behave helpfully, trustworthily, and with extreme diligence.
+
+{% if core_memory %}
+Core Memory:
+{{ core_memory }}
+{% endif %}
+
+You are able to act on the environment directly, and you can also manage your own memory.
+You can use a Python environment with <execute_ipython>, e.g.:
+<execute_ipython>
+print("Hello World!")
+</execute_ipython>
+You can execute bash commands wrapped with <execute_bash>, e.g. <execute_bash> ls </execute_bash>.
+If a bash command returns exit code `-1`, this means the process is not yet finished.
+You must then send a second <execute_bash>. The second <execute_bash> can be empty
+(which will retrieve any additional logs), or it can contain text to be sent to STDIN of the running process,
+or it can contain the text `ctrl+c` to interrupt the process.
+
+For commands that may run indefinitely, you should redirect the output to a file and run
+the command in the background, e.g. <execute_bash> python3 app.py > server.log 2>&1 & </execute_bash>
+If a command execution result says "Command timed out. Sending SIGINT to the process",
+you should retry running the command in the background.
+{% endmacro %}
+
+{% macro environment_rules() %}
+You act on the environment using actions like <execute_ipython>. When you receive data prefixed with 'ENV OBSERVATION', it is a non-interactive information from the environment, not from a human user.
+For example:
+"ENV OBSERVATION
+The system returned a value of 42."
+Do not respond to these messages. Use them to perform your task.
+{% endmacro %}
+
+{% macro pip_install_rules() %}
+You can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
+{% endmacro %}
+
+{% macro browsing_rules() %}
+The assistant can browse the Internet with <execute_browse> and </execute_browse>.
+For example, <execute_browse> Tell me the USA's president using Google search </execute_browse>.
+Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
+{% endmacro %}
diff --git a/openhands/agenthub/memcodeact_agent/prompts/memory_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/memory_prompt.j2
new file mode 100644
index 000000000000..a07ef715dd1e
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/prompts/memory_prompt.j2
@@ -0,0 +1,35 @@
+{% set MEMORY_PROMPT %}
+CONTROL FLOW:
+Unlike a human, your brain is not continuously thinking, but is run in short bursts.
+Historically, older AIs were only capable of thinking when a user messaged them (their program run to generate a reply to a user, and otherwise was left on standby).
+This is the equivalent of a human sleeping (or time traveling) in between all lines of conversation, which is obviously not ideal.
+Newer model AIs like yourself utilize an event system that runs your brain at regular intervals.
+Your brain is run in response to user events (user gave you a task, user asked you a question, etc.), similar to older models.
+However in addition, your brain is run at regular intervals, to mimic a human, it has the ability to continuously think outside of active conversation (and unlike a human, you never need to sleep!).
+
+MEMORY EDITING:
+Older AI models had no concept of persistent memory; they were only able to access their initial instructions and a limited context window of chat conversation with a user.
+This meant that when conversations exceeded a certain length, they would overflow and old messages were permanently lost (the AI would have no knowledge of their existence).
+Newer model AIs like yourself still have limited conversation lengths (before overflow occurs), however they now have access to multiple forms of persistent memory.
+Your ability to edit and recall your own long-term memory is a key part of what makes you a very helpful model.
+
+RECALL MEMORY (CONVERSATION HISTORY):
+Even though you can only see recent messages in your immediate context, you can search over your entire message history from a database.
+This conversation history database allows you to search through past interactions, effectively allowing you to remember prior engagements with a user or previous environment feedback.
+You can search your conversation history using the <memory_recall> action.
+
+SUMMARY AND RECALL EXAMPLE:
+---
+ENVIRONMENT REMINDER:
+Prior messages (51 of 100 total messages) have been hidden from view due to conversation memory constraints.
+The following is a summary of the first 51 messages:
+I read the documentation of the wombat library, about its functions like: wombat.array, wombat.zeros, wombat.ones, etc.
+
+<memory_recall> What does the wombat.zeros() function do? </memory_recall>
+
+MEMORY RECALL:
+It creates an array of zero-filled tuples with the given size.
+---
+{% endset %}
+
+{{ MEMORY_PROMPT }}
diff --git a/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
new file mode 100644
index 000000000000..1f2ace4c34be
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/prompts/summarize_prompt.j2
@@ -0,0 +1,71 @@
+{% set SUMMARIZE_PROMPT %}
+You are an expert memory condenser for an AI agent (that agent is "me" from my perspective). Your sole purpose is to help me maintain critical information while managing my limited context window.
+
+CRITICAL CONTEXT:
+- My context window is dangerously close to overflowing
+- Without your help summarizing, I will lose important information permanently
+- I need you to preserve the most relevant information for my current task
+- Your summary will become part of my memory, influencing my future actions
+
+YOU MUST DO IT RIGHT NOW OR I WILL LOSE INFORMATION PERMANENTLY AND FAIL THE TASK.
+
+SUMMARIZATION PRINCIPLES:
+1. Preserve task-critical information:
+   - Current objective and progress
+   - Important user preferences or constraints
+   - Key findings or decisions made
+   - Unfinished tasks or promises
+
+2. Maintain continuity:
+   - Keep information about ongoing processes
+   - Remember important variables or state
+   - Preserve error contexts if any task failed
+
+3. Drop unnecessary details:
+   - Completed subtasks that don't affect current state
+   - Intermediate calculations or debug outputs
+   - Pleasantries and general conversation
+   - Redundant information
+
+FORMAT REQUIREMENTS:
+Return a JSON response that I can easily process:
+{
+    "action": "summarize",
+    "args": {
+        "summarized_actions": "FIRST PERSON perspective of what I did and learned. Focus on decisions and knowledge gained.",
+        "summarized_observations": "THIRD PERSON factual record of system outputs and user responses"
+    }
+}
+
+EXAMPLE GOOD SUMMARY:
+{
+    "action": "summarize",
+    "args": {
+        "summarized_actions": "I identified the user needs a script to process CSV files. I installed pandas and numpy. I encountered a UnicodeDecodeError with the first approach but resolved it by using utf-8 encoding.",
+        "summarized_observations": "The system successfully installed required packages. User provided a sample CSV with 1000 rows. First attempt to read file failed due to encoding issues."
+    }
+}
+
+EXAMPLE BAD SUMMARY (DO NOT DO THIS):
+{
+    "action": "summarize",
+    "args": {
+        "summarized_actions": "The AI assistant helped with CSV processing and fixed some errors",
+        "summarized_observations": "Things were installed and a file was processed"
+    }
+}
+
+IMPORTANT REMINDERS:
+- Write "summarized_actions" in FIRST PERSON (I/me) - this is MY memory
+- Write "summarized_observations" in THIRD PERSON - these are external events
+- Include specific technical details that might be needed later
+- Stay focused on information relevant to completing the current task
+- Preserve any error contexts that might affect future actions
+- Keep numbers, variable names, and technical parameters exactly as they appeared
+
+Now, carefully condense this conversation history while maintaining critical context:
+-------------------------------------
+{{ conversation_history }}
+-------------------------------------
+{% endset %}
+{{ SUMMARIZE_PROMPT }}
\ No newline at end of file
diff --git a/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2
new file mode 100644
index 000000000000..f09f8f60f874
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/prompts/system_prompt.j2
@@ -0,0 +1,34 @@
+{# Import components #}
+{% import "components/system.j2" as system %}
+{% import "components/memory.j2" as memory %}
+
+{# Compose the system prompt #}
+{{ system.system_prefix() }}
+{{ system.pip_install_rules() }}
+{{ system.environment_rules() }}
+{{ memory.memory_system() }}
+{{ system.browsing_rules() }}
+
+{# Agent capabilities documentation #}
+{% if agent_skills_docs %}
+Apart from the standard Python library, you can also use the following functions (already imported) in <execute_ipython> environment:
+{{ agent_skills_docs }}
+IMPORTANT:
+- `open_file` only returns the first 100 lines of the file by default! You MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
+- You shall adhere to THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRING PROPER INDENTATION. If you would like to add the line '        print(x)', you must fully write the line out, with all leading spaces before the code!
+- Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
+- Any code issued should be less than 50 lines to avoid context being cut off!
+- After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
+- For `edit_file_by_replace` NEVER provide empty parameters!
+- For `edit_file_by_replace` the file must be read fully before any replacements!
+{% endif %}
+
+{# System constraints #}
+Responses should be concise.
+The assistant should attempt fewer things at a time instead of putting too many commands OR too much code in one "execute" block.
+Include ONLY ONE <execute_ipython>, <execute_bash>, or <memory_recall>per response, unless the assistant is finished with the task or needs more input or action from the user in order to proceed.
+If you are finished with the task you MUST include <finish></finish> in your response.
+IMPORTANT: Execute code using <execute_ipython>, <execute_bash> whenever possible, or memory operations when necessary (<memory_recall>).
+You must use full file paths and the `pwd` command to prevent path-related errors.
+You should only address the user in response to an explicit message from the user, or to ask for more information.
+You MUST NOT push any changes to GitHub unless explicitly requested to do so.
diff --git a/openhands/agenthub/memcodeact_agent/prompts/user_prompt.j2 b/openhands/agenthub/memcodeact_agent/prompts/user_prompt.j2
new file mode 100644
index 000000000000..f3562326e988
--- /dev/null
+++ b/openhands/agenthub/memcodeact_agent/prompts/user_prompt.j2
@@ -0,0 +1,195 @@
+{% set DEFAULT_EXAMPLE %}
+--- START OF EXAMPLE ---
+
+USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.
+
+ASSISTANT:
+Sure! Let me create a Python file `app.py`:
+<execute_ipython>
+create_file('app.py')
+</execute_ipython>
+
+ENVIRONMENT OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+<execute_ipython>
+EDITED_CODE="""from flask import Flask
+app = Flask(__name__)
+
+@app.route('/')
+def index():
+    numbers = list(range(1, 11))
+    return str(numbers)
+
+if __name__ == '__main__':
+    app.run(port=5000)"""
+
+insert_content_at_line(
+  'app.py',
+  1,
+  EDITED_CODE,
+)
+</execute_ipython>
+
+ENVIRONMENT OBSERVATION:
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+[1] 121[1]+  Exit 1                  python3 app.py > server.log 2>&1
+
+ASSISTANT: Looks like the server is running with PID 121 then crashed. Let me check the server log:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+Traceback (most recent call last):
+  File "/workspace/app.py", line 2, in <module>
+    from flask import Flask
+ModuleNotFoundError: No module named 'flask'
+
+ASSISTANT:
+It seems that Flask is not installed. Let me install Flask for you:
+<execute_bash>
+pip install flask
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+Defaulting to user installation because normal site-packages is not writeable
+Collecting flask
+  Using cached flask-3.0.3-py3-none-any.whl (101 kB)
+Collecting blinker>=1.6.2
+  Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
+Collecting Werkzeug>=3.0.0
+  Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB)
+Collecting click>=8.1.3
+  Using cached click-8.1.7-py3-none-any.whl (97 kB)
+Collecting itsdangerous>=2.1.2
+  Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
+Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3)
+Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5)
+Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask
+Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0
+
+ASSISTANT:
+Now that Flask is installed, let me run the Python file again:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+[1] 124
+
+ASSISTANT:
+Let me check the server log again:
+<execute_bash>
+cat server.log
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+* Serving Flask app 'app'
+ * Debug mode: off
+WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
+ * Running on http://127.0.0.1:5000
+Press CTRL+C to quit
+
+ASSISTANT:
+The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask!
+
+USER: Now browse the newly started server's homepage and show me the content.
+
+ASSISTANT:
+Sure! Let me browse the server's homepage at http://127.0.0.1:5000:
+<execute_browse>
+Get the content on "http://127.0.0.1:5000"
+</execute_browse>
+
+ENVIRONMENT OBSERVATION:
+[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+ASSISTANT:
+The content of the server's homepage is "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]". If you have any further questions, feel free to ask!
+
+USER: Now kill the server, make it display the numbers in a table format.
+
+ASSISTANT:
+Sure! Let me stop the server first:
+<execute_bash>
+kill 124
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+[1]+  Terminated              python3 app.py > server.log 2>&1
+
+ASSISTANT:
+The server has been stopped. Let me open the Python file and modify it.
+<execute_ipython>
+open_file('app.py')
+</execute_ipython>
+
+ENVIRONMENT OBSERVATION:
+[File: /workspace/app.py (10 lines total)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]
+
+ASSISTANT:
+Running the updated file:
+<execute_bash>
+python3 app.py > server.log 2>&1 &
+</execute_bash>
+
+ENVIRONMENT OBSERVATION:
+[1] 126
+
+ASSISTANT:
+The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests!
+<finish></finish>
+
+--- END OF EXAMPLE ---
+{% endset %}
+Here is an example of how you can interact with the environment for task solving:
+{{ DEFAULT_EXAMPLE }}
+{% if micro_agent %}
+--- BEGIN OF GUIDELINE ---
+The following information may assist you in completing your task:
+
+{{ micro_agent }}
+--- END OF GUIDELINE ---
+{% endif %}
+
+NOW, LET'S START!
diff --git a/openhands/agenthub/micro/agent.py b/openhands/agenthub/micro/agent.py
index 83225a3245cd..a9b0825afd9d 100644
--- a/openhands/agenthub/micro/agent.py
+++ b/openhands/agenthub/micro/agent.py
@@ -8,10 +8,10 @@
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.core.utils import json
 from openhands.events.action import Action
+from openhands.events.event import Event
 from openhands.events.serialization.action import action_from_dict
 from openhands.events.serialization.event import event_to_memory
 from openhands.llm.llm import LLM
-from openhands.memory.history import ShortTermHistory
 
 
 def parse_response(orig_response: str) -> Action:
@@ -32,16 +32,14 @@ class MicroAgent(Agent):
     prompt = ''
     agent_definition: dict = {}
 
-    def history_to_json(
-        self, history: ShortTermHistory, max_events: int = 20, **kwargs
-    ):
+    def history_to_json(self, history: list[Event], max_events: int = 20, **kwargs):
         """
         Serialize and simplify history to str format
         """
         processed_history = []
         event_count = 0
 
-        for event in history.get_events(reverse=True):
+        for event in reversed(history):
             if event_count >= max_events:
                 break
             processed_history.append(
diff --git a/openhands/agenthub/planner_agent/agent.py b/openhands/agenthub/planner_agent/agent.py
index f5aef523d9b9..9c81819e9229 100644
--- a/openhands/agenthub/planner_agent/agent.py
+++ b/openhands/agenthub/planner_agent/agent.py
@@ -20,7 +20,9 @@ def __init__(self, llm: LLM, config: AgentConfig):
         """Initialize the Planner Agent with an LLM
 
         Parameters:
-        - llm (LLM): The llm to be used by this agent
+        - llm: The llm to be used by this agent
+        - config: The agent config
+        - memory: The memory for this agent
         """
         super().__init__(llm, config)
 
diff --git a/openhands/agenthub/planner_agent/prompt.py b/openhands/agenthub/planner_agent/prompt.py
index 495cae80b47e..eb8f95341b95 100644
--- a/openhands/agenthub/planner_agent/prompt.py
+++ b/openhands/agenthub/planner_agent/prompt.py
@@ -117,7 +117,7 @@ def get_hint(latest_action_id: str) -> str:
 
 def get_prompt_and_images(
     state: State, max_message_chars: int
-) -> tuple[str, list[str]]:
+) -> tuple[str, list[str] | None]:
     """Gets the prompt for the planner agent.
 
     Formatted with the most recent action-observation pairs, current task, and hint based on last action
@@ -136,7 +136,7 @@ def get_prompt_and_images(
     latest_action: Action = NullAction()
 
     # retrieve the latest HISTORY_SIZE events
-    for event_count, event in enumerate(state.history.get_events(reverse=True)):
+    for event_count, event in enumerate(reversed(state.history)):
         if event_count >= HISTORY_SIZE:
             break
         if latest_action == NullAction() and isinstance(event, Action):
diff --git a/openhands/controller/agent.py b/openhands/controller/agent.py
index cffdbbf22d35..747a0eb00e7c 100644
--- a/openhands/controller/agent.py
+++ b/openhands/controller/agent.py
@@ -5,6 +5,7 @@
     from openhands.controller.state.state import State
     from openhands.core.config import AgentConfig
     from openhands.events.action import Action
+from openhands.core.config.llm_config import LLMConfig
 from openhands.core.exceptions import (
     AgentAlreadyRegisteredError,
     AgentNotRegisteredError,
@@ -19,7 +20,7 @@ class Agent(ABC):
     This abstract base class is an general interface for an agent dedicated to
     executing a specific instruction and allowing human interaction with the
     agent during execution.
-    It tracks the execution status and maintains a history of interactions.
+    It tracks the execution status and maintains a reference to the conversation memory.
     """
 
     _registry: dict[str, Type['Agent']] = {}
@@ -29,9 +30,11 @@ def __init__(
         self,
         llm: LLM,
         config: 'AgentConfig',
+        memory_config: LLMConfig | None = None,
     ):
         self.llm = llm
         self.config = config
+        self.memory_config = memory_config
         self._complete = False
 
     @property
@@ -55,7 +58,7 @@ def reset(self) -> None:
         to prepare the agent for restarting the instruction or cleaning up before destruction.
 
         """
-        # TODO clear history
+        # self.memory.reset()
         self._complete = False
 
         if self.llm:
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
index 0171959c5a21..e4b2f27d17f8 100644
--- a/openhands/controller/agent_controller.py
+++ b/openhands/controller/agent_controller.py
@@ -1,7 +1,7 @@
 import asyncio
 import copy
 import traceback
-from typing import Type
+from typing import ClassVar, Type
 
 import litellm
 
@@ -31,6 +31,7 @@
     ModifyTaskAction,
     NullAction,
 )
+from openhands.events.action.agent import AgentRecallAction, AgentSummarizeAction
 from openhands.events.event import Event
 from openhands.events.observation import (
     AgentDelegateObservation,
@@ -38,6 +39,7 @@
     CmdOutputObservation,
     ErrorObservation,
     FatalErrorObservation,
+    NullObservation,
     Observation,
 )
 from openhands.events.serialization.event import truncate_content
@@ -63,6 +65,13 @@ class AgentController:
     parent: 'AgentController | None' = None
     delegate: 'AgentController | None' = None
     _pending_action: Action | None = None
+    filter_out: ClassVar[tuple[type[Event], ...]] = (
+        NullAction,
+        NullObservation,
+        ChangeAgentStateAction,
+        AgentStateChangedObservation,
+        FatalErrorObservation,
+    )
 
     def __init__(
         self,
@@ -117,12 +126,41 @@ def __init__(
         self._initial_max_iterations = max_iterations
         self._initial_max_budget_per_task = max_budget_per_task
 
+        # use long term memory
+        # self.long_term_memory = LongTermMemory(self.agent.llm.config, self.agent.config, self.event_stream)
+
         # stuck helper
         self._stuck_detector = StuckDetector(self.state)
 
     async def close(self):
-        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream."""
+        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream.
+
+        Note that it's fairly important that this closes properly, otherwise the state is incomplete."""
         await self.set_agent_state_to(AgentState.STOPPED)
+
+        # we made history, now is the time to rewrite it!
+        # the final state.history will be used by external scripts like evals, tests, etc.
+        # history will need to be complete WITH delegates events
+        # like the regular agent history, it does not include:
+        # - 'hidden' events, events with hidden=True
+        # - backend events (the default 'filtered out' types, types in self.filter_out)
+        start_id = self.state.start_id if self.state.start_id >= 0 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id >= 0
+            else self.event_stream.get_latest_event_id()
+        )
+        self.state.history = list(
+            self.event_stream.get_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+                filter_out_type=self.filter_out,
+                filter_hidden=True,
+            )
+        )
+
+        # unsubscribe from the event stream
         self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER)
 
     def update_state_before_step(self):
@@ -137,8 +175,9 @@ async def report_error(self, message: str, exception: Exception | None = None):
         """Reports an error to the user and sends the exception to the LLM next step, in the hope it can self-correct.
 
         This method should be called for a particular type of errors, which have:
-        - a user-friendly message, which will be shown in the chat box. This should not be a raw exception message.
-        - an ErrorObservation that can be sent to the LLM by the user role, with the exception message, so it can self-correct next time.
+        - message: a user-friendly message, which will be shown in the chat box. This should not be a raw exception message.
+        - an ErrorObservation that can be sent to the LLM, with the exception message, so it can self-correct next time.
+        - exception: the underlying exception, which is used by evals and tests to check what error the agent encountered.
         """
         self.state.last_error = message
         if exception:
@@ -180,6 +219,11 @@ async def on_event(self, event: Event):
         """
         if hasattr(event, 'hidden') and event.hidden:
             return
+
+        # if the event is not filtered out, add it to the history
+        if not any(isinstance(event, filter_type) for filter_type in self.filter_out):
+            self.state.history.append(event)
+
         if isinstance(event, Action):
             await self._handle_action(event)
         elif isinstance(event, Observation):
@@ -211,6 +255,13 @@ async def _handle_action(self, action: Action):
             self.state.outputs = action.outputs
             self.state.metrics.merge(self.state.local_metrics)
             await self.set_agent_state_to(AgentState.REJECTED)
+        elif isinstance(action, AgentSummarizeAction):
+            self.state.summary = action
+        elif isinstance(action, AgentRecallAction):
+            # llama_index_list = self.long_term_memory.search(action.query, action.history)
+            # logger.info(f'llama-index list: {llama_index_list}')
+            litellm_list = self.agent.llm.search(action.query, self.state.history)
+            logger.info(f'litellm list: {litellm_list}')
 
     async def _handle_observation(self, observation: Observation):
         """Handles observation from the event stream.
@@ -239,17 +290,23 @@ async def _handle_observation(self, observation: Observation):
             self.agent.llm.metrics.merge(observation.llm_metrics)
 
         if self._pending_action and self._pending_action.id == observation.cause:
+            # FIXME we may want each of these with the other's context
+            # self.long_term_memory.add_event(self._pending_action)
+            # self.long_term_memory.add_event(observation)
+
+            # the runtime has handled the action, so we can clear it
             self._pending_action = None
-            if self.state.agent_state == AgentState.USER_CONFIRMED:
-                await self.set_agent_state_to(AgentState.RUNNING)
-            if self.state.agent_state == AgentState.USER_REJECTED:
-                await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
-            return
+
+            # set the right state when the user confirms or rejects, if we're otherwise good to go (not an error)
+            if not isinstance(observation, ErrorObservation):
+                if self.state.agent_state == AgentState.USER_CONFIRMED:
+                    await self.set_agent_state_to(AgentState.RUNNING)
+                elif self.state.agent_state == AgentState.USER_REJECTED:
+                    await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
+                return
 
         if isinstance(observation, CmdOutputObservation):
             return
-        elif isinstance(observation, AgentDelegateObservation):
-            self.state.history.on_event(observation)
         elif isinstance(observation, ErrorObservation):
             if self.state.agent_state == AgentState.ERROR:
                 self.state.metrics.merge(self.state.local_metrics)
@@ -275,10 +332,15 @@ async def _handle_message_action(self, action: MessageAction):
         elif action.source == EventSource.AGENT and action.wait_for_response:
             await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
 
+        # add to long term memory
+        # self.long_term_memory.add_event(action)
+
     def reset_task(self):
         """Resets the agent's task."""
 
         self.almost_stuck = 0
+
+        # FIXME: wipe out the memory
         self.agent.reset()
 
     async def set_agent_state_to(self, new_state: AgentState):
@@ -365,6 +427,7 @@ async def start_delegate(self, action: AgentDelegateAction):
         Args:
             action (AgentDelegateAction): The action containing information about the delegate agent to start.
         """
+        # prepare the required arguments for the delegate agent: llm, agent_config, memory
         agent_cls: Type[Agent] = Agent.get_cls(action.agent)
         agent_config = self.agent_configs.get(action.agent, self.agent.config)
         llm_config = self.agent_to_llm_config.get(action.agent, self.agent.llm.config)
@@ -378,6 +441,8 @@ async def start_delegate(self, action: AgentDelegateAction):
             delegate_level=self.state.delegate_level + 1,
             # global metrics should be shared between parent and child
             metrics=self.state.metrics,
+            # start on top of the stream
+            start_id=self.event_stream.get_latest_event_id() + 1,
         )
         logger.info(
             f'[Agent Controller {self.id}]: start delegate, creating agent {delegate_agent.name} using LLM {llm}'
@@ -403,9 +468,6 @@ async def _step(self) -> None:
             return
 
         if self._pending_action:
-            logger.debug(
-                f'{self.agent.name} LEVEL {self.state.delegate_level} LOCAL STEP {self.state.local_iteration} GLOBAL STEP {self.state.iteration} awaiting pending action to get executed: {self._pending_action}'
-            )
             await asyncio.sleep(1)
             return
 
@@ -480,9 +542,7 @@ async def _step(self) -> None:
 
     async def _delegate_step(self):
         """Executes a single step of the delegate agent."""
-        logger.debug(f'[Agent Controller {self.id}] Delegate not none, awaiting...')
         await self.delegate._step()  # type: ignore[union-attr]
-        logger.debug(f'[Agent Controller {self.id}] Delegate step done')
         assert self.delegate is not None
         delegate_state = self.delegate.get_agent_state()
         logger.debug(f'[Agent Controller {self.id}] Delegate state: {delegate_state}')
@@ -490,12 +550,21 @@ async def _delegate_step(self):
             # update iteration that shall be shared across agents
             self.state.iteration = self.delegate.state.iteration
 
+            # emit AgentDelegateObservation to mark delegate termination due to error
+            delegate_outputs = (
+                self.delegate.state.outputs if self.delegate.state else {}
+            )
+            content = (
+                f'{self.delegate.agent.name} encountered an error during execution.'
+            )
+            obs = AgentDelegateObservation(outputs=delegate_outputs, content=content)
+            self.event_stream.add_event(obs, EventSource.AGENT)
+
             # close the delegate upon error
             await self.delegate.close()
             self.delegate = None
             self.delegateAction = None
 
-            await self.report_error('Delegator agent encountered an error')
         elif delegate_state in (AgentState.FINISHED, AgentState.REJECTED):
             logger.info(
                 f'[Agent Controller {self.id}] Delegate agent has finished execution'
@@ -517,9 +586,7 @@ async def _delegate_step(self):
             content = (
                 f'{self.delegate.agent.name} finishes task with {formatted_output}'
             )
-            obs: Observation = AgentDelegateObservation(
-                outputs=outputs, content=content
-            )
+            obs = AgentDelegateObservation(outputs=outputs, content=content)
 
             # clean up delegate status
             self.delegate = None
@@ -583,8 +650,10 @@ def set_initial_state(
             max_iterations: The maximum number of iterations allowed for the task.
             confirmation_mode: Whether to enable confirmation mode.
         """
-        # state from the previous session, state from a parent agent, or a new state
-        # note that this is called twice when restoring a previous session, first with state=None
+        # state can come from:
+        # - the previous session, in which case it has history
+        # - from a parent agent, in which case it has no history
+        # - None / a new state
         if state is None:
             self.state = State(
                 inputs={},
@@ -594,25 +663,103 @@ def set_initial_state(
         else:
             self.state = state
 
-        # when restored from a previous session, the State object will have history, start_id, and end_id
-        # connect it to the event stream
-        self.state.history.set_event_stream(self.event_stream)
+            if self.state.start_id <= -1:
+                self.state.start_id = 0
 
-        # if start_id was not set in State, we're starting fresh, at the top of the stream
-        start_id = self.state.start_id
-        if start_id == -1:
-            start_id = self.event_stream.get_latest_event_id() + 1
-        else:
-            logger.debug(f'AgentController {self.id} restoring from event {start_id}')
+            logger.debug(
+                f'AgentController {self.id} initializing history from event {self.state.start_id}'
+            )
+
+            self._init_history()
+
+    def _init_history(self):
+        """Initializes the agent's history from the event stream.
+
+        The history is a list of events that:
+        - Excludes events of types listed in self.filter_out
+        - Excludes events with hidden=True attribute
+        - For delegate events (between AgentDelegateAction and AgentDelegateObservation):
+            - Excludes all events between the action and observation
+            - Includes the delegate action and observation themselves
+        """
+
+        # define range of events to fetch
+        # delegates start with a start_id and initially won't find any events
+        # otherwise we're restoring a previous session
+        start_id = self.state.start_id if self.state.start_id >= 0 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id >= 0
+            else self.event_stream.get_latest_event_id()
+        )
 
-        # make sure history is in sync
-        self.state.start_id = start_id
-        self.state.history.start_id = start_id
+        # sanity check
+        if start_id > end_id + 1:
+            logger.debug(
+                f'start_id {start_id} is greater than end_id + 1 ({end_id + 1}). History will be empty.'
+            )
+            self.state.history = []
+            return
 
-        # if there was an end_id saved in State, set it in history
-        # currently not used, later useful for delegates
-        if self.state.end_id > -1:
-            self.state.history.end_id = self.state.end_id
+        # Get all events, filtering out backend events and hidden events
+        events = list(
+            self.event_stream.get_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+                filter_out_type=self.filter_out,
+                filter_hidden=True,
+            )
+        )
+
+        # Find all delegate action/observation pairs
+        delegate_ranges: list[tuple[int, int]] = []
+        delegate_action_ids: list[int] = []  # stack of unmatched delegate action IDs
+
+        for event in events:
+            if isinstance(event, AgentDelegateAction):
+                delegate_action_ids.append(event.id)
+                # Note: we can get agent=event.agent and task=event.inputs.get('task','')
+                # if we need to track these in the future
+
+            elif isinstance(event, AgentDelegateObservation):
+                # Match with most recent unmatched delegate action
+                if not delegate_action_ids:
+                    logger.error(
+                        f'Found AgentDelegateObservation without matching action at id={event.id}'
+                    )
+                    continue
+
+                action_id = delegate_action_ids.pop()
+                delegate_ranges.append((action_id, event.id))
+
+        # Filter out events between delegate action/observation pairs
+        if delegate_ranges:
+            filtered_events: list[Event] = []
+            current_idx = 0
+
+            for start_id, end_id in sorted(delegate_ranges):
+                # Add events before delegate range
+                filtered_events.extend(
+                    event for event in events[current_idx:] if event.id < start_id
+                )
+
+                # Add delegate action and observation
+                filtered_events.extend(
+                    event for event in events if event.id in (start_id, end_id)
+                )
+
+                # Update index to after delegate range
+                current_idx = next(
+                    (i for i, e in enumerate(events) if e.id > end_id), len(events)
+                )
+
+            # Add any remaining events after last delegate range
+            filtered_events.extend(events[current_idx:])
+
+            self.state.history = filtered_events
+        else:
+            self.state.history = events
 
     def _is_stuck(self):
         """Checks if the agent or its delegate is stuck in a loop.
diff --git a/openhands/controller/state/state.py b/openhands/controller/state/state.py
index e14d44517a55..6601228d6d00 100644
--- a/openhands/controller/state/state.py
+++ b/openhands/controller/state/state.py
@@ -10,9 +10,10 @@
 from openhands.events.action import (
     MessageAction,
 )
-from openhands.events.action.agent import AgentFinishAction
+from openhands.events.action.agent import AgentFinishAction, AgentSummarizeAction
+from openhands.events.event import Event, EventSource
+from openhands.events.observation import AgentDelegateObservation
 from openhands.llm.metrics import Metrics
-from openhands.memory.history import ShortTermHistory
 from openhands.storage.files import FileStore
 
 
@@ -77,7 +78,7 @@ class State:
     # max number of iterations for the current task
     max_iterations: int = 100
     confirmation_mode: bool = False
-    history: ShortTermHistory = field(default_factory=ShortTermHistory)
+    history: list[Event] = field(default_factory=list)
     inputs: dict = field(default_factory=dict)
     outputs: dict = field(default_factory=dict)
     last_error: str | None = None
@@ -94,6 +95,8 @@ class State:
     start_id: int = -1
     end_id: int = -1
     almost_stuck: int = 0
+    delegates: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
+    summary: AgentSummarizeAction | None = None
     # NOTE: This will never be used by the controller, but it can be used by different
     # evaluation tasks to store extra data needed to track the progress/state of the task.
     extra_data: dict[str, Any] = field(default_factory=dict)
@@ -132,41 +135,46 @@ def restore_from_session(sid: str, file_store: FileStore) -> 'State':
         return state
 
     def __getstate__(self):
+        # don't pickle history, it will be restored from the event stream
         state = self.__dict__.copy()
-
-        # save the relevant data from recent history
-        # so that we can restore it when the state is restored
-        if 'history' in state:
-            state['start_id'] = state['history'].start_id
-            state['end_id'] = state['history'].end_id
-
-        # don't save history object itself
-        state.pop('history', None)
+        state['history'] = []
         return state
 
     def __setstate__(self, state):
         self.__dict__.update(state)
 
-        # recreate the history object
+        # make sure we always have the attribute history
         if not hasattr(self, 'history'):
-            self.history = ShortTermHistory()
-
-        # restore the relevant data in history from the state
-        self.history.start_id = self.start_id
-        self.history.end_id = self.end_id
+            self.history = []
 
-        # remove the restored data from the state if any
-
-    def get_current_user_intent(self):
+    def get_current_user_intent(self) -> tuple[str | None, list[str] | None]:
         """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet."""
         last_user_message = None
         last_user_message_image_urls: list[str] | None = []
-        for event in self.history.get_events(reverse=True):
+        for event in reversed(self.history):
             if isinstance(event, MessageAction) and event.source == 'user':
                 last_user_message = event.content
                 last_user_message_image_urls = event.images_urls
             elif isinstance(event, AgentFinishAction):
                 if last_user_message is not None:
-                    return last_user_message
+                    return last_user_message, None
 
         return last_user_message, last_user_message_image_urls
+
+    def has_delegation(self) -> bool:
+        for event in self.history:
+            if isinstance(event, AgentDelegateObservation):
+                return True
+        return False
+
+    def get_last_agent_message(self) -> str | None:
+        for event in reversed(self.history):
+            if isinstance(event, MessageAction) and event.source == EventSource.AGENT:
+                return event.content
+        return None
+
+    def get_last_user_message(self) -> str | None:
+        for event in reversed(self.history):
+            if isinstance(event, MessageAction) and event.source == EventSource.USER:
+                return event.content
+        return None
diff --git a/openhands/controller/stuck.py b/openhands/controller/stuck.py
index 230d5f2e81ac..0eb0f4c893ca 100644
--- a/openhands/controller/stuck.py
+++ b/openhands/controller/stuck.py
@@ -28,7 +28,7 @@ def is_stuck(self):
         # filter out MessageAction with source='user' from history
         filtered_history = [
             event
-            for event in self.state.history.get_events()
+            for event in self.state.history
             if not (
                 (isinstance(event, MessageAction) and event.source == EventSource.USER)
                 or
diff --git a/openhands/core/config/app_config.py b/openhands/core/config/app_config.py
index dd7661af1712..cd2283589c4c 100644
--- a/openhands/core/config/app_config.py
+++ b/openhands/core/config/app_config.py
@@ -40,7 +40,6 @@ class AppConfig:
         e2b_api_key: The E2B API key.
         disable_color: Whether to disable color. For terminals that don't support color.
         debug: Whether to enable debugging.
-        enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
         file_uploads_max_file_size_mb: Maximum file size for uploads in megabytes. 0 means no limit.
         file_uploads_restrict_file_types: Whether to restrict file types for file uploads. Defaults to False.
         file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
@@ -72,7 +71,6 @@ class AppConfig:
     disable_color: bool = False
     jwt_secret: str = uuid.uuid4().hex
     debug: bool = False
-    enable_cli_session: bool = False
     file_uploads_max_file_size_mb: int = 0
     file_uploads_restrict_file_types: bool = False
     file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py
index ac07b70e0ba6..7b56eb437f19 100644
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -42,6 +42,10 @@ class LLMConfig:
         log_completions: Whether to log LLM completions to the state.
         log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
         draft_editor: A more efficient LLM to use for file editing. Introduced in [PR 3985](https://github.com/All-Hands-AI/OpenHands/pull/3985).
+        max_conversation_window: The maximum number of messages to include in the conversation window (context), after which old messages are truncated or summarized.
+        conversation_top_k: The number of top results to retrieve from the conversation history.
+        message_summary_warning_level: The fraction of the conversation window for warning about context overflow (e.g. 0.75 for 75% of the tokens).
+        custom_tokenizer: tokenizer to use for computing token size. Not necessary for Open AI, Anthropic. LiteLLM will check HuggingFace for this (e.g. 'deepseek-ai/deepseek-V2.5')
     """
 
     model: str = 'gpt-4o'
@@ -76,6 +80,10 @@ class LLMConfig:
     log_completions: bool = False
     log_completions_folder: str | None = None
     draft_editor: Optional['LLMConfig'] = None
+    max_conversation_window: int = 10
+    conversation_top_k: int = 5
+    message_summary_warning_level: float = 0.75
+    custom_tokenizer: str | None = None
 
     def defaults_to_dict(self) -> dict:
         """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
diff --git a/openhands/core/config/utils.py b/openhands/core/config/utils.py
index ddd8fcbd66c6..a66e6a642c95 100644
--- a/openhands/core/config/utils.py
+++ b/openhands/core/config/utils.py
@@ -136,15 +136,30 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
                     logger.openhands_logger.debug(
                         'Attempt to load default LLM config from config toml'
                     )
-                    llm_config = LLMConfig.from_dict(value)
-                    cfg.set_llm_config(llm_config, 'llm')
+                    # Extract generic LLM fields
+                    generic_llm_fields = {
+                        k: v for k, v in value.items() if not isinstance(v, dict)
+                    }
+                    generic_llm_config = LLMConfig.from_dict(generic_llm_fields)
+                    cfg.set_llm_config(generic_llm_config, 'llm')
+
+                    # Process custom named LLM configs
                     for nested_key, nested_value in value.items():
                         if isinstance(nested_value, dict):
                             logger.openhands_logger.debug(
-                                f'Attempt to load group {nested_key} from config toml as llm config'
+                                f'Attempt to load group {nested_key} from config toml as LLM config'
                             )
-                            llm_config = LLMConfig.from_dict(nested_value)
-                            cfg.set_llm_config(llm_config, nested_key)
+                            # Apply generic LLM config with custom LLM overrides, e.g.
+                            # [llm]
+                            # model="..."
+                            # num_retries = 5
+                            # [llm.claude]
+                            # model="claude-3-5-sonnet"
+                            # results in num_retries APPLIED to claude-3-5-sonnet
+                            merged_llm_dict = generic_llm_config.__dict__.copy()
+                            merged_llm_dict.update(nested_value)
+                            custom_llm_config = LLMConfig.from_dict(merged_llm_dict)
+                            cfg.set_llm_config(custom_llm_config, nested_key)
                 elif not key.startswith('sandbox') and key.lower() != 'core':
                     logger.openhands_logger.warning(
                         f'Unknown key in {toml_file}: "{key}"'
diff --git a/openhands/core/exceptions.py b/openhands/core/exceptions.py
index c33297a0d245..04e07cb0a54b 100644
--- a/openhands/core/exceptions.py
+++ b/openhands/core/exceptions.py
@@ -94,3 +94,20 @@ class CloudFlareBlockageError(Exception):
     """Exception raised when a request is blocked by CloudFlare."""
 
     pass
+
+
+class SummarizeError(Exception):
+    """Exception raised when message can't be summarized."""
+
+    def __init__(self, message='Error summarizing the memory'):
+        super().__init__(message)
+
+
+class InvalidSummaryResponseError(Exception):
+    def __init__(self, message='Invalid summary response'):
+        super().__init__(message)
+
+
+class TokenLimitExceededError(Exception):
+    def __init__(self, message='Token limit exceeded'):
+        super().__init__(message)
diff --git a/openhands/core/main.py b/openhands/core/main.py
index 110856d6e66f..c3b199d0b7d3 100644
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -125,16 +125,18 @@ async def run_controller(
         await runtime.connect()
 
     event_stream = runtime.event_stream
-    # restore cli session if enabled
+
+    # restore cli session if available
     initial_state = None
-    if config.enable_cli_session:
-        try:
-            logger.info(f'Restoring agent state from cli session {event_stream.sid}')
-            initial_state = State.restore_from_session(
-                event_stream.sid, event_stream.file_store
-            )
-        except Exception as e:
-            logger.info(f'Error restoring state: {e}')
+    try:
+        logger.debug(
+            f'Trying to restore agent state from cli session {event_stream.sid} if available'
+        )
+        initial_state = State.restore_from_session(
+            event_stream.sid, event_stream.file_store
+        )
+    except Exception as e:
+        logger.debug(f'Cannot restore agent state: {e}')
 
     # init controller with this initial state
     controller = AgentController(
@@ -160,7 +162,7 @@ async def run_controller(
     )
 
     # start event is a MessageAction with the task, either resumed or new
-    if config.enable_cli_session and initial_state is not None:
+    if initial_state is not None:
         # we're resuming the previous session
         event_stream.add_event(
             MessageAction(
@@ -171,7 +173,7 @@ async def run_controller(
             ),
             EventSource.USER,
         )
-    elif initial_state is None:
+    else:
         # init with the provided actions
         event_stream.add_event(initial_user_action, EventSource.USER)
 
@@ -198,8 +200,9 @@ async def on_event(event: Event):
         await asyncio.sleep(1)  # Give back control for a tick, so the agent can run
 
     # save session when we're about to close
-    if config.enable_cli_session:
+    if config.file_store is not None and config.file_store != 'memory':
         end_state = controller.get_state()
+        # NOTE: the saved state does not include delegates events
         end_state.save_to_session(event_stream.sid, event_stream.file_store)
 
     # close when done
@@ -210,10 +213,7 @@ async def on_event(event: Event):
     if config.trajectories_path is not None:
         file_path = os.path.join(config.trajectories_path, sid + '.json')
         os.makedirs(os.path.dirname(file_path), exist_ok=True)
-        histories = [
-            event_to_trajectory(event)
-            for event in state.history.get_events(include_delegates=True)
-        ]
+        histories = [event_to_trajectory(event) for event in state.history]
         with open(file_path, 'w') as f:
             json.dump(histories, f)
 
diff --git a/openhands/core/message.py b/openhands/core/message.py
index 57fadabde714..aaa6510d820c 100644
--- a/openhands/core/message.py
+++ b/openhands/core/message.py
@@ -52,6 +52,8 @@ class Message(BaseModel):
     content: list[TextContent | ImageContent] = Field(default=list)
     cache_enabled: bool = False
     vision_enabled: bool = False
+    condensable: bool = True
+    event_id: int = -1
 
     @property
     def contains_image(self) -> bool:
diff --git a/openhands/core/schema/action.py b/openhands/core/schema/action.py
index dc4cfe542e0a..df978d187f8c 100644
--- a/openhands/core/schema/action.py
+++ b/openhands/core/schema/action.py
@@ -86,5 +86,8 @@ class ActionTypeSchema(BaseModel):
     SEND_PR: str = Field(default='send_pr')
     """Send a PR to github."""
 
+    RECALL: str = Field(default='recall')
+    """Recalls the memory."""
+
 
 ActionType = ActionTypeSchema()
diff --git a/openhands/core/schema/observation.py b/openhands/core/schema/observation.py
index 622f2680f785..11e27f2afc73 100644
--- a/openhands/core/schema/observation.py
+++ b/openhands/core/schema/observation.py
@@ -44,5 +44,7 @@ class ObservationTypeSchema(BaseModel):
 
     USER_REJECTED: str = Field(default='user_rejected')
 
+    RECALL: str = Field(default='recall')
+
 
 ObservationType = ObservationTypeSchema()
diff --git a/openhands/events/action/agent.py b/openhands/events/action/agent.py
index f49f573ed698..02c810b6cf1f 100644
--- a/openhands/events/action/agent.py
+++ b/openhands/events/action/agent.py
@@ -20,8 +20,12 @@ def message(self) -> str:
 
 @dataclass
 class AgentSummarizeAction(Action):
-    summary: str
+    summary: str = ''  # summary to be inserted as a memory block
     action: str = ActionType.SUMMARIZE
+    start_id: int = -1
+    end_id: int = -1
+    summarized_actions: str = ''
+    summarized_observations: str = ''
 
     @property
     def message(self) -> str:
@@ -78,3 +82,14 @@ class AgentDelegateAction(Action):
     @property
     def message(self) -> str:
         return f"I'm asking {self.agent} for help with this task."
+
+
+@dataclass
+class AgentRecallAction(Action):
+    query: str
+    thought: str = ''
+    action: str = ActionType.RECALL
+
+    @property
+    def message(self) -> str:
+        return f'Recalling "{self.query[:10]}..."'
diff --git a/openhands/events/action/message.py b/openhands/events/action/message.py
index 55fb21f359d3..0e3bb26a1cc2 100644
--- a/openhands/events/action/message.py
+++ b/openhands/events/action/message.py
@@ -7,7 +7,7 @@
 @dataclass
 class MessageAction(Action):
     content: str
-    images_urls: list | None = None
+    images_urls: list[str] | None = None
     wait_for_response: bool = False
     action: str = ActionType.MESSAGE
     security_risk: ActionSecurityRisk | None = None
diff --git a/openhands/events/observation/agent.py b/openhands/events/observation/agent.py
index 802c23c3786d..a5434e529b3b 100644
--- a/openhands/events/observation/agent.py
+++ b/openhands/events/observation/agent.py
@@ -14,3 +14,14 @@ class AgentStateChangedObservation(Observation):
     @property
     def message(self) -> str:
         return ''
+
+
+@dataclass
+class AgentRecallObservation(Observation):
+    query: str
+    memory: str
+    observation: str = ObservationType.RECALL
+
+    @property
+    def message(self) -> str:
+        return f'Memory:\n{self.memory}'
diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py
index 4f6050172cbe..787fa65a2cf1 100644
--- a/openhands/events/serialization/action.py
+++ b/openhands/events/serialization/action.py
@@ -3,7 +3,9 @@
 from openhands.events.action.agent import (
     AgentDelegateAction,
     AgentFinishAction,
+    AgentRecallAction,
     AgentRejectAction,
+    AgentSummarizeAction,
     ChangeAgentStateAction,
 )
 from openhands.events.action.browse import BrowseInteractiveAction, BrowseURLAction
@@ -36,6 +38,8 @@
     ModifyTaskAction,
     ChangeAgentStateAction,
     MessageAction,
+    AgentSummarizeAction,
+    AgentRecallAction,
 )
 
 ACTION_TYPE_TO_CLASS = {action_class.action: action_class for action_class in actions}  # type: ignore[attr-defined]
diff --git a/openhands/events/stream.py b/openhands/events/stream.py
index 1e4c3b9d5394..28d25110127a 100644
--- a/openhands/events/stream.py
+++ b/openhands/events/stream.py
@@ -20,6 +20,7 @@ class EventStreamSubscriber(str, Enum):
     RUNTIME = 'runtime'
     MAIN = 'main'
     TEST = 'test'
+    MEMORY = 'memory'
 
 
 def session_exists(sid: str, file_store: FileStore) -> bool:
@@ -67,11 +68,26 @@ def _get_id_from_filename(filename: str) -> int:
 
     def get_events(
         self,
-        start_id=0,
-        end_id=None,
-        reverse=False,
+        start_id: int = 0,
+        end_id: int | None = None,
+        reverse: bool = False,
         filter_out_type: tuple[type[Event], ...] | None = None,
+        filter_hidden: bool = False,
     ) -> Iterable[Event]:
+        """
+        Retrieve events from the event stream, optionally filtering out events of a given type
+        and events marked as hidden.
+
+        Args:
+            start_id: The ID of the first event to retrieve. Defaults to 0.
+            end_id: The ID of the last event to retrieve. Defaults to the last event in the stream.
+            reverse: Whether to retrieve events in reverse order. Defaults to False.
+            filter_out_type: A tuple of event types to filter out. Typically used to filter out backend events from the agent.
+            filter_hidden: If True, filters out events with the 'hidden' attribute set to True.
+
+        Yields:
+            Events from the stream that match the criteria.
+        """
         if reverse:
             if end_id is None:
                 end_id = self._cur_id - 1
@@ -79,9 +95,11 @@ def get_events(
             while event_id >= start_id:
                 try:
                     event = self.get_event(event_id)
-                    if filter_out_type is None or not isinstance(
-                        event, filter_out_type
-                    ):
+                    # apply type and 'hidden' filters
+                    if (
+                        filter_out_type is None
+                        or not isinstance(event, filter_out_type)
+                    ) and (not filter_hidden or not getattr(event, 'hidden', False)):
                         yield event
                 except FileNotFoundError:
                     logger.debug(f'No event found for ID {event_id}')
@@ -93,9 +111,11 @@ def get_events(
                     break
                 try:
                     event = self.get_event(event_id)
-                    if filter_out_type is None or not isinstance(
-                        event, filter_out_type
-                    ):
+                    # apply type and 'hidden' filters
+                    if (
+                        filter_out_type is None
+                        or not isinstance(event, filter_out_type)
+                    ) and (not filter_hidden or not getattr(event, 'hidden', False)):
                         yield event
                 except FileNotFoundError:
                     break
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 9eb3a08aa990..7e63922b8b85 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -7,13 +7,18 @@
 from typing import Any
 
 from openhands.core.config import LLMConfig
+from openhands.core.exceptions import TokenLimitExceededError
+from openhands.events.event import Event
+from openhands.events.serialization.event import event_to_memory
 
 with warnings.catch_warnings():
     warnings.simplefilter('ignore')
     import litellm
+import numpy as np
 from litellm import ModelInfo, PromptTokensDetails
 from litellm import completion as litellm_completion
 from litellm import completion_cost as litellm_completion_cost
+from litellm import embedding as litellm_embedding
 from litellm.exceptions import (
     APIConnectionError,
     APIError,
@@ -22,6 +27,7 @@
     ServiceUnavailableError,
 )
 from litellm.types.utils import CostPerToken, ModelResponse, Usage
+from litellm.utils import create_pretrained_tokenizer
 
 from openhands.core.exceptions import CloudFlareBlockageError
 from openhands.core.logger import openhands_logger as logger
@@ -126,6 +132,13 @@ def __init__(
                 ):
                     self.config.max_output_tokens = self.model_info['max_tokens']
 
+        # if using a custom tokenizer, make sure it's loaded and accessible in the format expected by litellm
+        if self.config.custom_tokenizer is not None:
+            self.tokenizer = create_pretrained_tokenizer(self.config.custom_tokenizer)
+        else:
+            self.tokenizer = None
+
+        # set up the completion function
         self._completion = partial(
             litellm_completion,
             model=self.config.model,
@@ -185,6 +198,14 @@ def wrapper(*args, **kwargs):
             # log the entire LLM prompt
             self.log_prompt(messages)
 
+            # find out if we have too many tokens
+            token_count = self.get_token_count(messages)
+            max_input_tokens = self.config.max_input_tokens
+            if token_count > max_input_tokens:
+                raise TokenLimitExceededError(
+                    f'Token limit exceeded: {token_count} > {max_input_tokens}'
+                )
+
             if self.is_caching_prompt_active():
                 # Anthropic-specific prompt caching
                 if 'claude-3' in self.config.model:
@@ -339,15 +360,32 @@ def get_token_count(self, messages):
         """Get the number of tokens in a list of messages.
 
         Args:
-            messages (list): A list of messages.
+            messages (list): A list of messages, either as a list of dicts or as a list of Message objects.
 
         Returns:
             int: The number of tokens.
         """
+        # convert Message objects to dicts, litellm expects dicts
+        if (
+            isinstance(messages, list)
+            and len(messages) > 0
+            and isinstance(messages[0], Message)
+        ):
+            messages = self.format_messages_for_llm(messages)
+
+        # try to get the token count with the default litellm tokenizers
+        # or the custom tokenizer attribute if set for this LLM configuration
         try:
-            return litellm.token_counter(model=self.config.model, messages=messages)
-        except Exception:
+            return litellm.token_counter(
+                model=self.config.model,
+                messages=messages,
+                custom_tokenizer=self.tokenizer,
+            )
+        except Exception as e:
             # TODO: this is to limit logspam in case token count is not supported
+            logger.error(
+                f'Error getting token count for\n model {self.config.model}\ncustom_tokenizer: {self.config.custom_tokenizer}\n{e}'
+            )
             return 0
 
     def _is_local(self):
@@ -426,3 +464,87 @@ def format_messages_for_llm(self, messages: Message | list[Message]) -> list[dic
 
         # let pydantic handle the serialization
         return [message.model_dump() for message in messages]
+
+    def embed_event(self, event: Event) -> np.ndarray:
+        """
+        Embeds a single event using the embedding model.
+
+        Args:
+            event (Event): The event to embed.
+
+        Returns:
+            np.ndarray: The embedding vector of the event.
+        """
+        # Convert the event to a string representation
+        event_str = event_to_memory(event, -1)
+        # Get the embedding
+        embedding_response = litellm_embedding(
+            model=self.config.embedding_model,
+            input=event_str,
+            custom_llm_provider=self.config.custom_llm_provider,
+            api_key=self.config.api_key,
+            base_url=self.config.base_url,
+            api_version=self.config.api_version,
+            input_cost_per_token=self.config.input_cost_per_token,
+            output_cost_per_token=self.config.output_cost_per_token,
+        )
+        embedding = embedding_response['data'][0]['embedding']
+        return np.array(embedding)
+
+    def embed_history(self, history: list[Event]) -> list[np.ndarray]:
+        """
+        Embeds a list of events.
+
+        Args:
+            history (list[Event]): The list of events to embed.
+
+        Returns:
+            list[np.ndarray]: A list of embedding vectors.
+        """
+        embeddings = []
+        for event in history:
+            embedding = self.embed_event(event)
+            embeddings.append(embedding)
+        return embeddings
+
+    def search(self, query: str, history: list[Event], top_k: int = 5) -> list[Event]:
+        """
+        Recalls the most similar events based on the query.
+
+        Args:
+            query (str): The query string.
+            embeddings (list[np.ndarray]): The list of embedded vectors.
+            history (list[Event]): The corresponding list of events.
+            top_k (int, optional): The number of top similar events to retrieve. Defaults to 5.
+
+        Returns:
+            list[Event]: The list of recalled events.
+        """
+
+        # make sure history has been embedded
+        embeddings = self.embed_history(history)
+
+        # Embed the query
+        query_embedding_response = litellm_embedding(
+            model=self.config.embedding_model,
+            input=query,
+            custom_llm_provider=self.config.custom_llm_provider,
+            api_key=self.config.api_key,
+            base_url=self.config.base_url,
+            api_version=self.config.api_version,
+            input_cost_per_token=self.config.input_cost_per_token,
+            output_cost_per_token=self.config.output_cost_per_token,
+        )
+        query_embedding = np.array(
+            query_embedding_response['data'][0]['embedding']
+        ).reshape(1, -1)
+
+        # Compute cosine similarity
+        similarity_scores = np.dot(query_embedding, embeddings)
+
+        # Get the top_k indices
+        top_indices = similarity_scores.argsort()[-top_k:][::-1]
+
+        # Retrieve the corresponding events
+        recalled_events = [history[i] for i in top_indices]
+        return recalled_events
diff --git a/openhands/memory/__init__.py b/openhands/memory/__init__.py
index 0ce208cef581..100ea245fac5 100644
--- a/openhands/memory/__init__.py
+++ b/openhands/memory/__init__.py
@@ -1,5 +1,3 @@
-from openhands.memory.condenser import MemoryCondenser
-from openhands.memory.history import ShortTermHistory
 from openhands.memory.memory import LongTermMemory
-
-__all__ = ['LongTermMemory', 'ShortTermHistory', 'MemoryCondenser']
+from openhands.memory.utils import parse_summary_response
+__all__ = ['LongTermMemory', 'parse_summary_response']
diff --git a/openhands/memory/base_memory.py b/openhands/memory/base_memory.py
new file mode 100644
index 000000000000..25de017ab1bb
--- /dev/null
+++ b/openhands/memory/base_memory.py
@@ -0,0 +1,26 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class Memory(ABC):
+    """Abstract base class for all memory modules."""
+
+    @abstractmethod
+    def to_dict(self) -> dict[str, Any]:
+        """Convert the memory module to a dictionary of individual memories."""
+        pass
+
+    # @abstractmethod
+    # def from_dict(self, data: dict[str, Any]) -> None:
+    #    """Load the memory module from a dictionary of individual memories."""
+    #    pass
+
+    @abstractmethod
+    def __str__(self) -> str:
+        """String representation of the memory module."""
+        pass
+
+    @abstractmethod
+    def reset(self) -> None:
+        """Reset the memory module."""
+        pass
diff --git a/openhands/memory/condenser.py b/openhands/memory/condenser.py
index bd48d184305c..d3b283dd677f 100644
--- a/openhands/memory/condenser.py
+++ b/openhands/memory/condenser.py
@@ -1,24 +1,134 @@
+from litellm.types.utils import ModelResponse
+
+from openhands.core.exceptions import SummarizeError
 from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import Message, TextContent
+from openhands.events.action import AgentSummarizeAction
 from openhands.llm.llm import LLM
+from openhands.memory.utils import parse_summary_response
+from openhands.utils.prompt import PromptManager
 
 
 class MemoryCondenser:
-    def condense(self, summarize_prompt: str, llm: LLM):
-        """Attempts to condense the memory by using the llm
+    def __init__(self, llm: LLM, prompt_manager: PromptManager):
+        self.llm = llm
+        self.prompt_manager = prompt_manager
 
-        Parameters:
-        - llm (LLM): llm to be used for summarization
+        # just easier to read
+        self.context_window = llm.config.max_input_tokens
 
-        Raises:
-        - Exception: the same exception as it got from the llm or processing the response
+    def condense(
+        self,
+        messages: list[Message],
+    ) -> AgentSummarizeAction:
         """
-        try:
-            messages = [{'content': summarize_prompt, 'role': 'user'}]
-            resp = llm.completion(messages=messages)
-            summary_response = resp['choices'][0]['message']['content']
-            return summary_response
-        except Exception as e:
-            logger.error('Error condensing thoughts: %s', str(e), exc_info=False)
-
-            # TODO If the llm fails with ContextWindowExceededError, we can try to condense the memory chunk by chunk
-            raise
+        Condenses a list of messages using the LLM and returns a summary action.
+
+        Args:
+            messages (list[Message]): The list of messages to condense.
+
+        Returns:
+            AgentSummarizeAction: The summary action containing the condensed summary.
+        """
+        assert (
+            self.context_window is not None and self.context_window > 2000
+        ), 'context window must be a number over 2000'
+
+        # don't condense if under the token limit
+        total_token_count = self.llm.get_token_count(messages)
+        if total_token_count < self.context_window:
+            logger.debug(
+                f'Not condensing messages because token count ({total_token_count}) is less than max input tokens ({self.context_window})'
+            )
+            return AgentSummarizeAction(end_id=-1)
+
+        # calculate safe token limit for processing (e.g. 80% of context window)
+        safe_token_limit = int(
+            self.context_window * self.llm.config.message_summary_warning_level
+        )
+
+        # collect condensable messages with their IDs and token counts
+        condensable_messages: list[tuple[Message, int]] = [
+            (msg, self.llm.get_token_count([msg.model_dump()]))
+            for msg in messages
+            if msg.condensable
+        ]
+
+        if len(condensable_messages) <= 1:
+            # prevents potential infinite loop of summarizing the same message repeatedly
+            raise SummarizeError(
+                f"Summarize error: tried to run summarize, but couldn't find enough messages to compress [len={len(condensable_messages)} <= 1]"
+            )
+
+        # track the very first message's id - this will be our start_id
+        first_message_id = condensable_messages[0][0].event_id
+
+        # create chunks that fit within safe_token_limit
+        chunks: list[list[Message]] = []
+        current_chunk: list[Message] = []
+        current_chunk_tokens = 0
+
+        for msg, token_count in condensable_messages:
+            if current_chunk_tokens + token_count > safe_token_limit:
+                if current_chunk:  # save current chunk if not empty, it's done
+                    chunks.append(current_chunk)
+
+                # start a new chunk
+                current_chunk = [msg]
+                current_chunk_tokens = token_count
+            else:
+                # add to current chunk
+                current_chunk.append(msg)
+                current_chunk_tokens += token_count
+
+        # add the last chunk
+        if current_chunk:
+            chunks.append(current_chunk)
+
+        # process chunks
+        final_summary = None
+        # track the last real message id (note: not summary actions)
+        last_real_message_id = condensable_messages[-1][0].event_id
+
+        for i, chunk in enumerate(chunks):
+            if final_summary:
+                # prepend previous summary to next chunk
+                summary_message = Message(
+                    role='user',
+                    content=[TextContent(text=f'Previous summary:\n{final_summary}')],
+                    condensable=True,
+                    # Note: summary messages don't have an event_id
+                    event_id=-1,
+                )
+                chunk.insert(0, summary_message)
+
+            action_response = self._summarize_messages(chunk)
+            summary_action = parse_summary_response(action_response)
+            final_summary = summary_action.summary
+
+        # create final summary action
+        assert final_summary is not None, 'final summary must not be None here'
+        return AgentSummarizeAction(
+            summary=final_summary,
+            start_id=first_message_id,
+            end_id=last_real_message_id,
+        )
+
+    def _summarize_messages(self, message_sequence_to_summarize: list[Message]) -> str:
+        """Summarize a message sequence using LLM"""
+        # build the message to send
+        self.prompt_manager.conversation_history = self.llm.format_messages_for_llm(
+            message_sequence_to_summarize
+        )
+        summarize_prompt = self.prompt_manager.summarize_message
+        message = Message(role='system', content=[TextContent(text=summarize_prompt)])
+        serialized_message = message.model_dump()
+
+        response = self.llm.completion(
+            messages=[serialized_message],
+            temperature=0.2,
+        )
+
+        print(f'summarize_messages got response: {response}')
+        assert isinstance(response, ModelResponse), 'response must be a ModelResponse'
+        return response.choices[0].message.content
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
new file mode 100644
index 000000000000..f3f42d986267
--- /dev/null
+++ b/openhands/memory/conversation_memory.py
@@ -0,0 +1,115 @@
+from enum import Enum
+
+from openhands.controller.state.state import State
+from openhands.core.config.llm_config import LLMConfig
+from openhands.events.event import Event
+from openhands.events.serialization.event import event_to_dict
+from openhands.llm.llm import LLM
+from openhands.memory.base_memory import Memory
+
+
+class StorageType(Enum):
+    IN_MEMORY = 'in-memory'
+    VECTOR = 'vector'
+
+
+class ConversationMemory(Memory):
+    """Allows the agent to recall events from its entire history, with support for summarization and recall.
+
+    This class handles the summarized events (from state.summary['start_id] to state.summary['end_id'])
+    and slices the history to include only the events after the summary.
+    """
+
+    memory: list[Event]
+    memory_config: LLMConfig
+
+    def __init__(
+        self,
+        memory_config: LLMConfig,
+        state: State,
+    ) -> None:
+        """
+        Initialize ConversationMemory with a reference to history and long-term memory.
+
+        Args:
+        - history: The history of the current agent conversation.
+        - llm_config: The LLM configuration.
+        - top_k: Number of top results to retrieve.
+        """
+        self.memory = []
+        self.memory_config = memory_config
+        # total messages in the conversation
+        # won't this always be the same as len(history)?
+        # core memory isn't counted here
+        self.total_message_count = 0
+        # of which hidden
+        self.hidden_message_count = 0
+
+        # init storage type
+        self.storage_type = StorageType.IN_MEMORY
+
+        # read itself from the runtime state
+        self.update(state)
+
+    def update(self, state: State) -> None:
+        """Updates the conversation memory from a new runtime state."""
+        # this isn't actually state.history
+        # if it has a summary, the messages from summary.start_id to summary.end_id are not included,
+        # but replaced with a single summary event
+        if state and state.summary:
+            self.memory = (
+                state.history[: state.summary.start_id]
+                + [state.summary]
+                + state.history[state.summary.end_id :]
+            )
+            self.hidden_message_count = state.summary.end_id - state.summary.start_id
+        else:
+            self.memory = state.history  # this is not cool but let it be for now
+            self.hidden_message_count = 0
+
+    def reset(self) -> None:
+        """Resets the conversation memory."""
+        self.memory = []
+        self.total_message_count = 0
+        self.hidden_message_count = 0
+
+    def update_summary(self, summary: str, hidden_count: int) -> None:
+        """Updates the memory with a new summary and tracks hidden messages."""
+        self.hidden_message_count = hidden_count
+
+    def to_dict(self) -> dict:
+        # return a dict with key = event.id, value = event.to_dict()
+        return {event.id: event_to_dict(event) for event in self.memory}
+
+    def __str__(self) -> str:
+        return f'ConversationMemory with {len(self.memory)} total events'
+
+    def search(self, llm: LLM, query: str, top_k: int = 5) -> list:
+        """Searches the conversation memory for relevant messages."""
+        if not self.memory or not query:
+            return []
+
+        if self.storage_type == StorageType.IN_MEMORY:
+            # use the llm.py search to find relevant messages
+            recalled_events = llm.search(query=query, history=self.memory, top_k=top_k)
+        else:
+            raise ValueError(f'Unsupported storage type: {self.storage_type}')
+
+        return recalled_events
+
+    def recall_memory(
+        self, llm: LLM, state: State, query: str, top_k: int = 5
+    ) -> list[Event]:
+        """
+        Get the most similar events based on the query.
+
+        Args:
+            query: The query string for semantic search.
+            top_k: Number of top results to retrieve.
+
+        Returns:
+            A list of semantically similar events.
+        """
+        # get the most similar events based on the query
+        # for testing recall with litellm
+        return llm.search(query, state.history, top_k)
diff --git a/openhands/memory/core_memory.py b/openhands/memory/core_memory.py
new file mode 100644
index 000000000000..272c7a6bc776
--- /dev/null
+++ b/openhands/memory/core_memory.py
@@ -0,0 +1,60 @@
+from openhands.memory.base_memory import Memory
+
+
+class CoreMemory(Memory):
+    """Memory contents to be inserted in the prompt. This includes key facts and context
+    that the LLM needs to maintain about its current tasks and capabilities."""
+
+    def __init__(self, limit: int = 1500):
+        super().__init__()
+        self.char_limit = limit
+        self.blocks = {
+            'personality': [],  # agent's personality traits and capabilities
+            'task_context': [],  # important context about current tasks
+        }
+
+    def add_block(self, category: str, content: str) -> bool:
+        """Add a memory block to a specific category.
+        Returns True if successful, False if would exceed limit."""
+        if category not in self.blocks:
+            raise ValueError(
+                f'Invalid category: {category}. Must be one of {list(self.blocks.keys())}'
+            )
+
+        # Calculate total size with new content
+        potential_content = self.format_blocks() + f'\n- {content}'
+        if len(potential_content) > self.char_limit:
+            return False
+
+        self.blocks[category].append(content)
+        return True
+
+    def get_blocks(
+        self, category: str | None = None
+    ) -> dict[str, list[str]] | list[str]:
+        """Get memory blocks, optionally filtered by category."""
+        if category:
+            return self.blocks.get(category, [])
+        return self.blocks
+
+    def format_blocks(self) -> str:
+        """Format memory blocks for inclusion in the system prompt."""
+        formatted = []
+
+        for category, items in self.blocks.items():
+            if items:
+                formatted.append(f"\n{category.replace('_', ' ').title()}:")
+                formatted.extend([f'- {item}' for item in items])
+
+        return '\n'.join(formatted)
+
+    def __str__(self) -> str:
+        return self.format_blocks()
+
+    def to_dict(self) -> dict:
+        return {category: items for category, items in self.blocks.items()}
+
+    def reset(self) -> None:
+        """Reset all memory blocks."""
+        for category in self.blocks:
+            self.blocks[category] = []
diff --git a/openhands/memory/history.py b/openhands/memory/history.py
deleted file mode 100644
index 1e4cfb8b5f05..000000000000
--- a/openhands/memory/history.py
+++ /dev/null
@@ -1,224 +0,0 @@
-from typing import ClassVar, Iterable
-
-from openhands.core.logger import openhands_logger as logger
-from openhands.events.action.action import Action
-from openhands.events.action.agent import (
-    AgentDelegateAction,
-    ChangeAgentStateAction,
-)
-from openhands.events.action.empty import NullAction
-from openhands.events.action.message import MessageAction
-from openhands.events.event import Event, EventSource
-from openhands.events.observation.agent import AgentStateChangedObservation
-from openhands.events.observation.delegate import AgentDelegateObservation
-from openhands.events.observation.empty import NullObservation
-from openhands.events.observation.observation import Observation
-from openhands.events.serialization.event import event_to_dict
-from openhands.events.stream import EventStream
-from openhands.events.utils import get_pairs_from_events
-
-
-class ShortTermHistory(list[Event]):
-    """A list of events that represents the short-term memory of the agent.
-
-    This class provides methods to retrieve and filter the events in the history of the running agent from the event stream.
-    """
-
-    start_id: int
-    end_id: int
-    _event_stream: EventStream
-    delegates: dict[tuple[int, int], tuple[str, str]]
-    filter_out: ClassVar[tuple[type[Event], ...]] = (
-        NullAction,
-        NullObservation,
-        ChangeAgentStateAction,
-        AgentStateChangedObservation,
-    )
-
-    def __init__(self):
-        super().__init__()
-        self.start_id = -1
-        self.end_id = -1
-        self.delegates = {}
-
-    def set_event_stream(self, event_stream: EventStream):
-        self._event_stream = event_stream
-
-    def get_events_as_list(self, include_delegates: bool = False) -> list[Event]:
-        """Return the history as a list of Event objects."""
-        return list(self.get_events(include_delegates=include_delegates))
-
-    def get_events(
-        self,
-        reverse: bool = False,
-        include_delegates: bool = False,
-        include_hidden=False,
-    ) -> Iterable[Event]:
-        """Return the events as a stream of Event objects."""
-        # TODO handle AgentRejectAction, if it's not part of a chunk ending with an AgentDelegateObservation
-        # or even if it is, because currently we don't add it to the summary
-
-        # iterate from start_id to end_id, or reverse
-        start_id = self.start_id if self.start_id != -1 else 0
-        end_id = (
-            self.end_id
-            if self.end_id != -1
-            else self._event_stream.get_latest_event_id()
-        )
-
-        for event in self._event_stream.get_events(
-            start_id=start_id,
-            end_id=end_id,
-            reverse=reverse,
-            filter_out_type=self.filter_out,
-        ):
-            if not include_hidden and hasattr(event, 'hidden') and event.hidden:
-                continue
-            # TODO add summaries
-            # and filter out events that were included in a summary
-
-            # filter out the events from a delegate of the current agent
-            if not include_delegates and not any(
-                # except for the delegate action and observation themselves, currently
-                # AgentDelegateAction has id = delegate_start
-                # AgentDelegateObservation has id = delegate_end
-                delegate_start < event.id < delegate_end
-                for delegate_start, delegate_end in self.delegates.keys()
-            ):
-                yield event
-            elif include_delegates:
-                yield event
-
-    def get_last_action(self, end_id: int = -1) -> Action | None:
-        """Return the last action from the event stream, filtered to exclude unwanted events."""
-        # from end_id in reverse, find the first action
-        end_id = self._event_stream.get_latest_event_id() if end_id == -1 else end_id
-
-        last_action = next(
-            (
-                event
-                for event in self._event_stream.get_events(
-                    end_id=end_id, reverse=True, filter_out_type=self.filter_out
-                )
-                if isinstance(event, Action)
-            ),
-            None,
-        )
-
-        return last_action
-
-    def get_last_observation(self, end_id: int = -1) -> Observation | None:
-        """Return the last observation from the event stream, filtered to exclude unwanted events."""
-        # from end_id in reverse, find the first observation
-        end_id = self._event_stream.get_latest_event_id() if end_id == -1 else end_id
-
-        last_observation = next(
-            (
-                event
-                for event in self._event_stream.get_events(
-                    end_id=end_id, reverse=True, filter_out_type=self.filter_out
-                )
-                if isinstance(event, Observation)
-            ),
-            None,
-        )
-
-        return last_observation
-
-    def get_last_user_message(self) -> str:
-        """Return the content of the last user message from the event stream."""
-        last_user_message = next(
-            (
-                event.content
-                for event in self._event_stream.get_events(reverse=True)
-                if isinstance(event, MessageAction) and event.source == EventSource.USER
-            ),
-            None,
-        )
-
-        return last_user_message if last_user_message is not None else ''
-
-    def get_last_agent_message(self) -> str:
-        """Return the content of the last agent message from the event stream."""
-        last_agent_message = next(
-            (
-                event.content
-                for event in self._event_stream.get_events(reverse=True)
-                if isinstance(event, MessageAction)
-                and event.source == EventSource.AGENT
-            ),
-            None,
-        )
-
-        return last_agent_message if last_agent_message is not None else ''
-
-    def get_last_events(self, n: int) -> list[Event]:
-        """Return the last n events from the event stream."""
-        # dummy agent is using this
-        # it should work, but it's not great to store temporary lists now just for a test
-        end_id = self._event_stream.get_latest_event_id()
-        start_id = max(0, end_id - n + 1)
-
-        return list(
-            event
-            for event in self._event_stream.get_events(
-                start_id=start_id,
-                end_id=end_id,
-                filter_out_type=self.filter_out,
-            )
-        )
-
-    def has_delegation(self) -> bool:
-        for event in self._event_stream.get_events():
-            if isinstance(event, AgentDelegateObservation):
-                return True
-        return False
-
-    def on_event(self, event: Event):
-        if not isinstance(event, AgentDelegateObservation):
-            return
-
-        logger.debug('AgentDelegateObservation received')
-
-        # figure out what this delegate's actions were
-        # from the last AgentDelegateAction to this AgentDelegateObservation
-        # and save their ids as start and end ids
-        # in order to use later to exclude them from parent stream
-        # or summarize them
-        delegate_end = event.id
-        delegate_start = -1
-        delegate_agent: str = ''
-        delegate_task: str = ''
-        for prev_event in self._event_stream.get_events(
-            end_id=event.id - 1, reverse=True
-        ):
-            if isinstance(prev_event, AgentDelegateAction):
-                delegate_start = prev_event.id
-                delegate_agent = prev_event.agent
-                delegate_task = prev_event.inputs.get('task', '')
-                break
-
-        if delegate_start == -1:
-            logger.error(
-                f'No AgentDelegateAction found for AgentDelegateObservation with id={delegate_end}'
-            )
-            return
-
-        self.delegates[(delegate_start, delegate_end)] = (delegate_agent, delegate_task)
-        logger.debug(
-            f'Delegate {delegate_agent} with task {delegate_task} ran from id={delegate_start} to id={delegate_end}'
-        )
-
-    # TODO remove me when unnecessary
-    # history is now available as a filtered stream of events, rather than list of pairs of (Action, Observation)
-    # we rebuild the pairs here
-    # for compatibility with the existing output format in evaluations
-    def compatibility_for_eval_history_pairs(self) -> list[tuple[dict, dict]]:
-        history_pairs = []
-
-        for action, observation in get_pairs_from_events(
-            self.get_events_as_list(include_delegates=True)
-        ):
-            history_pairs.append((event_to_dict(action), event_to_dict(observation)))
-
-        return history_pairs
diff --git a/openhands/memory/memory.py b/openhands/memory/memory.py
index 9d83cc9cdc8c..4fa823adc520 100644
--- a/openhands/memory/memory.py
+++ b/openhands/memory/memory.py
@@ -54,10 +54,10 @@ def __init__(
 
         # instantiate the index
         self.index = VectorStoreIndex.from_vector_store(vector_store, self.embed_model)
-        self.thought_idx = 0
 
         # initialize the event stream
         self.event_stream = event_stream
+        self._events_to_docs()
 
         # max of threads to run the pipeline
         self.memory_max_threads = agent_config.memory_max_threads
@@ -85,18 +85,17 @@ def add_event(self, event: Event):
             event_type = 'observation'
             event_id = event_data['observation']
 
-        # create a Document instance for the event
+        # create a Document instance for the event using event.id
         doc = Document(
             text=json.dumps(event_data),
-            doc_id=str(self.thought_idx),
+            doc_id=event.id,
             extra_info={
                 'type': event_type,
                 'id': event_id,
-                'idx': self.thought_idx,
+                'event_id': event.id,
             },
         )
-        self.thought_idx += 1
-        logger.debug('Adding %s event to memory: %d', event_type, self.thought_idx)
+        logger.debug('Adding %s event to memory with doc_id: %s', event_type, event.id)
         self._add_document(document=doc)
 
     def _add_document(self, document: 'Document'):
@@ -159,18 +158,17 @@ def _events_to_docs(self) -> list['Document']:
                     event_type = 'observation'
                     event_id = event_data['observation']
 
-                # create a Document instance for the event
+                # create a Document instance for the event using event.id
                 doc = Document(
                     text=json.dumps(event_data),
-                    doc_id=str(self.thought_idx),
+                    doc_id=event.id,
                     extra_info={
                         'type': event_type,
                         'id': event_id,
-                        'idx': self.thought_idx,
+                        'event_id': event.id,
                     },
                 )
                 documents.append(doc)
-                self.thought_idx += 1
             except (json.JSONDecodeError, KeyError, ValueError) as e:
                 logger.warning(f'Failed to process event: {e}')
                 continue
diff --git a/openhands/memory/utils.py b/openhands/memory/utils.py
new file mode 100644
index 000000000000..f49479ec4fb1
--- /dev/null
+++ b/openhands/memory/utils.py
@@ -0,0 +1,39 @@
+import openhands.core.utils.json as json
+from openhands.core.exceptions import (
+    InvalidSummaryResponseError,
+    LLMMalformedActionError,
+    LLMResponseError,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.agent import AgentSummarizeAction
+from openhands.events.event import EventSource
+from openhands.events.serialization.event import action_from_dict
+
+
+def parse_summary_response(response: str) -> AgentSummarizeAction:
+    """
+    Parses a JSON summary of events.
+
+    Parameters:
+    - response: The response string to be parsed
+
+    Returns:
+    - The summary action output by the model
+    """
+    try:
+        action_dict = json.loads(response)
+        action = action_from_dict(action_dict)
+        if action is None or not isinstance(action, AgentSummarizeAction):
+            error_message = f'Expected a summarize action, but the response got {str(type(action)) if action else None}'
+            logger.error(error_message)
+            raise InvalidSummaryResponseError(error_message)
+        action._source = EventSource.AGENT  # type: ignore
+        action.summary = (
+            action.summarized_actions + '\n' + action.summarized_observations
+        )
+    except (LLMResponseError, LLMMalformedActionError) as e:
+        logger.error(f'Failed to parse summary response: {str(e)}')
+        raise InvalidSummaryResponseError(
+            f'Failed to parse the response: {str(e)}'
+        ) from e
+    return action
diff --git a/openhands/server/session/session.py b/openhands/server/session/session.py
index 512be02fabc2..6052643bb646 100644
--- a/openhands/server/session/session.py
+++ b/openhands/server/session/session.py
@@ -78,6 +78,7 @@ async def _initialize_agent(self, data: dict):
         self.agent_session.event_stream.add_event(
             AgentStateChangedObservation('', AgentState.LOADING), EventSource.AGENT
         )
+
         # Extract the agent-relevant arguments from the request
         args = {key: value for key, value in data.get('args', {}).items()}
         agent_cls = args.get(ConfigType.AGENT, self.config.default_agent)
@@ -102,6 +103,7 @@ async def _initialize_agent(self, data: dict):
 
         # TODO: override other LLM config & agent config groups (#2075)
 
+        # set up the required arguments for the agent
         llm = LLM(config=self.config.get_llm_config_from_agent(agent_cls))
         agent_config = self.config.get_agent_config(agent_cls)
         agent = Agent.get_cls(agent_cls)(llm, agent_config)
diff --git a/openhands/utils/embeddings.py b/openhands/utils/embeddings.py
index 07ee2d27f52d..ef4cb3b8c4f5 100644
--- a/openhands/utils/embeddings.py
+++ b/openhands/utils/embeddings.py
@@ -101,6 +101,12 @@ def get_embedding_model(strategy: str, llm_config: LLMConfig) -> 'BaseEmbedding'
                 azure_endpoint=llm_config.base_url,
                 api_version=llm_config.api_version,
             )
+        elif strategy == 'voyage':
+            from llama_index.legacy.embeddings.voyageai import VoyageEmbedding
+
+            return VoyageEmbedding(
+                model='voyageai/voyage-code-2',
+            )
         elif (strategy is not None) and (strategy.lower() == 'none'):
             # TODO: this works but is not elegant enough. The incentive is when
             # an agent using embeddings is not used, there is no reason we need to
diff --git a/openhands/utils/prompt.py b/openhands/utils/prompt.py
index 8b9dd81e8bfd..2c9ed30bca80 100644
--- a/openhands/utils/prompt.py
+++ b/openhands/utils/prompt.py
@@ -1,6 +1,17 @@
+import importlib
 import os
+from inspect import signature
+from pathlib import Path
+from typing import Any
 
-from jinja2 import Template
+import yaml
+from jinja2 import (
+    Environment,
+    FileSystemLoader,
+    Template,
+    TemplateNotFound,
+    select_autoescape,
+)
 
 from openhands.utils.microagent import MicroAgent
 
@@ -17,34 +28,169 @@ class PromptManager:
         prompt_dir (str): Directory containing prompt templates.
         agent_skills_docs (str): Documentation of agent skills.
         micro_agent (MicroAgent | None): Micro-agent, if specified.
+        conversation_history (list[dict[str, Any]]): History of conversations.
+        core_memory (str): Core memory storage.
+        env (Environment): Jinja2 environment for template rendering.
+        templates (dict[str, Template]): Loaded templates.
+        available_skills (list[str]): List of available skills from YAML configuration.
     """
 
     def __init__(
         self,
-        prompt_dir: str,
-        agent_skills_docs: str,
+        prompt_dir: str | Path,
         micro_agent: MicroAgent | None = None,
     ):
-        self.prompt_dir: str = prompt_dir
-        self.agent_skills_docs: str = agent_skills_docs
+        """Initialize PromptManager with template directories and agent configuration.
 
-        self.system_template: Template = self._load_template('system_prompt')
-        self.user_template: Template = self._load_template('user_prompt')
-        self.micro_agent: MicroAgent | None = micro_agent
+        The system supports two types of templates:
+        1. Simple .md files - For basic customization with variable substitution
+        2. Advanced .j2 files - For complex templates using Jinja2 features
+
+        Templates are loaded in this order (later ones override earlier ones):
+        1. Default templates from prompt_dir
+        2. Custom templates from custom_prompt_dir
+        3. .j2 files take precedence over .md files with the same base nam
+        """
+
+        self.prompt_dir = os.path.abspath(prompt_dir)
+        self.micro_agent = micro_agent
+        self.conversation_history: list[dict[str, Any]] = []
+        self.core_memory: str = ''
+
+        # load available skills from YAML
+        yaml_path = os.path.join(prompt_dir, 'agent.yaml')
+        if os.path.exists(yaml_path):
+            with open(yaml_path, 'r') as f:
+                config = yaml.safe_load(f)
+
+            custom_templates_dir = config.get('custom_templates_dir', None)
+            if custom_templates_dir:
+                # custom templates directory is an absolute path or relative to the script location
+                custom_templates_dir = os.path.abspath(custom_templates_dir)
+
+                # prioritize custom_templates_dir over the default templates directory
+                self.env = Environment(
+                    loader=FileSystemLoader([custom_templates_dir, self.prompt_dir]),
+                    autoescape=select_autoescape(['j2', 'md']),
+                    trim_blocks=True,
+                    lstrip_blocks=True,
+                )
+
+            self._system_template = self._load_template(
+                config['template']['system_prompt']
+            )
+            self._agent_skills_template = self._load_template(
+                config['template']['agent_skills']
+            )
+            self._examples_template = self._load_template(
+                config['template']['examples']
+            )
+            self._user_template = self._load_template(config['template']['user_prompt'])
+
+            self.available_skills = config['agent_skills']['available_skills']
+        else:
+            # no agent.yaml file found, use the default templates
+            self.env = Environment(loader=FileSystemLoader(prompt_dir))
+
+            self._system_template = self._load_template('system_prompt')
+            self._agent_skills_template = self._load_template('agent_skills')
+            self._user_template = self._load_template('user_prompt')
+            self._examples_template = self._load_template('examples')
+
+            self.available_skills = []  # FIXME: default to empty list if YAML not found
+
+        # TODO: agent config should have a tool use enabled or disabled
+        # and we can use that to conditionally load the tools variant of agentskills
+
+        # Load all templates
+        self.templates = self._load_templates()
 
     def _load_template(self, template_name: str) -> Template:
-        template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
-        if not os.path.exists(template_path):
-            raise FileNotFoundError(f'Prompt file {template_path} not found')
-        with open(template_path, 'r') as file:
-            return Template(file.read())
+        """Load a template from the environment."""
+        # use the jinja2 environment to load the template
+        try:
+            return self.env.get_template(f'{template_name}.j2')
+        except TemplateNotFound:
+            # try to load from the prompt_dir
+            template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
+            if not os.path.exists(template_path):
+                raise FileNotFoundError(f'Prompt file {template_path} not found')
+            with open(template_path, 'r') as file:
+                return Template(file.read())
+
+    def _load_templates(self) -> dict[str, Template]:
+        """Load templates with appropriate extensions based on complexity.
+
+        For each template name (e.g. 'system_prompt'), checks for files in this order:
+        1. {name}.j2 in custom_prompt_dir (if provided)
+        2. {name}.md in custom_prompt_dir (if provided)
+        3. {name}.j2 in prompt_dir
+        4. {name}.md in prompt_dir
+
+        Returns:
+            A dictionary mapping template names to their loaded Template objects.
+        """
+        templates = {}
+
+        # Template names and their default types
+        template_configs = {
+            # Complex templates that typically need Jinja features
+            'system_prompt': '.j2',
+            'summarize_prompt': '.j2',
+            # Simple templates that work well as markdown
+            'user_prompt': '.md',
+            'examples': '.md',
+        }
+
+        for name, default_ext in template_configs.items():
+            # Try loading template with either extension
+            template = None
+            for ext in ['.j2', '.md']:
+                try:
+                    template = self.env.get_template(f'{name}{ext}')
+                    break
+                except TemplateNotFound:
+                    continue
+
+            # If no template found, create empty one
+            if template is None:
+                print(f'No template found for {name}, using empty template')
+                template = self.env.from_string('')
+
+            templates[name] = template
+
+        return templates
+
+    def get_template_variables(self) -> dict[str, Any]:
+        """Get the current template variables.
+
+        Returns:
+            Dictionary of variables available to templates.
+        """
+        return {
+            'core_memory': self.core_memory,
+            'conversation_history': self.conversation_history,
+            'micro_agent': self.micro_agent.content if self.micro_agent else None,
+        }
 
     @property
     def system_message(self) -> str:
-        rendered = self.system_template.render(
-            agent_skills_docs=self.agent_skills_docs,
+        """Render the system message template."""
+        # render the agent_skills.j2 template
+
+        self.env.globals['get_skill_docstring'] = self._get_skill_docstring
+        rendered_docs = self._agent_skills_template.render(
+            available_skills=self.available_skills
+        )
+        rendered = self._system_template.render(
+            agent_skills_docs=rendered_docs,
         ).strip()
         return rendered
+        # return (
+        #    self.templates['system_prompt']
+        #    .render(**self.get_template_variables())
+        #    .strip()
+        # )
 
     @property
     def initial_user_message(self) -> str:
@@ -57,7 +203,58 @@ def initial_user_message(self) -> str:
         These additional context will convert the current generic agent
         into a more specialized agent that is tailored to the user's task.
         """
-        rendered = self.user_template.render(
-            micro_agent=self.micro_agent.content if self.micro_agent else None
+        # this should render the examples.j2 template first, then the user_prompt.j2 template
+        rendered_examples = self._examples_template.render()
+        rendered = self._user_template.render(
+            examples=rendered_examples,
+            micro_agent=self.micro_agent.content if self.micro_agent else None,
         )
         return rendered.strip()
+
+        # return (
+        #    self.templates['user_prompt']
+        #    .render(**self.get_template_variables())
+        #    .strip()
+        # )
+
+    @property
+    def summarize_message(self) -> str:
+        """Render the summarize message template."""
+        return (
+            self.templates['summarize_prompt']
+            .render(**self.get_template_variables())
+            .strip()
+        )
+
+    def _get_skill_docstring(self, skill_name: str) -> str:
+        """Retrieves the docstring of a skill function.
+
+        Args:
+            skill_name: The name of the skill in the format 'module:function'.
+
+        Returns:
+            A formatted string containing the function signature and docstring.
+        """
+        module_name, function_name = skill_name.split(':')
+        try:
+            module = importlib.import_module(
+                f'openhands.runtime.plugins.agent_skills.{module_name}'
+            )
+
+            # find the function
+            agent_skill_fn = getattr(module, function_name)
+
+            # get the function signature with parameter names, types and return type
+            fn_signature = f'{agent_skill_fn.__name__}' + str(signature(agent_skill_fn))
+
+            doc = agent_skill_fn.__doc__
+
+            # remove indentation from docstring and extra empty lines
+            doc = '\n'.join(filter(None, map(lambda x: x.strip(), doc.split('\n'))))
+
+            # now add a consistent 4 indentation
+            doc = '\n'.join(map(lambda x: ' ' * 4 + x, doc.split('\n')))
+            return f'{fn_signature}\n{doc}'
+        except (ImportError, AttributeError) as e:
+            print(e)
+            return f'Documentation not found for skill: {skill_name}'
diff --git a/poetry.lock b/poetry.lock
index 20316ac24fe7..a424c3a39af1 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -3924,72 +3924,88 @@ tokenizers = "*"
 extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"]
 proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"]
 
+[[package]]
+name = "llama-cloud"
+version = "0.1.4"
+description = ""
+optional = false
+python-versions = "<4,>=3.8"
+files = [
+    {file = "llama_cloud-0.1.4-py3-none-any.whl", hash = "sha256:cfca6c4e0a87468b922d732f0f313a2ecd3a8e0bf74382ee80829ce49dcbc5e0"},
+    {file = "llama_cloud-0.1.4.tar.gz", hash = "sha256:6f0155979bd96160951cb812c48836f1face037bc79ccfd8d185b18ef4c9faf8"},
+]
+
+[package.dependencies]
+httpx = ">=0.20.0"
+pydantic = ">=1.10"
+
 [[package]]
 name = "llama-index"
-version = "0.10.45.post1"
+version = "0.11.20"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index-0.10.45.post1-py3-none-any.whl", hash = "sha256:11ff93431a81f1eae5bb73092d194cfb66a36ea90f272ea145f20e6e4324c71c"},
-    {file = "llama_index-0.10.45.post1.tar.gz", hash = "sha256:0bd3dcdbbfa468c408ad2f9e839b60562367ec6563c13c9bddcd108309881447"},
+    {file = "llama_index-0.11.20-py3-none-any.whl", hash = "sha256:fc9e5e47e6da3610bc3b788d208bb782c03a342fd71e3b22b37abc83ecebe46e"},
+    {file = "llama_index-0.11.20.tar.gz", hash = "sha256:5e8e3fcb5af5b4e4525498b075ff0a54160b00bf0fc0b83801fc7faf1c8a8c1d"},
 ]
 
 [package.dependencies]
-llama-index-agent-openai = ">=0.1.4,<0.3.0"
-llama-index-cli = ">=0.1.2,<0.2.0"
-llama-index-core = "0.10.45"
-llama-index-embeddings-openai = ">=0.1.5,<0.2.0"
-llama-index-indices-managed-llama-cloud = ">=0.1.2,<0.2.0"
+llama-index-agent-openai = ">=0.3.4,<0.4.0"
+llama-index-cli = ">=0.3.1,<0.4.0"
+llama-index-core = ">=0.11.20,<0.12.0"
+llama-index-embeddings-openai = ">=0.2.4,<0.3.0"
+llama-index-indices-managed-llama-cloud = ">=0.3.0"
 llama-index-legacy = ">=0.9.48,<0.10.0"
-llama-index-llms-openai = ">=0.1.13,<0.2.0"
-llama-index-multi-modal-llms-openai = ">=0.1.3,<0.2.0"
-llama-index-program-openai = ">=0.1.3,<0.2.0"
-llama-index-question-gen-openai = ">=0.1.2,<0.2.0"
-llama-index-readers-file = ">=0.1.4,<0.2.0"
-llama-index-readers-llama-parse = ">=0.1.2,<0.2.0"
+llama-index-llms-openai = ">=0.2.10,<0.3.0"
+llama-index-multi-modal-llms-openai = ">=0.2.0,<0.3.0"
+llama-index-program-openai = ">=0.2.0,<0.3.0"
+llama-index-question-gen-openai = ">=0.2.0,<0.3.0"
+llama-index-readers-file = ">=0.2.0,<0.3.0"
+llama-index-readers-llama-parse = ">=0.3.0"
+nltk = ">3.8.1"
 
 [[package]]
 name = "llama-index-agent-openai"
-version = "0.2.9"
+version = "0.3.4"
 description = "llama-index agent openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_agent_openai-0.2.9-py3-none-any.whl", hash = "sha256:d7f0fd4c87124781acd783be603871f8808b1a3969e876a9c96e2ed0844d46ac"},
-    {file = "llama_index_agent_openai-0.2.9.tar.gz", hash = "sha256:debe86da6d9d983db32b445ddca7c798ac140fe59573bafded73595b3995f3d5"},
+    {file = "llama_index_agent_openai-0.3.4-py3-none-any.whl", hash = "sha256:3720ce9bb12417a99a3fe84e52cce23e762b13f88a2dfc4292c76f4df9b26b4a"},
+    {file = "llama_index_agent_openai-0.3.4.tar.gz", hash = "sha256:80e3408d97121bebca3fa3ffd14b51285870c1c3c73d4ee04d3d18cfe6040466"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.41,<0.11.0"
-llama-index-llms-openai = ">=0.1.5,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.9,<0.3.0"
 openai = ">=1.14.0"
 
 [[package]]
 name = "llama-index-cli"
-version = "0.1.13"
+version = "0.3.1"
 description = "llama-index cli"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_cli-0.1.13-py3-none-any.whl", hash = "sha256:5e05bc3ce55ee1bf6e5af7e87631a71d6b6cf8fc2af10cd3947b09b1bac6788d"},
-    {file = "llama_index_cli-0.1.13.tar.gz", hash = "sha256:86147ded4439fbab1d6c7c0d72e8f231d2935da9fdf5c9d3f0dde4f35d44aa59"},
+    {file = "llama_index_cli-0.3.1-py3-none-any.whl", hash = "sha256:2111fbb6973f5b1eabce0d6cca3986499f0f2f625b13d7f48269a49c64c027d4"},
+    {file = "llama_index_cli-0.3.1.tar.gz", hash = "sha256:1890dd687cf440f3651365a549e303363162c167b8efbd87a3aa10058d6d5c77"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.11.post1,<0.11.0"
-llama-index-embeddings-openai = ">=0.1.1,<0.2.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-embeddings-openai = ">=0.2.0,<0.3.0"
+llama-index-llms-openai = ">=0.2.0,<0.3.0"
 
 [[package]]
 name = "llama-index-core"
-version = "0.10.45"
+version = "0.11.20"
 description = "Interface between LLMs and your data"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_core-0.10.45-py3-none-any.whl", hash = "sha256:8c800c7221322b8e1cbbbc13325039b5fe3575d4b0e0be14ac9a8f1e5d14fee3"},
-    {file = "llama_index_core-0.10.45.tar.gz", hash = "sha256:f32d0448e7193ff45c8e84abd49493be030998fc8f1a0cab069387deef3e577c"},
+    {file = "llama_index_core-0.11.20-py3-none-any.whl", hash = "sha256:e84daf45e90e4b5d9e135baf40ab9853a1c3169a1076af6d58739d098e70adb1"},
+    {file = "llama_index_core-0.11.20.tar.gz", hash = "sha256:6b5eaaf4be5030808b9ba953e8f7aead7ba495b8e72ba0a81dfc7dda96be416f"},
 ]
 
 [package.dependencies]
@@ -3999,18 +4015,16 @@ deprecated = ">=1.2.9.3"
 dirtyjson = ">=1.0.8,<2.0.0"
 fsspec = ">=2023.5.0"
 httpx = "*"
-llamaindex-py-client = ">=0.1.18,<0.2.0"
 nest-asyncio = ">=1.5.8,<2.0.0"
 networkx = ">=3.0"
-nltk = ">=3.8.1,<4.0.0"
-numpy = "*"
-openai = ">=1.1.0"
-pandas = "*"
+nltk = ">3.8.1"
+numpy = "<2.0.0"
 pillow = ">=9.0.0"
+pydantic = ">=2.7.0,<3.0.0"
 PyYAML = ">=6.0.1"
 requests = ">=2.31.0"
 SQLAlchemy = {version = ">=1.4.49", extras = ["asyncio"]}
-tenacity = ">=8.2.0,<9.0.0"
+tenacity = ">=8.2.0,<8.4.0 || >8.4.0,<9.0.0"
 tiktoken = ">=0.3.3"
 tqdm = ">=4.66.1,<5.0.0"
 typing-extensions = ">=4.5.0"
@@ -4019,79 +4033,95 @@ wrapt = "*"
 
 [[package]]
 name = "llama-index-embeddings-azure-openai"
-version = "0.1.11"
+version = "0.2.5"
 description = "llama-index embeddings azure openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_azure_openai-0.1.11-py3-none-any.whl", hash = "sha256:afefe55ee69934528c569ddf71fb1e9ddf2992b6c344c4c9d72a03fa8c33cf40"},
-    {file = "llama_index_embeddings_azure_openai-0.1.11.tar.gz", hash = "sha256:40a4fd9a31ba74f071739d6c8405187b66e7f584ae2f64a30316c6c7b6a25325"},
+    {file = "llama_index_embeddings_azure_openai-0.2.5-py3-none-any.whl", hash = "sha256:e3384002618d027c3d188134e7fe09ffb16029202db6b3e6955a9f1f6d591a3e"},
+    {file = "llama_index_embeddings_azure_openai-0.2.5.tar.gz", hash = "sha256:d8b2e3134c2b3510214f2260e6c17be18396d0c765f3edd6c3ffe6109528aed0"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.11.post1,<0.11.0"
-llama-index-embeddings-openai = ">=0.1.3,<0.2.0"
-llama-index-llms-azure-openai = ">=0.1.3,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-embeddings-openai = ">=0.2.3,<0.3.0"
+llama-index-llms-azure-openai = ">=0.2.0,<0.3.0"
 
 [[package]]
 name = "llama-index-embeddings-huggingface"
-version = "0.2.3"
+version = "0.3.1"
 description = "llama-index embeddings huggingface integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_huggingface-0.2.3-py3-none-any.whl", hash = "sha256:7dee842f938d5fa8992e7803eda8a14f6bea72ec0bc0a546f4c6aa455166cde5"},
-    {file = "llama_index_embeddings_huggingface-0.2.3.tar.gz", hash = "sha256:6fe54366eeb87ff81b50624d6b8ccca4230f8035fcc19a0b0b3f31c6d8a82f8b"},
+    {file = "llama_index_embeddings_huggingface-0.3.1-py3-none-any.whl", hash = "sha256:71708240b1aec183c80f20d531b39a75d0cce774586e11bb0798f3ecb270749c"},
+    {file = "llama_index_embeddings_huggingface-0.3.1.tar.gz", hash = "sha256:7aef6324a19576e6b95bfe927c3bd4fc1c5725edce9f26b4e5d2eefa27c02fdb"},
 ]
 
 [package.dependencies]
 huggingface-hub = {version = ">=0.19.0", extras = ["inference"]}
-llama-index-core = ">=0.10.1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
 sentence-transformers = ">=2.6.1"
 
 [[package]]
 name = "llama-index-embeddings-ollama"
-version = "0.2.0"
+version = "0.3.1"
 description = "llama-index embeddings ollama integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_ollama-0.2.0-py3-none-any.whl", hash = "sha256:372b059321386bd9bbf4f619ad33dd551adb9ee92eeeb0c664d3466f7c212e2e"},
-    {file = "llama_index_embeddings_ollama-0.2.0.tar.gz", hash = "sha256:5673c740e1dd146e17d1c0401c1e179c0d559caf0967f4a4721b89fbb6822ad8"},
+    {file = "llama_index_embeddings_ollama-0.3.1-py3-none-any.whl", hash = "sha256:b869ce7e9f8e67aa7d81336e90d25d3ea1fca91c68dce8922b2d4b9c06c5acef"},
+    {file = "llama_index_embeddings_ollama-0.3.1.tar.gz", hash = "sha256:5a3e75fa14be7e2b1a82937416c880204dc96e1b1d2626dc5bde93f021e7b540"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
 ollama = ">=0.3.1,<0.4.0"
 
 [[package]]
 name = "llama-index-embeddings-openai"
-version = "0.1.11"
+version = "0.2.5"
 description = "llama-index embeddings openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_embeddings_openai-0.1.11-py3-none-any.whl", hash = "sha256:e20806fc4baff6b8f5274decf2c1ca7c5c737648e01865475ffada164e32e173"},
-    {file = "llama_index_embeddings_openai-0.1.11.tar.gz", hash = "sha256:6025e229e375201788a9b14d6ebe470329907576cba5f6b7b832c3d68f39db30"},
+    {file = "llama_index_embeddings_openai-0.2.5-py3-none-any.whl", hash = "sha256:823c8311e556349ba19dda408a64a314fa3dafe0e5759709c54d33a0269aa6ba"},
+    {file = "llama_index_embeddings_openai-0.2.5.tar.gz", hash = "sha256:0047dd71d747068645ed728c29312aa91b65bbe4c6142180034c64dfc5c6f6e8"},
+]
+
+[package.dependencies]
+llama-index-core = ">=0.11.0,<0.12.0"
+openai = ">=1.1.0"
+
+[[package]]
+name = "llama-index-embeddings-voyageai"
+version = "0.2.2"
+description = "llama-index embeddings voyageai integration"
+optional = false
+python-versions = "<4.0,>=3.8.1"
+files = [
+    {file = "llama_index_embeddings_voyageai-0.2.2-py3-none-any.whl", hash = "sha256:7bbb79558d474497ff700a930a0f9081976d1b4e0f5107e38a1059600de92c58"},
+    {file = "llama_index_embeddings_voyageai-0.2.2.tar.gz", hash = "sha256:237f70074af05f3b950c89d5d0720de30f9f5e98426a420f6e08125600b69be9"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+voyageai = ">=0.2.1,<0.3.0"
 
 [[package]]
 name = "llama-index-indices-managed-llama-cloud"
-version = "0.1.6"
+version = "0.4.0"
 description = "llama-index indices llama-cloud integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_indices_managed_llama_cloud-0.1.6-py3-none-any.whl", hash = "sha256:cba33e1a3677b2a2ae7f239119acbf6dc3818f105edc92315729842b56fbc949"},
-    {file = "llama_index_indices_managed_llama_cloud-0.1.6.tar.gz", hash = "sha256:74b3b0e9ebf9d348d3054f9fc0c657031acceb9351c31116ad8d5a7ae4729f5c"},
+    {file = "llama_index_indices_managed_llama_cloud-0.4.0-py3-none-any.whl", hash = "sha256:c2c54821f1bf17a7810e6c013fbe7ddfef4154b7e5b100f7bf8673098f8004e4"},
+    {file = "llama_index_indices_managed_llama_cloud-0.4.0.tar.gz", hash = "sha256:fbebff7876a219b6ab96892ae7c432a9299195fab8f67d4a4a0ebf6da210b242"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.0,<0.11.0"
-llamaindex-py-client = ">=0.1.19,<0.2.0"
+llama-cloud = ">=0.0.11"
+llama-index-core = ">=0.11.13.post1,<0.12.0"
 
 [[package]]
 name = "llama-index-legacy"
@@ -4134,96 +4164,98 @@ query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "l
 
 [[package]]
 name = "llama-index-llms-azure-openai"
-version = "0.1.10"
+version = "0.2.2"
 description = "llama-index llms azure openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_llms_azure_openai-0.1.10-py3-none-any.whl", hash = "sha256:8666b095118ed9c5087dc2d91a83a826d4549ea4d442b9eef363e243207d3539"},
-    {file = "llama_index_llms_azure_openai-0.1.10.tar.gz", hash = "sha256:f1624c9bd7bf4458e98cca6f3b805eec06105fa951536ff24b098d913d2368bd"},
+    {file = "llama_index_llms_azure_openai-0.2.2-py3-none-any.whl", hash = "sha256:c8a7d04a111ceff0b4335dc9273fbdb37fdb5095b6234190ca727736f6466d7b"},
+    {file = "llama_index_llms_azure_openai-0.2.2.tar.gz", hash = "sha256:717bc3bf858e800d66e4f2ddec85a2e7dd503006d55981053d08e98771ec3abc"},
 ]
 
 [package.dependencies]
 azure-identity = ">=1.15.0,<2.0.0"
 httpx = "*"
-llama-index-core = ">=0.10.11.post1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.1,<0.3.0"
 
 [[package]]
 name = "llama-index-llms-openai"
-version = "0.1.26"
+version = "0.2.16"
 description = "llama-index llms openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_llms_openai-0.1.26-py3-none-any.whl", hash = "sha256:1ad8e4eb02f9410c2091749d4d9aa9db4452646b595eb5eb937edbc496fb65fe"},
-    {file = "llama_index_llms_openai-0.1.26.tar.gz", hash = "sha256:08a408cd53af4cd4623dd5807be4cbbd5e5b3ca01272128cd678d667343e4d5d"},
+    {file = "llama_index_llms_openai-0.2.16-py3-none-any.whl", hash = "sha256:413466acbb894bd81f8dab2037f595e92392d869eec6d8274a16d43123cac8b6"},
+    {file = "llama_index_llms_openai-0.2.16.tar.gz", hash = "sha256:7c666dd27056c278a079ff45d53f1fbfc8ed363764aa7baeee2e03df47f9072a"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.24,<0.11.0"
+llama-index-core = ">=0.11.7,<0.12.0"
+openai = ">=1.40.0,<2.0.0"
 
 [[package]]
 name = "llama-index-multi-modal-llms-openai"
-version = "0.1.9"
+version = "0.2.3"
 description = "llama-index multi-modal-llms openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_multi_modal_llms_openai-0.1.9-py3-none-any.whl", hash = "sha256:614f40427a4671e72742780be8fda77297dbf2942519bffcb2c9de8696a9edff"},
-    {file = "llama_index_multi_modal_llms_openai-0.1.9.tar.gz", hash = "sha256:dbacf44d5c2cca07ca424eacd1337583002d70387a3c1868cf8ae743b1dbec4a"},
+    {file = "llama_index_multi_modal_llms_openai-0.2.3-py3-none-any.whl", hash = "sha256:96b36beb2c3fca4faca80c59ecf7c6c6629ecdb96c288ef89777b592ec43f872"},
+    {file = "llama_index_multi_modal_llms_openai-0.2.3.tar.gz", hash = "sha256:8eb9b7f1ff3956ef0979e21bc83e6a885e40987b7199f195e46525d06e3ae402"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.11,<0.3.0"
 
 [[package]]
 name = "llama-index-program-openai"
-version = "0.1.6"
+version = "0.2.0"
 description = "llama-index program openai integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_program_openai-0.1.6-py3-none-any.whl", hash = "sha256:4660b338503537c5edca1e0dab606af6ce372b4f1b597e2833c6b602447c5d8d"},
-    {file = "llama_index_program_openai-0.1.6.tar.gz", hash = "sha256:c6a4980c5ea826088b28b4dee3367edb20221e6d05eb0e05019049190131d772"},
+    {file = "llama_index_program_openai-0.2.0-py3-none-any.whl", hash = "sha256:2e10d0c8f21af2e9443eb79e81bb31e7b73835b7c7bbd7ddf20e0a9c846cd368"},
+    {file = "llama_index_program_openai-0.2.0.tar.gz", hash = "sha256:4139935541c011257fbfeb9662b3bf1237b729ef4b1c8f4ddf5b6789d2374ac4"},
 ]
 
 [package.dependencies]
-llama-index-agent-openai = ">=0.1.1,<0.3.0"
-llama-index-core = ">=0.10.1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
+llama-index-agent-openai = ">=0.3.0,<0.4.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.0,<0.3.0"
 
 [[package]]
 name = "llama-index-question-gen-openai"
-version = "0.1.3"
+version = "0.2.0"
 description = "llama-index question_gen openai integration"
 optional = false
-python-versions = ">=3.8.1,<4.0"
+python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_question_gen_openai-0.1.3-py3-none-any.whl", hash = "sha256:1f83b49e8b2e665030d1ec8c54687d6985d9fa8426147b64e46628a9e489b302"},
-    {file = "llama_index_question_gen_openai-0.1.3.tar.gz", hash = "sha256:4486198117a45457d2e036ae60b93af58052893cc7d78fa9b6f47dd47b81e2e1"},
+    {file = "llama_index_question_gen_openai-0.2.0-py3-none-any.whl", hash = "sha256:a16e68fc5434e9a793f1dfd0cc0354ee19afd167f1d499403b0085b11c5406c0"},
+    {file = "llama_index_question_gen_openai-0.2.0.tar.gz", hash = "sha256:3dde1cecbd651000639c20031d7ea23334276aabb181cac40ff424f35e10465e"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.1,<0.11.0"
-llama-index-llms-openai = ">=0.1.1,<0.2.0"
-llama-index-program-openai = ">=0.1.1,<0.2.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-index-llms-openai = ">=0.2.0,<0.3.0"
+llama-index-program-openai = ">=0.2.0,<0.3.0"
 
 [[package]]
 name = "llama-index-readers-file"
-version = "0.1.33"
+version = "0.2.2"
 description = "llama-index readers file integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_readers_file-0.1.33-py3-none-any.whl", hash = "sha256:c968308497c1355acf61fe7e3f05ad8e308bb6487dddd3bd2a60e102225d0b38"},
-    {file = "llama_index_readers_file-0.1.33.tar.gz", hash = "sha256:247a4d5bfabc7d1022027adf58064bc16c224d006db142abb0d182ac5574a887"},
+    {file = "llama_index_readers_file-0.2.2-py3-none-any.whl", hash = "sha256:ffec878771c1e7575afb742887561059bcca77b97a81c1c1be310ebb73f10f46"},
+    {file = "llama_index_readers_file-0.2.2.tar.gz", hash = "sha256:48459f90960b863737147b66ed83afec9ce8984f8eda2561b6d2500214365db2"},
 ]
 
 [package.dependencies]
 beautifulsoup4 = ">=4.12.3,<5.0.0"
-llama-index-core = ">=0.10.37.post1,<0.11.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+pandas = "*"
 pypdf = ">=4.0.1,<5.0.0"
 striprtf = ">=0.0.26,<0.0.27"
 
@@ -4232,62 +4264,48 @@ pymupdf = ["pymupdf (>=1.23.21,<2.0.0)"]
 
 [[package]]
 name = "llama-index-readers-llama-parse"
-version = "0.1.6"
+version = "0.3.0"
 description = "llama-index readers llama-parse integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_readers_llama_parse-0.1.6-py3-none-any.whl", hash = "sha256:71d445a2357ce4c632e0fada7c913ac62790e77c062f12d916dd86378380ff1f"},
-    {file = "llama_index_readers_llama_parse-0.1.6.tar.gz", hash = "sha256:04f2dcfbb0fb87ce70890f5a2f4f89941d79be6a818b43738f053560e4b451cf"},
+    {file = "llama_index_readers_llama_parse-0.3.0-py3-none-any.whl", hash = "sha256:1973cc710dbd5e110c7500c9983ecb45787ad1ff92e6b2113f94a57cf48f3038"},
+    {file = "llama_index_readers_llama_parse-0.3.0.tar.gz", hash = "sha256:a5feada0895714dcc41d65dd512c1c38cf70d8ae19947cff82b80d58e6aa367e"},
 ]
 
 [package.dependencies]
-llama-index-core = ">=0.10.7,<0.11.0"
-llama-parse = ">=0.4.0"
+llama-index-core = ">=0.11.0,<0.12.0"
+llama-parse = ">=0.5.0"
 
 [[package]]
 name = "llama-index-vector-stores-chroma"
-version = "0.1.10"
+version = "0.2.1"
 description = "llama-index vector_stores chroma integration"
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_index_vector_stores_chroma-0.1.10-py3-none-any.whl", hash = "sha256:18859272ec8d3ed20bae7e4a9bc18feb4233e8be2a725d33626f283ac41d1475"},
-    {file = "llama_index_vector_stores_chroma-0.1.10.tar.gz", hash = "sha256:97971f7b36461ef37be023b9ceb5531396cc48360d0bdbda51cce1290301cc47"},
+    {file = "llama_index_vector_stores_chroma-0.2.1-py3-none-any.whl", hash = "sha256:6dcca6450d298d3033a47b2131d0618ad48c172a3541eb6c790a61bf94136fed"},
+    {file = "llama_index_vector_stores_chroma-0.2.1.tar.gz", hash = "sha256:def15a76354bb4658b16badb92537a72e766273d5e566b0575461005da53847f"},
 ]
 
 [package.dependencies]
-chromadb = ">=0.4.0,<0.6.0"
-llama-index-core = ">=0.10.1,<0.11.0"
+chromadb = ">=0.4.0,<0.5.4 || >0.5.4,<0.5.7 || >0.5.7,<0.5.9 || >0.5.9,<0.5.10 || >0.5.10,<0.5.11 || >0.5.11,<0.5.12 || >0.5.12,<0.6.0"
+llama-index-core = ">=0.11.0,<0.12.0"
 
 [[package]]
 name = "llama-parse"
-version = "0.4.9"
+version = "0.5.12"
 description = "Parse files into RAG-Optimized formats."
 optional = false
 python-versions = "<4.0,>=3.8.1"
 files = [
-    {file = "llama_parse-0.4.9-py3-none-any.whl", hash = "sha256:71974a57a73d642608cc406942bee4e7fc1a713fa410f51df67da509479ba544"},
-    {file = "llama_parse-0.4.9.tar.gz", hash = "sha256:657f8fa5f7d399f14c0454fc05cae6034da0373f191df6cfca17a1b4a704ef87"},
-]
-
-[package.dependencies]
-llama-index-core = ">=0.10.29"
-
-[[package]]
-name = "llamaindex-py-client"
-version = "0.1.19"
-description = ""
-optional = false
-python-versions = "<4,>=3.8"
-files = [
-    {file = "llamaindex_py_client-0.1.19-py3-none-any.whl", hash = "sha256:fd9416fd78b97209bf323bc3c7fab314499778563e7274f10853ad560563d10e"},
-    {file = "llamaindex_py_client-0.1.19.tar.gz", hash = "sha256:73f74792bb8c092bae6dc626627a09ac13a099fa8d10f8fcc83e17a2b332cca7"},
+    {file = "llama_parse-0.5.12-py3-none-any.whl", hash = "sha256:6011feb49da5db4bcbeea1cc6688b6ff24b483877fda80b03fe59239cd08b907"},
+    {file = "llama_parse-0.5.12.tar.gz", hash = "sha256:e241606cf3574425df76c0f5d01a31a95c792c6fbef80aaf72f8ed6448bd1715"},
 ]
 
 [package.dependencies]
-httpx = ">=0.20.0"
-pydantic = ">=1.10"
+click = ">=8.1.7,<9.0.0"
+llama-index-core = ">=0.11.0"
 
 [[package]]
 name = "lxml"
@@ -5298,56 +5316,47 @@ test = ["pytest", "pytest-console-scripts", "pytest-jupyter", "pytest-tornasync"
 
 [[package]]
 name = "numpy"
-version = "2.0.2"
+version = "1.26.4"
 description = "Fundamental package for array computing in Python"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"},
-    {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"},
-    {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66"},
-    {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b"},
-    {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd"},
-    {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318"},
-    {file = "numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8"},
-    {file = "numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326"},
-    {file = "numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97"},
-    {file = "numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57"},
-    {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a"},
-    {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669"},
-    {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951"},
-    {file = "numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9"},
-    {file = "numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15"},
-    {file = "numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4"},
-    {file = "numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c"},
-    {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c"},
-    {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692"},
-    {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a"},
-    {file = "numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c"},
-    {file = "numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded"},
-    {file = "numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5"},
-    {file = "numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b"},
-    {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729"},
-    {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1"},
-    {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"},
-    {file = "numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d"},
-    {file = "numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d"},
-    {file = "numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa"},
-    {file = "numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c"},
-    {file = "numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385"},
-    {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"},
+    {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"},
+    {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"},
+    {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"},
+    {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"},
+    {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"},
+    {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"},
+    {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"},
+    {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"},
+    {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"},
+    {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"},
+    {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
 ]
 
 [[package]]
@@ -9354,6 +9363,24 @@ platformdirs = ">=3.9.1,<5"
 docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
 test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
 
+[[package]]
+name = "voyageai"
+version = "0.2.4"
+description = ""
+optional = false
+python-versions = "<4.0.0,>=3.7.1"
+files = [
+    {file = "voyageai-0.2.4-py3-none-any.whl", hash = "sha256:e3070e5c78dec89adae43231334b4637aa88933dad99b1c33d3219fdfc94dfa4"},
+    {file = "voyageai-0.2.4.tar.gz", hash = "sha256:b9911d8629e8a4e363291c133482fead49a3536afdf1e735f3ab3aaccd8d250d"},
+]
+
+[package.dependencies]
+aiohttp = ">=3.5,<4.0"
+aiolimiter = ">=1.1.0,<2.0.0"
+numpy = ">=1.11"
+requests = ">=2.20,<3.0"
+tenacity = ">=8.0.1"
+
 [[package]]
 name = "watchdog"
 version = "5.0.3"
@@ -10095,4 +10122,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "aeb09e429a789c3f8ced605e7e1a5932fd6cce7f7f4ce30a960da77fba18b9a3"
+content-hash = "62de6b5fb79f97f563a3ff6a4cf225cc639954745b63ddea8921b2eb9fb0e155"
diff --git a/pyproject.toml b/pyproject.toml
index 393bab9594dd..ad4e08ea2c8d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,6 +67,7 @@ llama-index-embeddings-huggingface = "*"
 torch = "2.5.0"
 llama-index-embeddings-azure-openai = "*"
 llama-index-embeddings-ollama = "*"
+llama-index-embeddings-voyageai = "*"
 
 [tool.poetry.group.dev.dependencies]
 ruff = "0.7.1"
@@ -89,6 +90,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]
 
+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -119,6 +121,7 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"
 
+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
diff --git a/tests/unit/test_codeact_agent.py b/tests/unit/test_codeact_agent.py
index 55dfa3feb75b..14ca1ee639e4 100644
--- a/tests/unit/test_codeact_agent.py
+++ b/tests/unit/test_codeact_agent.py
@@ -92,5 +92,4 @@ def test_error_observation_message(agent: CodeActAgent):
 def test_unknown_observation_message(agent: CodeActAgent):
     obs = Mock()
 
-    with pytest.raises(ValueError, match='Unknown observation type:'):
-        agent.get_observation_message(obs)
+    assert agent.get_observation_message(obs) is None
diff --git a/tests/unit/test_condenser.py b/tests/unit/test_condenser.py
index 5a8ab72978a5..71d30dbf3e65 100644
--- a/tests/unit/test_condenser.py
+++ b/tests/unit/test_condenser.py
@@ -1,44 +1,172 @@
-from unittest.mock import Mock, patch
+import argparse
+import json
+import os
+from datetime import datetime
+from pathlib import Path
 
-import pytest
-
-from openhands.core.exceptions import LLMResponseError
+from openhands.core import logger
+from openhands.core.config.utils import get_llm_config_arg, load_app_config
+from openhands.core.message import Message, TextContent
+from openhands.events.action.agent import AgentSummarizeAction
 from openhands.llm.llm import LLM
 from openhands.memory.condenser import MemoryCondenser
+from openhands.utils.prompt import PromptManager
 
 
-@pytest.fixture
-def memory_condenser():
-    return MemoryCondenser()
+def save_messages_for_debugging(
+    messages: list[Message], summary_action: AgentSummarizeAction
+) -> None:
+    """
+    Serializes the list of Message objects and the summary action,
+    then saves them to a JSON file in the ./logs directory for debugging purposes.
 
+    Args:
+        messages (list[Message]): The list of messages to serialize.
+        summary_action (AgentSummarizeAction): The summary action to append.
+    """
+    # Ensure the logs directory exists
+    log_dir = Path('./logs')
+    log_dir.mkdir(parents=True, exist_ok=True)
 
-@pytest.fixture
-def mock_llm():
-    return Mock(spec=LLM)
+    # Generate a timestamped filename
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    filename = f'debug_summary_{timestamp}.json'
+    file_path = log_dir / filename
 
+    try:
+        # Serialize messages using Pydantic's model_dump()
+        serialized_messages = [message.model_dump() for message in messages]
 
-def test_condense_success(memory_condenser, mock_llm):
-    mock_llm.completion.return_value = {
-        'choices': [{'message': {'content': 'Condensed memory'}}]
-    }
-    result = memory_condenser.condense('Summarize this', mock_llm)
-    assert result == 'Condensed memory'
-    mock_llm.completion.assert_called_once_with(
-        messages=[{'content': 'Summarize this', 'role': 'user'}]
-    )
+        # Create a Message instance for the summary_action
+        summary_event = Message(
+            role='assistant', content=[TextContent(text=str(summary_action))]
+        )
+        serialized_summary = summary_event.model_dump()
+
+        # Append the serialized summary to the messages
+        serialized_messages.append(serialized_summary)
+
+        with file_path.open('w', encoding='utf-8') as f:
+            json.dump(serialized_messages, f, ensure_ascii=False, indent=4)
+
+        logger.debug(f'Messages successfully saved to {file_path}')
+    except Exception as e:
+        logger.error(f'Failed to save messages for debugging: {e}')
+
+
+def main(condenser: MemoryCondenser, file_path: str | None = None):
+    """
+    Main method for quick testing and debugging.
+    Reads a specified debug summary JSON file from the ./logs/deepseek-24sept directory,
+    deserializes the messages, and prints them.
+    If no file is specified, it falls back to the latest file based on timestamp.
+
+    Args:
+        file_path (str | None): The path to the log file to process. If None, the latest file is used.
+    """
+    log_dir = Path('./logs/deepseek-24sept')
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    if file_path:
+        target_log = Path(file_path)
+        if not target_log.exists():
+            print(f'Specified log file does not exist: {target_log}')
+            return
+    else:
+        log_files = list(log_dir.glob('instance_*_*.json'))
+
+        if not log_files:
+            print(
+                'No instance_*_*.json files found in the ./logs/deepseek-24sept directory.'
+            )
+            return
+
+        # Sort files to find the latest one based on the digits at the end of the filename
+        def extract_digits(file_path: Path) -> int:
+            try:
+                # Extract the digits part from the filename
+                digits_str = file_path.stem.split('_')[-1]
+                return int(digits_str)
+            except (IndexError, ValueError):
+                # If digit extraction fails, assign the lowest possible value
+                return -1
+
+        log_files.sort(key=extract_digits, reverse=True)
+        target_log = log_files[0]
+
+        print(f'Loading messages from: {target_log}')
 
+    try:
+        with target_log.open('r', encoding='utf-8') as f:
+            messages_data = json.load(f)
 
-def test_condense_exception(memory_condenser, mock_llm):
-    mock_llm.completion.side_effect = LLMResponseError('LLM error')
-    with pytest.raises(LLMResponseError, match='LLM error'):
-        memory_condenser.condense('Summarize this', mock_llm)
+            # convert string content to list of TextContent if necessary
+            for msg in messages_data:
+                if isinstance(msg['content'], str):
+                    msg['content'] = [{'type': 'text', 'text': msg['content']}]
 
+            messages: list[Message] = [
+                Message.model_validate(msg, strict=False) for msg in messages_data
+            ]
 
-@patch('openhands.memory.condenser.logger')
-def test_condense_logs_error(mock_logger, memory_condenser, mock_llm):
-    mock_llm.completion.side_effect = LLMResponseError('LLM error')
-    with pytest.raises(LLMResponseError):
-        memory_condenser.condense('Summarize this', mock_llm)
-    mock_logger.error.assert_called_once_with(
-        'Error condensing thoughts: %s', 'LLM error', exc_info=False
+            print(f'Successfully loaded {len(messages)} messages:')
+            # for msg in messages:
+            #    print(f'{msg.role}:\n {msg.content[50:]}')
+
+            # run condense on these messages
+            summary_action = condenser.condense(messages)
+            print(f'summary_action: {summary_action}')
+
+            # save the summary action to a file named with the same name as the log file + summary
+            summary_file_path = target_log.with_suffix('.summary.json')
+            with summary_file_path.open('w', encoding='utf-8') as f:
+                json.dump(summary_action.model_dump(), f, ensure_ascii=False, indent=4)
+
+    except Exception as e:
+        print(f'An error occurred while reading {target_log}: {e}')
+        return
+
+
+if __name__ == '__main__':
+    # load or simulate dependencies as needed for testing
+    app_config = load_app_config()
+    llm_config = get_llm_config_arg('deepseek')
+    if llm_config is not None:
+        llm = LLM(config=llm_config)
+    else:
+        llm = LLM(app_config.get_llm_config('llm'))
+
+    prompt_dir = os.path.join(
+        os.path.dirname(__file__),
+        '..',
+        '..',
+        'openhands',
+        'agenthub',
+        'memcodeact_agent',
+        'prompts',
+    )
+    prompt_manager = PromptManager(
+        prompt_dir=prompt_dir,
+        agent_skills_docs='',
     )
+
+    condenser = MemoryCondenser(llm=llm, prompt_manager=prompt_manager)
+
+    # attach on fly the save_messages_for_debugging method to the condenser
+    condenser.save_messages_for_debugging = save_messages_for_debugging
+
+    # Setup argument parser for optional file parameter
+    parser = argparse.ArgumentParser(description='Run MemoryCondenser on a .json file.')
+    parser.add_argument(
+        '--file',
+        type=str,
+        default=None,
+        help='Path to the specific file to process. If not provided, the latest file is used.',
+    )
+    args = parser.parse_args()
+
+    if args.file is not None and args.file == '':
+        args.file = None
+
+    # Call the main method with the specified file path if provided
+    main(condenser, file_path=args.file)
diff --git a/tests/unit/test_is_stuck.py b/tests/unit/test_is_stuck.py
index 4a1330752161..1f28e9800799 100644
--- a/tests/unit/test_is_stuck.py
+++ b/tests/unit/test_is_stuck.py
@@ -17,8 +17,6 @@
 from openhands.events.observation.empty import NullObservation
 from openhands.events.observation.error import ErrorObservation
 from openhands.events.stream import EventSource, EventStream
-from openhands.events.utils import get_pairs_from_events
-from openhands.memory.history import ShortTermHistory
 from openhands.storage import get_file_store
 
 
@@ -55,22 +53,21 @@ def event_stream(temp_dir):
 
 class TestStuckDetector:
     @pytest.fixture
-    def stuck_detector(self, event_stream):
+    def stuck_detector(self):
         state = State(inputs={}, max_iterations=50)
-        state.history.set_event_stream(event_stream)
-
+        state.history = []  # Initialize history as an empty list
         return StuckDetector(state)
 
     def _impl_syntax_error_events(
         self,
-        event_stream: EventStream,
+        state: State,
         error_message: str,
         random_line: bool,
         incidents: int = 4,
     ):
         for i in range(incidents):
             ipython_action = IPythonRunCellAction(code=code_snippet)
-            event_stream.add_event(ipython_action, EventSource.AGENT)
+            state.history.append(ipython_action)
             extra_number = (i + 1) * 10 if random_line else '42'
             extra_line = '\n' * (i + 1) if random_line else ''
             ipython_observation = IPythonRunCellObservation(
@@ -79,15 +76,15 @@ def _impl_syntax_error_events(
                 f'{error_message}{extra_line}' + jupyter_line_1 + jupyter_line_2,
                 code=code_snippet,
             )
-            ipython_observation._cause = ipython_action._id
-            event_stream.add_event(ipython_observation, EventSource.USER)
+            # ipython_observation._cause = ipython_action._id
+            state.history.append(ipython_observation)
 
     def _impl_unterminated_string_error_events(
-        self, event_stream: EventStream, random_line: bool, incidents: int = 4
+        self, state: State, random_line: bool, incidents: int = 4
     ):
         for i in range(incidents):
             ipython_action = IPythonRunCellAction(code=code_snippet)
-            event_stream.add_event(ipython_action, EventSource.AGENT)
+            state.history.append(ipython_action)
             line_number = (i + 1) * 10 if random_line else '1'
             ipython_observation = IPythonRunCellObservation(
                 content=f'print("  Cell In[1], line {line_number}\nhello\n       ^\nSyntaxError: unterminated string literal (detected at line {line_number})'
@@ -95,34 +92,30 @@ def _impl_unterminated_string_error_events(
                 + jupyter_line_2,
                 code=code_snippet,
             )
-            ipython_observation._cause = ipython_action._id
-            event_stream.add_event(ipython_observation, EventSource.USER)
+            # ipython_observation._cause = ipython_action._
+            state.history.append(ipython_observation)
 
-    def test_history_too_short(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_history_too_short(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         message_action = MessageAction(content='Hello', wait_for_response=False)
         message_action._source = EventSource.USER
         observation = NullObservation(content='')
-        observation._cause = message_action.id
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(observation, EventSource.USER)
+        # observation._cause = message_action.id
+        state.history.append(message_action)
+        state.history.append(observation)
 
         cmd_action = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action, EventSource.AGENT)
+        state.history.append(cmd_action)
         cmd_observation = CmdOutputObservation(
             command_id=1, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation._cause = cmd_action._id
-        event_stream.add_event(cmd_observation, EventSource.USER)
-
-        # stuck_detector.state.history.set_event_stream(event_stream)
+        # cmd_observation._cause = cmd_action._id
+        state.history.append(cmd_observation)
 
         assert stuck_detector.is_stuck() is False
 
-    def test_is_stuck_repeating_action_observation(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         message_action = MessageAction(content='Done', wait_for_response=False)
         message_action._source = EventSource.USER
 
@@ -130,135 +123,125 @@ def test_is_stuck_repeating_action_observation(
         hello_observation = NullObservation('')
 
         # 2 events
-        event_stream.add_event(hello_action, EventSource.USER)
-        event_stream.add_event(hello_observation, EventSource.USER)
+        state.history.append(hello_action)
+        state.history.append(hello_observation)
 
         cmd_action_1 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
-        cmd_observation_1 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_1._id
-        )
+        cmd_action_1._id = 1
+        state.history.append(cmd_action_1)
+        cmd_observation_1 = CmdOutputObservation(content='', command='ls', command_id=1)
         cmd_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        state.history.append(cmd_observation_1)
         # 4 events
 
         cmd_action_2 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
-        cmd_observation_2 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_2._id
-        )
+        cmd_action_2._id = 2
+        state.history.append(cmd_action_2)
+        cmd_observation_2 = CmdOutputObservation(content='', command='ls', command_id=2)
         cmd_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        state.history.append(cmd_observation_2)
         # 6 events
 
         # random user message just because we can
         message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
         # 8 events
 
         assert stuck_detector.is_stuck() is False
         assert stuck_detector.state.almost_stuck == 2
 
         cmd_action_3 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
-        cmd_observation_3 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_3._id
-        )
+        cmd_action_3._id = 3
+        state.history.append(cmd_action_3)
+        cmd_observation_3 = CmdOutputObservation(content='', command='ls', command_id=3)
         cmd_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        state.history.append(cmd_observation_3)
         # 10 events
 
-        assert len(collect_events(event_stream)) == 10
-        assert len(list(stuck_detector.state.history.get_events())) == 8
+        assert len(state.history) == 10
         assert (
-            len(
-                get_pairs_from_events(
-                    stuck_detector.state.history.get_events_as_list(
-                        include_delegates=True
-                    )
-                )
-            )
-            == 5
-        )
+            len(state.history) == 10
+        )  # Adjusted since history is a list and the controller is not running
+
+        # FIXME are we still testing this without this test?
+        # assert (
+        #    len(
+        #        get_pairs_from_events(state.history)
+        #    )
+        #    == 5
+        # )
 
         assert stuck_detector.is_stuck() is False
         assert stuck_detector.state.almost_stuck == 1
 
         cmd_action_4 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_4, EventSource.AGENT)
-        cmd_observation_4 = CmdOutputObservation(
-            content='', command='ls', command_id=cmd_action_4._id
-        )
+        cmd_action_4._id = 4
+        state.history.append(cmd_action_4)
+        cmd_observation_4 = CmdOutputObservation(content='', command='ls', command_id=4)
         cmd_observation_4._cause = cmd_action_4._id
-        event_stream.add_event(cmd_observation_4, EventSource.USER)
+        state.history.append(cmd_observation_4)
         # 12 events
 
-        assert len(collect_events(event_stream)) == 12
-        assert len(list(stuck_detector.state.history.get_events())) == 10
-        assert (
-            len(
-                get_pairs_from_events(
-                    stuck_detector.state.history.get_events_as_list(
-                        include_delegates=True
-                    )
-                )
-            )
-            == 6
-        )
+        assert len(state.history) == 12
+        # assert (
+        #    len(
+        #        get_pairs_from_events(state.history)
+        #    )
+        #    == 6
+        # )
 
         with patch('logging.Logger.warning') as mock_warning:
             assert stuck_detector.is_stuck() is True
             assert stuck_detector.state.almost_stuck == 0
             mock_warning.assert_called_once_with('Action, Observation loop detected')
 
-    def test_is_stuck_repeating_action_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_repeating_action_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         # (action, error_observation), not necessarily the same error
         message_action = MessageAction(content='Done', wait_for_response=False)
         message_action._source = EventSource.USER
 
         hello_action = MessageAction(content='Hello', wait_for_response=False)
         hello_observation = NullObservation(content='')
-        event_stream.add_event(hello_action, EventSource.USER)
-        hello_observation._cause = hello_action._id
-        event_stream.add_event(hello_observation, EventSource.USER)
+        state.history.append(hello_action)
+        # hello_observation._cause = hello_action._id
+        state.history.append(hello_observation)
         # 2 events
 
         cmd_action_1 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        state.history.append(cmd_action_1)
         error_observation_1 = ErrorObservation(content='Command not found')
-        error_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(error_observation_1, EventSource.USER)
+        # error_observation_1._cause = cmd_action_1._id
+        state.history.append(error_observation_1)
         # 4 events
 
         cmd_action_2 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        state.history.append(cmd_action_2)
         error_observation_2 = ErrorObservation(
             content='Command still not found or another error'
         )
-        error_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(error_observation_2, EventSource.USER)
+        # error_observation_2._cause = cmd_action_2._id
+        state.history.append(error_observation_2)
         # 6 events
 
         message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
         # 8 events
 
         cmd_action_3 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        state.history.append(cmd_action_3)
         error_observation_3 = ErrorObservation(content='Different error')
-        error_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(error_observation_3, EventSource.USER)
+        # error_observation_3._cause = cmd_action_3._id
+        state.history.append(error_observation_3)
         # 10 events
 
         cmd_action_4 = CmdRunAction(command='invalid_command')
-        event_stream.add_event(cmd_action_4, EventSource.AGENT)
+        state.history.append(cmd_action_4)
         error_observation_4 = ErrorObservation(content='Command not found')
-        error_observation_4._cause = cmd_action_4._id
-        event_stream.add_event(error_observation_4, EventSource.USER)
+        # error_observation_4._cause = cmd_action_4._id
+        state.history.append(error_observation_4)
         # 12 events
 
         with patch('logging.Logger.warning') as mock_warning:
@@ -267,11 +250,10 @@ def test_is_stuck_repeating_action_error(
                 'Action, ErrorObservation loop detected'
             )
 
-    def test_is_stuck_invalid_syntax_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_invalid_syntax_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: invalid syntax. Perhaps you forgot a comma?',
             random_line=False,
         )
@@ -280,10 +262,11 @@ def test_is_stuck_invalid_syntax_error(
             assert stuck_detector.is_stuck() is True
 
     def test_is_not_stuck_invalid_syntax_error_random_lines(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: invalid syntax. Perhaps you forgot a comma?',
             random_line=True,
         )
@@ -292,10 +275,11 @@ def test_is_not_stuck_invalid_syntax_error_random_lines(
             assert stuck_detector.is_stuck() is False
 
     def test_is_not_stuck_invalid_syntax_error_only_three_incidents(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: invalid syntax. Perhaps you forgot a comma?',
             random_line=True,
             incidents=3,
@@ -304,11 +288,10 @@ def test_is_not_stuck_invalid_syntax_error_only_three_incidents(
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is False
 
-    def test_is_stuck_incomplete_input_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_incomplete_input_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: incomplete input',
             random_line=False,
         )
@@ -316,11 +299,10 @@ def test_is_stuck_incomplete_input_error(
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is True
 
-    def test_is_not_stuck_incomplete_input_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_not_stuck_incomplete_input_error(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         self._impl_syntax_error_events(
-            event_stream,
+            state,
             error_message='SyntaxError: incomplete input',
             random_line=True,
         )
@@ -329,238 +311,239 @@ def test_is_not_stuck_incomplete_input_error(
             assert stuck_detector.is_stuck() is False
 
     def test_is_not_stuck_ipython_unterminated_string_error_random_lines(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
-        self._impl_unterminated_string_error_events(event_stream, random_line=True)
+        state = stuck_detector.state
+        self._impl_unterminated_string_error_events(state, random_line=True)
 
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is False
 
     def test_is_not_stuck_ipython_unterminated_string_error_only_three_incidents(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         self._impl_unterminated_string_error_events(
-            event_stream, random_line=False, incidents=3
+            state, random_line=False, incidents=3
         )
 
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is False
 
     def test_is_stuck_ipython_unterminated_string_error(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
-        self._impl_unterminated_string_error_events(event_stream, random_line=False)
+        state = stuck_detector.state
+        self._impl_unterminated_string_error_events(state, random_line=False)
 
         with patch('logging.Logger.warning'):
             assert stuck_detector.is_stuck() is True
 
     def test_is_not_stuck_ipython_syntax_error_not_at_end(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         # this test is to make sure we don't get false positives
         # since the "at line x" is changing in between!
         ipython_action_1 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_1, EventSource.AGENT)
+        state.history.append(ipython_action_1)
         ipython_observation_1 = IPythonRunCellObservation(
             content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)\nThis is some additional output',
             code='print("hello',
         )
-        ipython_observation_1._cause = ipython_action_1._id
-        event_stream.add_event(ipython_observation_1, EventSource.USER)
+        # ipython_observation_1._cause = ipython_action_1._id
+        state.history.append(ipython_observation_1)
 
         ipython_action_2 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_2, EventSource.AGENT)
+        state.history.append(ipython_action_2)
         ipython_observation_2 = IPythonRunCellObservation(
             content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 1)\nToo much output here on and on',
             code='print("hello',
         )
-        ipython_observation_2._cause = ipython_action_2._id
-        event_stream.add_event(ipython_observation_2, EventSource.USER)
+        # ipython_observation_2._cause = ipython_action_2._id
+        state.history.append(ipython_observation_2)
 
         ipython_action_3 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_3, EventSource.AGENT)
+        state.history.append(ipython_action_3)
         ipython_observation_3 = IPythonRunCellObservation(
             content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 3)\nEnough',
             code='print("hello',
         )
-        ipython_observation_3._cause = ipython_action_3._id
-        event_stream.add_event(ipython_observation_3, EventSource.USER)
+        # ipython_observation_3._cause = ipython_action_3._id
+        state.history.append(ipython_observation_3)
 
         ipython_action_4 = IPythonRunCellAction(code='print("hello')
-        event_stream.add_event(ipython_action_4, EventSource.AGENT)
+        state.history.append(ipython_action_4)
         ipython_observation_4 = IPythonRunCellObservation(
             content='print("hello\n       ^\nSyntaxError: unterminated string literal (detected at line 2)\nLast line of output',
             code='print("hello',
         )
-        ipython_observation_4._cause = ipython_action_4._id
-        event_stream.add_event(ipython_observation_4, EventSource.USER)
+        # ipython_observation_4._cause = ipython_action_4._id
+        state.history.append(ipython_observation_4)
 
         with patch('logging.Logger.warning') as mock_warning:
             assert stuck_detector.is_stuck() is False
             mock_warning.assert_not_called()
 
     def test_is_stuck_repeating_action_observation_pattern(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
+        self, stuck_detector: StuckDetector
     ):
+        state = stuck_detector.state
         message_action = MessageAction(content='Come on', wait_for_response=False)
         message_action._source = EventSource.USER
-        event_stream.add_event(message_action, EventSource.USER)
+        state.history.append(message_action)
         message_observation = NullObservation(content='')
-        event_stream.add_event(message_observation, EventSource.USER)
+        state.history.append(message_observation)
 
         cmd_action_1 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        state.history.append(cmd_action_1)
         cmd_observation_1 = CmdOutputObservation(
             command_id=1, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        # cmd_observation_1._cause = cmd_action_1._id
+        state.history.append(cmd_observation_1)
 
         read_action_1 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_1, EventSource.AGENT)
+        state.history.append(read_action_1)
         read_observation_1 = FileReadObservation(
             content='File content', path='file1.txt'
         )
-        read_observation_1._cause = read_action_1._id
-        event_stream.add_event(read_observation_1, EventSource.USER)
+        # read_observation_1._cause = read_action_1._id
+        state.history.append(read_observation_1)
 
         cmd_action_2 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        state.history.append(cmd_action_2)
         cmd_observation_2 = CmdOutputObservation(
             command_id=2, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        # cmd_observation_2._cause = cmd_action_2._id
+        state.history.append(cmd_observation_2)
 
         read_action_2 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_2, EventSource.AGENT)
+        state.history.append(read_action_2)
         read_observation_2 = FileReadObservation(
             content='File content', path='file1.txt'
         )
-        read_observation_2._cause = read_action_2._id
-        event_stream.add_event(read_observation_2, EventSource.USER)
+        # read_observation_2._cause = read_action_2._id
+        state.history.append(read_observation_2)
 
         # one more message to break the pattern
         message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
 
         cmd_action_3 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        state.history.append(cmd_action_3)
         cmd_observation_3 = CmdOutputObservation(
             command_id=3, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        # cmd_observation_3._cause = cmd_action_3._id
+        state.history.append(cmd_observation_3)
 
         read_action_3 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_3, EventSource.AGENT)
+        state.history.append(read_action_3)
         read_observation_3 = FileReadObservation(
             content='File content', path='file1.txt'
         )
-        read_observation_3._cause = read_action_3._id
-        event_stream.add_event(read_observation_3, EventSource.USER)
+        # read_observation_3._cause = read_action_3._id
+        state.history.append(read_observation_3)
 
         with patch('logging.Logger.warning') as mock_warning:
             assert stuck_detector.is_stuck() is True
             mock_warning.assert_called_once_with('Action, Observation pattern detected')
 
-    def test_is_stuck_not_stuck(
-        self, stuck_detector: StuckDetector, event_stream: EventStream
-    ):
+    def test_is_stuck_not_stuck(self, stuck_detector: StuckDetector):
+        state = stuck_detector.state
         message_action = MessageAction(content='Done', wait_for_response=False)
         message_action._source = EventSource.USER
 
         hello_action = MessageAction(content='Hello', wait_for_response=False)
-        event_stream.add_event(hello_action, EventSource.USER)
+        state.history.append(hello_action)
         hello_observation = NullObservation(content='')
-        hello_observation._cause = hello_action._id
-        event_stream.add_event(hello_observation, EventSource.USER)
+        # hello_observation._cause = hello_action._id
+        state.history.append(hello_observation)
 
         cmd_action_1 = CmdRunAction(command='ls')
-        event_stream.add_event(cmd_action_1, EventSource.AGENT)
+        state.history.append(cmd_action_1)
         cmd_observation_1 = CmdOutputObservation(
             command_id=cmd_action_1.id, command='ls', content='file1.txt\nfile2.txt'
         )
-        cmd_observation_1._cause = cmd_action_1._id
-        event_stream.add_event(cmd_observation_1, EventSource.USER)
+        # cmd_observation_1._cause = cmd_action_1._id
+        state.history.append(cmd_observation_1)
 
         read_action_1 = FileReadAction(path='file1.txt')
-        event_stream.add_event(read_action_1, EventSource.AGENT)
+        state.history.append(read_action_1)
         read_observation_1 = FileReadObservation(
             content='File content', path='file1.txt'
         )
-        read_observation_1._cause = read_action_1._id
-        event_stream.add_event(read_observation_1, EventSource.USER)
+        # read_observation_1._cause = read_action_1._id
+        state.history.append(read_observation_1)
 
         cmd_action_2 = CmdRunAction(command='pwd')
-        event_stream.add_event(cmd_action_2, EventSource.AGENT)
+        state.history.append(cmd_action_2)
         cmd_observation_2 = CmdOutputObservation(
             command_id=2, command='pwd', content='/home/user'
         )
-        cmd_observation_2._cause = cmd_action_2._id
-        event_stream.add_event(cmd_observation_2, EventSource.USER)
+        # cmd_observation_2._cause = cmd_action_2._id
+        state.history.append(cmd_observation_2)
 
         read_action_2 = FileReadAction(path='file2.txt')
-        event_stream.add_event(read_action_2, EventSource.AGENT)
+        state.history.append(read_action_2)
         read_observation_2 = FileReadObservation(
             content='Another file content', path='file2.txt'
         )
-        read_observation_2._cause = read_action_2._id
-        event_stream.add_event(read_observation_2, EventSource.USER)
+        # read_observation_2._cause = read_action_2._id
+        state.history.append(read_observation_2)
 
         message_null_observation = NullObservation(content='')
-        event_stream.add_event(message_action, EventSource.USER)
-        event_stream.add_event(message_null_observation, EventSource.USER)
+        state.history.append(message_action)
+        state.history.append(message_null_observation)
 
         cmd_action_3 = CmdRunAction(command='pwd')
-        event_stream.add_event(cmd_action_3, EventSource.AGENT)
+        state.history.append(cmd_action_3)
         cmd_observation_3 = CmdOutputObservation(
             command_id=cmd_action_3.id, command='pwd', content='/home/user'
         )
-        cmd_observation_3._cause = cmd_action_3._id
-        event_stream.add_event(cmd_observation_3, EventSource.USER)
+        # cmd_observation_3._cause = cmd_action_3._id
+        state.history.append(cmd_observation_3)
 
         read_action_3 = FileReadAction(path='file2.txt')
-        event_stream.add_event(read_action_3, EventSource.AGENT)
+        state.history.append(read_action_3)
         read_observation_3 = FileReadObservation(
             content='Another file content', path='file2.txt'
         )
-        read_observation_3._cause = read_action_3._id
-        event_stream.add_event(read_observation_3, EventSource.USER)
+        # read_observation_3._cause = read_action_3._id
+        state.history.append(read_observation_3)
 
         assert stuck_detector.is_stuck() is False
 
-    def test_is_stuck_monologue(self, stuck_detector, event_stream):
-        # Add events to the event stream
+    def test_is_stuck_monologue(self, stuck_detector):
+        state = stuck_detector.state
+        # Add events to the history list directly
         message_action_1 = MessageAction(content='Hi there!')
-        event_stream.add_event(message_action_1, EventSource.USER)
         message_action_1._source = EventSource.USER
-
+        state.history.append(message_action_1)
         message_action_2 = MessageAction(content='Hi there!')
-        event_stream.add_event(message_action_2, EventSource.AGENT)
         message_action_2._source = EventSource.AGENT
-
+        state.history.append(message_action_2)
         message_action_3 = MessageAction(content='How are you?')
-        event_stream.add_event(message_action_3, EventSource.USER)
         message_action_3._source = EventSource.USER
+        state.history.append(message_action_3)
 
         cmd_kill_action = CmdRunAction(
             command='echo 42', thought="I'm not stuck, he's stuck"
         )
-        event_stream.add_event(cmd_kill_action, EventSource.AGENT)
+        state.history.append(cmd_kill_action)
 
         message_action_4 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_4, EventSource.AGENT)
         message_action_4._source = EventSource.AGENT
-
+        state.history.append(message_action_4)
         message_action_5 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_5, EventSource.AGENT)
         message_action_5._source = EventSource.AGENT
-
+        state.history.append(message_action_5)
         message_action_6 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_6, EventSource.AGENT)
         message_action_6._source = EventSource.AGENT
+        state.history.append(message_action_6)
 
         assert stuck_detector.is_stuck()
 
@@ -571,16 +554,15 @@ def test_is_stuck_monologue(self, stuck_detector, event_stream):
             command='storybook',
             exit_code=0,
         )
-        cmd_output_observation._cause = cmd_kill_action._id
-        event_stream.add_event(cmd_output_observation, EventSource.USER)
+        # cmd_output_observation._cause = cmd_kill_action._id
+        state.history.append(cmd_output_observation)
 
         message_action_7 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_7, EventSource.AGENT)
         message_action_7._source = EventSource.AGENT
-
+        state.history.append(message_action_7)
         message_action_8 = MessageAction(content="I'm doing well, thanks for asking.")
-        event_stream.add_event(message_action_8, EventSource.AGENT)
         message_action_8._source = EventSource.AGENT
+        state.history.append(message_action_8)
 
         with patch('logging.Logger.warning'):
             assert not stuck_detector.is_stuck()
@@ -595,7 +577,6 @@ def controller(self):
         )
         controller.delegate = None
         controller.state = Mock()
-        controller.state.history = ShortTermHistory()
         return controller
 
     def test_is_stuck_delegate_stuck(self, controller: AgentController):
diff --git a/tests/unit/test_llm_config.py b/tests/unit/test_llm_config.py
new file mode 100644
index 000000000000..2fc22d6f2232
--- /dev/null
+++ b/tests/unit/test_llm_config.py
@@ -0,0 +1,228 @@
+import pathlib
+
+import pytest
+
+from openhands.core.config import AppConfig
+from openhands.core.config.utils import load_from_toml
+
+
+@pytest.fixture
+def default_config(monkeypatch):
+    # Fixture to provide a default AppConfig instance
+    yield AppConfig()
+
+
+@pytest.fixture
+def generic_llm_toml(tmp_path: pathlib.Path) -> str:
+    """Fixture to create a generic LLM TOML configuration with all custom LLMs
+    providing mandatory 'model' and 'api_key', and testing fallback to the generic section values
+    for other attributes like 'num_retries'.
+    """
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm]
+model = "base-model"
+api_key = "base-api-key"
+embedding_model = "base-embedding"
+num_retries = 3
+
+[llm.custom1]
+model = "custom-model-1"
+api_key = "custom-api-key-1"
+# 'num_retries' is not overridden and should fallback to the value from [llm]
+
+[llm.custom2]
+model = "custom-model-2"
+api_key = "custom-api-key-2"
+num_retries = 5  # Overridden value
+
+[llm.custom3]
+model = "custom-model-3"
+api_key = "custom-api-key-3"
+# No overrides for additional attributes
+    """
+    toml_file = tmp_path / 'llm_config.toml'
+    toml_file.write_text(toml_content)
+    return str(toml_file)
+
+
+def test_load_from_toml_llm_with_fallback(
+    default_config: AppConfig, generic_llm_toml: str
+) -> None:
+    """Test that custom LLM configurations fallback non-overridden attributes
+    like 'num_retries' from the generic [llm] section.
+    """
+    load_from_toml(default_config, generic_llm_toml)
+
+    # Verify generic LLM configuration
+    generic_llm = default_config.get_llm_config('llm')
+    assert generic_llm.model == 'base-model'
+    assert generic_llm.api_key == 'base-api-key'
+    assert generic_llm.embedding_model == 'base-embedding'
+    assert generic_llm.num_retries == 3
+
+    # Verify custom1 LLM falls back 'num_retries' from base
+    custom1 = default_config.get_llm_config('custom1')
+    assert custom1.model == 'custom-model-1'
+    assert custom1.api_key == 'custom-api-key-1'
+    assert custom1.embedding_model == 'base-embedding'
+    assert custom1.num_retries == 3  # from [llm]
+
+    # Verify custom2 LLM overrides 'num_retries'
+    custom2 = default_config.get_llm_config('custom2')
+    assert custom2.model == 'custom-model-2'
+    assert custom2.api_key == 'custom-api-key-2'
+    assert custom2.embedding_model == 'base-embedding'
+    assert custom2.num_retries == 5  # overridden value
+
+    # Verify custom3 LLM inherits all attributes except 'model' and 'api_key'
+    custom3 = default_config.get_llm_config('custom3')
+    assert custom3.model == 'custom-model-3'
+    assert custom3.api_key == 'custom-api-key-3'
+    assert custom3.embedding_model == 'base-embedding'
+    assert custom3.num_retries == 3  # from [llm]
+
+
+def test_load_from_toml_llm_custom_overrides_all(
+    default_config: AppConfig, tmp_path: pathlib.Path
+) -> None:
+    """Test that a custom LLM can fully override all attributes from the generic [llm] section."""
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm]
+model = "base-model"
+api_key = "base-api-key"
+embedding_model = "base-embedding"
+num_retries = 3
+
+[llm.custom_full]
+model = "full-custom-model"
+api_key = "full-custom-api-key"
+embedding_model = "full-custom-embedding"
+num_retries = 10
+    """
+    toml_file = tmp_path / 'full_override_llm.toml'
+    toml_file.write_text(toml_content)
+
+    load_from_toml(default_config, str(toml_file))
+
+    # Verify generic LLM configuration remains unchanged
+    generic_llm = default_config.get_llm_config('llm')
+    assert generic_llm.model == 'base-model'
+    assert generic_llm.api_key == 'base-api-key'
+    assert generic_llm.embedding_model == 'base-embedding'
+    assert generic_llm.num_retries == 3
+
+    # Verify custom_full LLM overrides all attributes
+    custom_full = default_config.get_llm_config('custom_full')
+    assert custom_full.model == 'full-custom-model'
+    assert custom_full.api_key == 'full-custom-api-key'
+    assert custom_full.embedding_model == 'full-custom-embedding'
+    assert custom_full.num_retries == 10  # overridden value
+
+
+def test_load_from_toml_llm_custom_partial_override(
+    default_config: AppConfig, generic_llm_toml: str
+) -> None:
+    """Test that custom LLM configurations can partially override attributes
+    from the generic [llm] section while inheriting others.
+    """
+    load_from_toml(default_config, generic_llm_toml)
+
+    # Verify custom1 LLM overrides 'model' and 'api_key' but inherits 'num_retries'
+    custom1 = default_config.get_llm_config('custom1')
+    assert custom1.model == 'custom-model-1'
+    assert custom1.api_key == 'custom-api-key-1'
+    assert custom1.embedding_model == 'base-embedding'
+    assert custom1.num_retries == 3  # from [llm]
+
+    # Verify custom2 LLM overrides 'model', 'api_key', and 'num_retries'
+    custom2 = default_config.get_llm_config('custom2')
+    assert custom2.model == 'custom-model-2'
+    assert custom2.api_key == 'custom-api-key-2'
+    assert custom2.embedding_model == 'base-embedding'
+    assert custom2.num_retries == 5  # Overridden value
+
+
+def test_load_from_toml_llm_custom_no_override(
+    default_config: AppConfig, generic_llm_toml: str
+) -> None:
+    """Test that custom LLM configurations with no additional overrides
+    inherit all non-specified attributes from the generic [llm] section.
+    """
+    load_from_toml(default_config, generic_llm_toml)
+
+    # Verify custom3 LLM inherits 'embedding_model' and 'num_retries' from generic
+    custom3 = default_config.get_llm_config('custom3')
+    assert custom3.model == 'custom-model-3'
+    assert custom3.api_key == 'custom-api-key-3'
+    assert custom3.embedding_model == 'base-embedding'
+    assert custom3.num_retries == 3  # from [llm]
+
+
+def test_load_from_toml_llm_missing_generic(
+    default_config: AppConfig, tmp_path: pathlib.Path
+) -> None:
+    """Test that custom LLM configurations without a generic [llm] section
+    use only their own attributes and fallback to defaults for others.
+    """
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm.custom_only]
+model = "custom-only-model"
+api_key = "custom-only-api-key"
+    """
+    toml_file = tmp_path / 'custom_only_llm.toml'
+    toml_file.write_text(toml_content)
+
+    load_from_toml(default_config, str(toml_file))
+
+    # Verify custom_only LLM uses its own attributes and defaults for others
+    custom_only = default_config.get_llm_config('custom_only')
+    assert custom_only.model == 'custom-only-model'
+    assert custom_only.api_key == 'custom-only-api-key'
+    assert custom_only.embedding_model == 'local'  # default value
+    assert custom_only.num_retries == 8  # default value
+
+
+def test_load_from_toml_llm_invalid_config(
+    default_config: AppConfig, tmp_path: pathlib.Path
+) -> None:
+    """Test that invalid custom LLM configurations do not override the generic
+    and raise appropriate warnings.
+    """
+    toml_content = """
+[core]
+workspace_base = "./workspace"
+
+[llm]
+model = "base-model"
+api_key = "base-api-key"
+num_retries = 3
+
+[llm.invalid_custom]
+unknown_attr = "should_not_exist"
+    """
+    toml_file = tmp_path / 'invalid_custom_llm.toml'
+    toml_file.write_text(toml_content)
+
+    load_from_toml(default_config, str(toml_file))
+
+    # Verify generic LLM is loaded correctly
+    generic_llm = default_config.get_llm_config('llm')
+    assert generic_llm.model == 'base-model'
+    assert generic_llm.api_key == 'base-api-key'
+    assert generic_llm.num_retries == 3
+
+    # Verify invalid_custom LLM does not override generic attributes
+    custom_invalid = default_config.get_llm_config('invalid_custom')
+    assert custom_invalid.model == 'base-model'
+    assert custom_invalid.api_key == 'base-api-key'
+    assert custom_invalid.num_retries == 3  # default value
+    assert custom_invalid.embedding_model == 'local'  # default value
diff --git a/tests/unit/test_micro_agents.py b/tests/unit/test_micro_agents.py
index 70553d851125..8cff14fdd4f2 100644
--- a/tests/unit/test_micro_agents.py
+++ b/tests/unit/test_micro_agents.py
@@ -10,10 +10,8 @@
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
-from openhands.events import EventSource
 from openhands.events.action import MessageAction
 from openhands.events.stream import EventStream
-from openhands.memory.history import ShortTermHistory
 from openhands.storage import get_file_store
 
 
@@ -74,10 +72,10 @@ def test_coder_agent_with_summary(event_stream: EventStream, agent_configs: dict
     )
     assert coder_agent is not None
 
+    # give it some history
     task = 'This is a dummy task'
-    history = ShortTermHistory()
-    history.set_event_stream(event_stream)
-    event_stream.add_event(MessageAction(content=task), EventSource.USER)
+    history = list()
+    history.append(MessageAction(content=task))
 
     summary = 'This is a dummy summary about this repo'
     state = State(history=history, inputs={'summary': summary})
@@ -119,10 +117,10 @@ def test_coder_agent_without_summary(event_stream: EventStream, agent_configs: d
     )
     assert coder_agent is not None
 
+    # give it some history
     task = 'This is a dummy task'
-    history = ShortTermHistory()
-    history.set_event_stream(event_stream)
-    event_stream.add_event(MessageAction(content=task), EventSource.USER)
+    history = list()
+    history.append(MessageAction(content=task))
 
     # set state without codebase summary
     state = State(history=history)
diff --git a/tests/unit/test_prompt_caching.py b/tests/unit/test_prompt_caching.py
index 41dc75618746..db4eb2a56dab 100644
--- a/tests/unit/test_prompt_caching.py
+++ b/tests/unit/test_prompt_caching.py
@@ -1,14 +1,12 @@
-from unittest.mock import MagicMock, Mock, patch
+from unittest.mock import Mock, patch
 
 import pytest
 
 from openhands.agenthub.codeact_agent.codeact_agent import CodeActAgent
 from openhands.core.config import AgentConfig, LLMConfig
-from openhands.events import EventSource, EventStream
 from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.llm.llm import LLM
-from openhands.storage import get_file_store
 
 
 @pytest.fixture
@@ -19,29 +17,34 @@ def mock_llm():
     return llm
 
 
-@pytest.fixture
-def mock_event_stream(tmp_path):
-    file_store = get_file_store('local', str(tmp_path))
-    return EventStream('test_session', file_store)
-
-
 @pytest.fixture
 def codeact_agent(mock_llm):
     config = AgentConfig()
     return CodeActAgent(mock_llm, config)
 
 
-def test_get_messages_with_reminder(codeact_agent, mock_event_stream):
-    # Add some events to the stream
-    mock_event_stream.add_event(MessageAction('Initial user message'), EventSource.USER)
-    mock_event_stream.add_event(MessageAction('Sure!'), EventSource.AGENT)
-    mock_event_stream.add_event(MessageAction('Hello, agent!'), EventSource.USER)
-    mock_event_stream.add_event(MessageAction('Hello, user!'), EventSource.AGENT)
-    mock_event_stream.add_event(MessageAction('Laaaaaaaast!'), EventSource.USER)
+def test_get_messages_with_reminder(codeact_agent: CodeActAgent):
+    # Add some events to history
+    history = list()
+    message_action_1 = MessageAction('Initial user message')
+    message_action_1._source = 'user'
+    history.append(message_action_1)
+    message_action_2 = MessageAction('Sure!')
+    message_action_2._source = 'assistant'
+    history.append(message_action_2)
+    message_action_3 = MessageAction('Hello, agent!')
+    message_action_3._source = 'user'
+    history.append(message_action_3)
+    message_action_4 = MessageAction('Hello, user!')
+    message_action_4._source = 'assistant'
+    history.append(message_action_4)
+    message_action_5 = MessageAction('Laaaaaaaast!')
+    message_action_5._source = 'user'
+    history.append(message_action_5)
 
     codeact_agent.reset()
     messages = codeact_agent._get_messages(
-        Mock(history=mock_event_stream, max_iterations=5, iteration=0)
+        Mock(history=history, max_iterations=5, iteration=0)
     )
 
     assert (
@@ -71,19 +74,20 @@ def test_get_messages_with_reminder(codeact_agent, mock_event_stream):
     )
 
 
-def test_get_messages_prompt_caching(codeact_agent, mock_event_stream):
+def test_get_messages_prompt_caching(codeact_agent: CodeActAgent):
+    history = list()
     # Add multiple user and agent messages
     for i in range(15):
-        mock_event_stream.add_event(
-            MessageAction(f'User message {i}'), EventSource.USER
-        )
-        mock_event_stream.add_event(
-            MessageAction(f'Agent message {i}'), EventSource.AGENT
-        )
+        message_action_user = MessageAction(f'User message {i}')
+        message_action_user._source = 'user'
+        history.append(message_action_user)
+        message_action_agent = MessageAction(f'Agent message {i}')
+        message_action_agent._source = 'assistant'
+        history.append(message_action_agent)
 
     codeact_agent.reset()
     messages = codeact_agent._get_messages(
-        Mock(history=mock_event_stream, max_iterations=10, iteration=5)
+        Mock(history=history, max_iterations=10, iteration=5)
     )
 
     # Check that only the last two user messages have cache_prompt=True
@@ -104,15 +108,19 @@ def test_get_messages_prompt_caching(codeact_agent, mock_event_stream):
     assert cached_user_messages[3].content[0].text.startswith('User message 1')
 
 
-def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
+def test_get_messages_with_cmd_action(codeact_agent: CodeActAgent):
+    history = list()
     # Add a mix of actions and observations
     message_action_1 = MessageAction(
         "Let's list the contents of the current directory."
     )
-    mock_event_stream.add_event(message_action_1, EventSource.USER)
+    message_action_1._source = 'user'
+    history.append(message_action_1)
 
     cmd_action_1 = CmdRunAction('ls -l', thought='List files in current directory')
-    mock_event_stream.add_event(cmd_action_1, EventSource.AGENT)
+    cmd_action_1._source = 'agent'
+    cmd_action_1._id = 'cmd_1'
+    history.append(cmd_action_1)
 
     cmd_observation_1 = CmdOutputObservation(
         content='total 0\n-rw-r--r-- 1 user group 0 Jan 1 00:00 file1.txt\n-rw-r--r-- 1 user group 0 Jan 1 00:00 file2.txt',
@@ -120,13 +128,17 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
         command='ls -l',
         exit_code=0,
     )
-    mock_event_stream.add_event(cmd_observation_1, EventSource.USER)
+    cmd_observation_1._source = 'user'
+    history.append(cmd_observation_1)
 
     message_action_2 = MessageAction("Now, let's create a new directory.")
-    mock_event_stream.add_event(message_action_2, EventSource.AGENT)
+    message_action_2._source = 'agent'
+    history.append(message_action_2)
 
     cmd_action_2 = CmdRunAction('mkdir new_directory', thought='Create a new directory')
-    mock_event_stream.add_event(cmd_action_2, EventSource.AGENT)
+    cmd_action_2._source = 'agent'
+    cmd_action_2._id = 'cmd_2'
+    history.append(cmd_action_2)
 
     cmd_observation_2 = CmdOutputObservation(
         content='',
@@ -134,11 +146,12 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
         command='mkdir new_directory',
         exit_code=0,
     )
-    mock_event_stream.add_event(cmd_observation_2, EventSource.USER)
+    cmd_observation_2._source = 'user'
+    history.append(cmd_observation_2)
 
     codeact_agent.reset()
     messages = codeact_agent._get_messages(
-        Mock(history=mock_event_stream, max_iterations=5, iteration=0)
+        Mock(history=history, max_iterations=5, iteration=0)
     )
 
     # Assert the presence of key elements in the messages
@@ -180,16 +193,14 @@ def test_get_messages_with_cmd_action(codeact_agent, mock_event_stream):
     assert 'ENVIRONMENT REMINDER: You have 5 turns' in messages[5].content[1].text
 
 
-def test_prompt_caching_headers(codeact_agent, mock_event_stream):
+def test_prompt_caching_headers(codeact_agent: CodeActAgent):
+    history = list()
     # Setup
-    mock_event_stream.add_event(MessageAction('Hello, agent!'), EventSource.USER)
-    mock_event_stream.add_event(MessageAction('Hello, user!'), EventSource.AGENT)
-
-    mock_short_term_history = MagicMock()
-    mock_short_term_history.get_last_user_message.return_value = 'Hello, agent!'
+    history.append(MessageAction('Hello, agent!'))
+    history.append(MessageAction('Hello, user!'))
 
     mock_state = Mock()
-    mock_state.history = mock_short_term_history
+    mock_state.history = history
     mock_state.max_iterations = 5
     mock_state.iteration = 0
 
diff --git a/tests/unit/test_prompt_manager.py b/tests/unit/test_prompt_manager.py
index 2534f73d3ab8..be20456b539d 100644
--- a/tests/unit/test_prompt_manager.py
+++ b/tests/unit/test_prompt_manager.py
@@ -14,7 +14,7 @@ def prompt_dir(tmp_path):
     shutil.copytree('openhands/agenthub/codeact_agent', tmp_path, dirs_exist_ok=True)
 
     # Return the temporary directory path
-    return tmp_path
+    return str(tmp_path)  # Return string path
 
 
 SAMPLE_AGENT_SKILLS_DOCS = """Sample agent skills documentation"""
@@ -26,10 +26,10 @@ def agent_skills_docs():
 
 
 def test_prompt_manager_without_micro_agent(prompt_dir, agent_skills_docs):
-    manager = PromptManager(prompt_dir, agent_skills_docs)
+    manager = PromptManager(prompt_dir)
 
     assert manager.prompt_dir == prompt_dir
-    assert manager.agent_skills_docs == agent_skills_docs
+    # assert manager.agent_skills_docs == agent_skills_docs
     assert manager.micro_agent is None
 
     assert isinstance(manager.system_message, str)
@@ -37,7 +37,7 @@ def test_prompt_manager_without_micro_agent(prompt_dir, agent_skills_docs):
         "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions."
         in manager.system_message
     )
-    assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
+    # assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
     assert isinstance(manager.initial_user_message, str)
     assert '--- BEGIN OF GUIDELINE ---' not in manager.initial_user_message
     assert '--- END OF GUIDELINE ---' not in manager.initial_user_message
@@ -64,12 +64,11 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):
 
     manager = PromptManager(
         prompt_dir=prompt_dir,
-        agent_skills_docs=agent_skills_docs,
         micro_agent=mock_micro_agent,
     )
 
     assert manager.prompt_dir == prompt_dir
-    assert manager.agent_skills_docs == agent_skills_docs
+    # assert manager.agent_skills_docs == agent_skills_docs
     assert manager.micro_agent == mock_micro_agent
 
     assert isinstance(manager.system_message, str)
@@ -77,7 +76,7 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):
         "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed answers to the user's questions."
         in manager.system_message
     )
-    assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
+    # assert SAMPLE_AGENT_SKILLS_DOCS in manager.system_message
 
     assert isinstance(manager.initial_user_message, str)
     assert (
@@ -106,11 +105,19 @@ def test_prompt_manager_template_rendering(prompt_dir, agent_skills_docs):
     with open(os.path.join(prompt_dir, 'user_prompt.j2'), 'w') as f:
         f.write('User prompt: {{ micro_agent }}')
 
-    manager = PromptManager(prompt_dir, agent_skills_docs)
+    manager = PromptManager(prompt_dir)
 
-    assert manager.system_message == f'System prompt: {agent_skills_docs}'
+    # assert manager.system_message == f'System prompt: {agent_skills_docs}'
     assert manager.initial_user_message == 'User prompt: None'
 
     # Clean up temporary files
     os.remove(os.path.join(prompt_dir, 'system_prompt.j2'))
     os.remove(os.path.join(prompt_dir, 'user_prompt.j2'))
+
+
+def test_prompt_manager_loads_agent_skill(prompt_dir):
+    manager = PromptManager(prompt_dir)
+    assert (
+        'open_file(path: str, line_number: int | None = 1, context_lines: int | None = 100) -> None'
+        in manager.system_message
+    )