
[Experimental] Memory-enabled agent #4510

Closed
wants to merge 101 commits
3775ce2
add memory-enabled agent
enyst Oct 13, 2024
744be40
Merge branch 'main' of github.com:All-Hands-AI/OpenHands into memory-…
enyst Oct 13, 2024
6f004c8
register agent
enyst Oct 13, 2024
97ef06f
try to use a list of events as history (ATTN will require tricks with…
enyst Oct 4, 2024
21f82e1
reset branch, tweak stream.py
enyst Oct 15, 2024
abda3f4
wip refactor methods
enyst Oct 15, 2024
0a7fb43
move compatibility method to evals
enyst Oct 16, 2024
981335c
retrieve history in the controller
enyst Oct 16, 2024
1ee26d7
adapt code to list
enyst Oct 16, 2024
267f3be
add filter by hidden
enyst Oct 16, 2024
9e5659c
remove history.py
enyst Oct 16, 2024
59c16d4
Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/e…
enyst Oct 16, 2024
6fc615f
fix types
enyst Oct 16, 2024
3a81363
refactoring in evals
enyst Oct 16, 2024
66f78d5
more adaptations in evals
enyst Oct 16, 2024
1de7b2b
rewrite history
enyst Oct 16, 2024
adc960f
actually remove history
enyst Oct 16, 2024
fac01d1
adapt stuck
enyst Oct 16, 2024
5eb3322
more adaptations
enyst Oct 16, 2024
21ede6d
fix delegate exclusion
enyst Oct 17, 2024
5f19a7c
create a delegate obs when the delegate ends with an error
enyst Oct 17, 2024
526190c
Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/e…
enyst Oct 17, 2024
696f5d1
fix merge
enyst Oct 17, 2024
fccc9f7
Merge branch 'enyst/eventstream-state' into memory-agent
enyst Oct 17, 2024
611d0e4
use event.id in memory, fix merge
enyst Oct 17, 2024
df3f0b6
wip add memory modules
enyst Oct 18, 2024
7b0a835
add get_last_user_message
enyst Oct 18, 2024
9e1cdcf
Merge branch 'enyst/eventstream-state' into memory-agent
enyst Oct 19, 2024
34a7b70
set user message
enyst Oct 19, 2024
cb60751
fix imports
enyst Oct 19, 2024
c235d61
fix objects
enyst Oct 19, 2024
7affbfd
add prompts
enyst Oct 19, 2024
386b835
rename, delete module we won't use
enyst Oct 19, 2024
bf8412a
fix prompting
enyst Oct 19, 2024
5e572db
tweaks to types
enyst Oct 19, 2024
225d330
added summarizer
khushvind Oct 20, 2024
4aedbc2
clean up duplicate
enyst Oct 20, 2024
2353c30
tweak prompts
enyst Oct 20, 2024
e04f77a
add action parser
enyst Oct 20, 2024
b77961b
added summary response
khushvind Oct 20, 2024
20c9fa8
added summary prompt
khushvind Oct 20, 2024
d36917b
tweak prompt
enyst Oct 20, 2024
143f16d
add strings
enyst Oct 20, 2024
fb90459
summarize and recall
enyst Oct 20, 2024
aad59fc
fix update
enyst Oct 20, 2024
10293e6
add these actions to history; in-context example
enyst Oct 20, 2024
1bf2d08
fix llm_config fallback
enyst Oct 15, 2024
16da4e2
unit tests
enyst Oct 16, 2024
53f7a78
fix schemas, utils
enyst Oct 20, 2024
f4ecd3a
add litellm embeddings for testing
enyst Oct 20, 2024
11b3242
fix var, run all stream embeddings on llama-index
enyst Oct 20, 2024
fcdfb19
add voyage ai embeddings
enyst Oct 21, 2024
8442841
fix template include
enyst Oct 21, 2024
67693a5
core memory split
enyst Oct 21, 2024
9ac47bf
tweak prompts
enyst Oct 21, 2024
5ad9ef4
fix leftover calls
enyst Oct 21, 2024
3742431
fix parser (o1 !!)
enyst Oct 21, 2024
083edd4
configurations wip
enyst Oct 21, 2024
b631e53
fixes; debugging test
enyst Oct 21, 2024
a060cbb
fix condensation; add debugging
enyst Oct 22, 2024
a25a867
add tokenizer from HF
enyst Oct 22, 2024
2a448f2
adapt action, prompt, some clean up logic
enyst Oct 22, 2024
6f9c922
remove eval script
enyst Oct 22, 2024
81b19c2
add script for testing, clean up obsolete content
enyst Oct 22, 2024
a858083
break down prompts; tweak core memory; rewrite algo
enyst Oct 23, 2024
1d582ac
fix tokenizer
enyst Oct 23, 2024
106bbb5
ruff
enyst Oct 24, 2024
b93c818
tweak template
enyst Oct 24, 2024
e75a489
add agent skills and yaml
enyst Oct 24, 2024
5df104d
break down agent skills
enyst Oct 24, 2024
7930457
create examples template
enyst Oct 24, 2024
9629a73
fix template loading
enyst Oct 24, 2024
bbd5211
remove obsolete md
enyst Oct 24, 2024
e2c343a
fix useless vars
enyst Oct 24, 2024
bf9b8ac
kill some whitespace
enyst Oct 24, 2024
6732359
strange leftover from another branch
enyst Oct 24, 2024
ada2ebd
tweak agent skill display
enyst Oct 24, 2024
1df7aaa
add user-defined template directory
enyst Oct 24, 2024
6141d0b
Merge branch 'enyst/refactor_template' into enyst/memory-agent
enyst Oct 24, 2024
4efcc02
ruff
enyst Oct 24, 2024
6f282b9
fix user prompt; bad coverage
enyst Oct 24, 2024
274ad61
Merge branch 'enyst/refactor_template' into enyst/memory-agent
enyst Oct 24, 2024
ad0b9b2
Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/e…
enyst Oct 24, 2024
11d82f2
save events as they happen
enyst Oct 25, 2024
54f60ac
clean up obsolete config var - sessions are always saved if filestore…
enyst Oct 25, 2024
d4d3aa0
Merge branch 'enyst/eventstream-state' into enyst/memory-agent
enyst Oct 25, 2024
99a257c
Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/e…
enyst Oct 25, 2024
c6a9028
init history for restored state
enyst Oct 26, 2024
9af6e5e
not worth caching delegates if only used once or twice per session
enyst Oct 26, 2024
04b6d70
init history from the event stream
enyst Oct 26, 2024
34e0f8a
remove script that got here by accident
enyst Oct 27, 2024
94c68be
save/restore state automatically
enyst Oct 27, 2024
93cfd32
tweak init/restore
enyst Oct 27, 2024
cfc158d
Merge branch 'main' of github.com:All-Hands-AI/OpenHands into enyst/e…
enyst Oct 27, 2024
f53e1cf
set delegates start explicitly; minor tweaks
enyst Oct 27, 2024
f42cbed
fix tests
enyst Oct 27, 2024
ebeab75
clean up verbose log
enyst Oct 27, 2024
a213c65
make extra sure we have a valid start
enyst Oct 27, 2024
41c03ad
Merge branch 'enyst/eventstream-state' into enyst/memory-agent
enyst Oct 27, 2024
63284d3
poetry lock
enyst Oct 27, 2024
4d05ab1
update summarize prompt
enyst Oct 31, 2024
18 changes: 18 additions & 0 deletions config.template.toml
@@ -171,6 +171,24 @@ model = "gpt-4o"
# If model is vision capable, this option allows to disable image processing (useful for cost reduction).
#disable_vision = true

# maximum number of messages in a conversation, after which they are truncated or summarized
# max_conversation_window = 10

# number of results when recalling message history
# conversation_top_k = 5

# fraction of the conversation window to summarize
# message_summary_trunc_tokens_fraction = 0.75

# summary LLM
[llm.summary]
model = "deepseek"

# default LLM
[llm.default]
model = "claude"


[llm.gpt4o-mini]
api_key = "your-api-key"
model = "gpt-4o"
7 changes: 4 additions & 3 deletions evaluation/EDA/run_infer.py
@@ -8,6 +8,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -34,7 +35,7 @@ def codeact_user_response_eda(state: State) -> str:

# retrieve the latest model message from history
if state.history:
model_guess = state.history.get_last_agent_message()
model_guess = state.get_last_agent_message()

assert game is not None, 'Game is not initialized.'
msg = game.generate_user_response(model_guess)
@@ -139,7 +140,7 @@ def process_instance(
if state is None:
raise ValueError('State should not be None.')

final_message = state.history.get_last_agent_message()
final_message = state.get_last_agent_message()

logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
test_result = game.reward()
@@ -148,7 +149,7 @@
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
5 changes: 3 additions & 2 deletions evaluation/agent_bench/run_infer.py
@@ -16,6 +16,7 @@
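The recurring diff comment explains that history is now a flat stream of events, and `compatibility_for_eval_history_pairs` re-creates the legacy list of (Action, Observation) pairs for the eval output format. A rough sketch of what such a helper could do, using stand-in event classes rather than the real OpenHands types:

```python
from dataclasses import dataclass


@dataclass
class Event:
    source: str
    content: str


class Action(Event): ...
class Observation(Event): ...


def compatibility_for_eval_history_pairs(history: list[Event]) -> list[tuple[dict, dict]]:
    """Re-pair a flat event stream into legacy (action, observation) tuples."""
    pairs: list[tuple[dict, dict]] = []
    pending_action = None
    for event in history:
        if isinstance(event, Action):
            # A new action closes any previous action that got no observation.
            if pending_action is not None:
                pairs.append((vars(pending_action), {}))
            pending_action = event
        elif isinstance(event, Observation) and pending_action is not None:
            pairs.append((vars(pending_action), vars(event)))
            pending_action = None
    if pending_action is not None:
        pairs.append((vars(pending_action), {}))
    return pairs
```

The real helper lives in `evaluation/utils/shared.py` per the imports above; the pairing rule here (an action with no observation pairs with an empty dict) is an assumption.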
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -242,7 +243,7 @@ def process_instance(
raw_ans = ''

# retrieve the last agent message or thought
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if event.source == 'agent':
if isinstance(event, AgentFinishAction):
raw_ans = event.thought
@@ -271,7 +272,7 @@
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

metrics = state.metrics.get() if state.metrics else None

3 changes: 2 additions & 1 deletion evaluation/aider_bench/run_infer.py
@@ -15,6 +15,7 @@
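Several hunks replace `state.history.get_events(reverse=True)` with a plain `reversed(state.history)` scan for the agent's final answer. The pattern can be sketched as follows, with simplified stand-in event types (the real code prefers `AgentFinishAction.thought` over a plain message, as the agent_bench hunk shows):

```python
from dataclasses import dataclass


@dataclass
class MessageAction:
    source: str
    content: str


@dataclass
class AgentFinishAction:
    source: str
    thought: str


def last_agent_answer(history: list) -> str:
    """Walk the event list backwards and return the agent's final output."""
    for event in reversed(history):
        if getattr(event, "source", None) != "agent":
            continue
        if isinstance(event, AgentFinishAction):
            return event.thought
        if isinstance(event, MessageAction):
            return event.content
    return ""
```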
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -250,7 +251,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)
metrics = state.metrics.get() if state.metrics else None

# Save the output
3 changes: 2 additions & 1 deletion evaluation/biocoder/run_infer.py
@@ -13,6 +13,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -299,7 +300,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

test_result['generated'] = test_result['metadata']['1_copy_change_code']

5 changes: 3 additions & 2 deletions evaluation/bird/run_infer.py
@@ -16,6 +16,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -46,7 +47,7 @@ def codeact_user_response(state: State) -> str:
# check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
user_msgs = [
event
for event in state.history.get_events()
for event in state.history
if isinstance(event, MessageAction) and event.source == 'user'
]
if len(user_msgs) > 2:
@@ -431,7 +432,7 @@ def execute_sql(db_path, sql):
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
3 changes: 2 additions & 1 deletion evaluation/browsing_delegation/run_infer.py
@@ -9,6 +9,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -89,7 +90,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# find the last delegate action
last_delegate_action = None
5 changes: 3 additions & 2 deletions evaluation/gaia/run_infer.py
@@ -12,6 +12,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -166,7 +167,7 @@ def process_instance(

model_answer_raw = ''
# get the last message or thought from the agent
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if event.source == 'agent':
if isinstance(event, AgentFinishAction):
model_answer_raw = event.thought
@@ -203,7 +204,7 @@
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
5 changes: 3 additions & 2 deletions evaluation/gorilla/run_infer.py
@@ -10,6 +10,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -101,7 +102,7 @@ def process_instance(
raise ValueError('State should not be None.')

# retrieve the last message from the agent
model_answer_raw = state.history.get_last_agent_message()
model_answer_raw = state.get_last_agent_message()

# attempt to parse model_answer
ast_eval_fn = instance['ast_eval']
@@ -114,7 +115,7 @@
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

output = EvalOutput(
instance_id=instance_id,
5 changes: 3 additions & 2 deletions evaluation/gpqa/run_infer.py
@@ -28,6 +28,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -244,7 +245,7 @@ def process_instance(
'C': False,
'D': False,
}
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if (
isinstance(event, AgentFinishAction)
and event.source != 'user'
@@ -300,7 +301,7 @@
instance_id=str(instance.instance_id),
instruction=instruction,
metadata=metadata,
history=state.history.compatibility_for_eval_history_pairs(),
history=compatibility_for_eval_history_pairs(state.history),
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
3 changes: 2 additions & 1 deletion evaluation/humanevalfix/run_infer.py
@@ -21,6 +21,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -255,7 +256,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
2 changes: 1 addition & 1 deletion evaluation/integration_tests/run_infer.py
@@ -122,7 +122,7 @@ def process_instance(
# # result evaluation
# # =============================================

histories = state.history.get_events()
histories = state.history
test_result: TestResult = test_class.verify_result(runtime, histories)
metrics = state.metrics.get() if state.metrics else None

5 changes: 3 additions & 2 deletions evaluation/logic_reasoning/run_infer.py
@@ -8,6 +8,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -225,7 +226,7 @@ def process_instance(
raise ValueError('State should not be None.')

final_message = ''
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if isinstance(event, AgentFinishAction):
final_message = event.thought
break
@@ -247,7 +248,7 @@
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
5 changes: 3 additions & 2 deletions evaluation/miniwob/run_infer.py
@@ -10,6 +10,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -152,7 +153,7 @@ def process_instance(

# Instruction is the first message from the USER
instruction = ''
for event in state.history.get_events():
for event in state.history:
if isinstance(event, MessageAction):
instruction = event.content
break
@@ -164,7 +165,7 @@
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
9 changes: 7 additions & 2 deletions evaluation/mint/run_infer.py
@@ -13,6 +13,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -28,6 +29,7 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import (
Action,
CmdRunAction,
MessageAction,
)
@@ -45,7 +47,10 @@ def codeact_user_response_mint(state: State, task: Task, task_config: dict[str,
task=task,
task_config=task_config,
)
last_action = state.history.get_last_action()
last_action = next(
(event for event in reversed(state.history) if isinstance(event, Action)),
None,
)
result_state: TaskState = env.step(last_action.message or '')

state.extra_data['task_state'] = result_state
@@ -202,7 +207,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
3 changes: 2 additions & 1 deletion evaluation/ml_bench/run_infer.py
@@ -24,6 +24,7 @@
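The mint hunk replaces `state.history.get_last_action()` with a `next()` over the reversed event list. The same pattern generalizes to a small helper (hypothetical, for illustration; not part of the PR):

```python
def last_event_of_type(history: list, event_type: type):
    """Return the most recent event of the given type, or None if absent."""
    return next(
        (event for event in reversed(history) if isinstance(event, event_type)),
        None,
    )

# e.g. last_action = last_event_of_type(state.history, Action)
```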
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
@@ -256,7 +257,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
3 changes: 2 additions & 1 deletion evaluation/swe_bench/run_infer.py
@@ -430,7 +430,8 @@ def process_instance(
if state is None:
raise ValueError('State should not be None.')

histories = [event_to_dict(event) for event in state.history.get_events()]
# NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
histories = [event_to_dict(event) for event in state.history]
metrics = state.metrics.get() if state.metrics else None

# Save the output
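The swe_bench hunk serializes each history event with `event_to_dict` before saving the output. A minimal sketch of such a serializer over dataclass-style events (assumed shape, not the real `openhands.events` implementation):

```python
import dataclasses


@dataclasses.dataclass
class MessageAction:
    source: str
    content: str


def event_to_dict(event) -> dict:
    """Serialize an event to a plain dict, tagging it with its class name."""
    data = (
        dataclasses.asdict(event)
        if dataclasses.is_dataclass(event)
        else dict(vars(event))
    )
    data["kind"] = type(event).__name__
    return data

# event_to_dict(MessageAction("agent", "done"))
# -> {"source": "agent", "content": "done", "kind": "MessageAction"}
```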