Commit

Merge branch 'main' into kevin

SmartManoj committed Feb 3, 2025
2 parents a9fa9cc + e487008 commit c6bee0c
Showing 10 changed files with 933 additions and 18 deletions.
23 changes: 16 additions & 7 deletions evaluation/benchmarks/the_agent_company/README.md
@@ -17,27 +17,36 @@ When the `run_infer.sh` script is started, it will automatically pull all task images

```bash
./evaluation/benchmarks/the_agent_company/scripts/run_infer.sh \
--agent-llm-config <agent-llm-config> \
--env-llm-config <env-llm-config> \
--outputs-path <outputs-path> \
--server-hostname <server-hostname> \
--version <version>
--agent-llm-config <agent-llm-config, defaults to 'agent'> \
--env-llm-config <env-llm-config, defaults to 'env'> \
--outputs-path <outputs-path, defaults to outputs> \
--server-hostname <server-hostname, defaults to localhost> \
--version <version, defaults to 1.0.0> \
--start-percentile <integer from 0 to 99, defaults to 0> \
--end-percentile <integer from 1 to 100, defaults to 100>


# Example
./evaluation/benchmarks/the_agent_company/scripts/run_infer.sh \
--agent-llm-config claude-3-5-sonnet-20240620 \
--env-llm-config claude-3-5-sonnet-20240620 \
--outputs-path outputs \
--server-hostname localhost \
--version 1.0.0
--version 1.0.0 \
--start-percentile 10 \
--end-percentile 20
```

- `agent-llm-config`: the config name for the agent LLM. This should match a config name in `config.toml`. This is the LLM used by the agent (e.g. CodeActAgent).
- `env-llm-config`: the config name for the environment LLM. This should match a config name in `config.toml`. This LLM is used by the chat bots (NPCs) and the LLM-based evaluators.
- `outputs-path`: the path where trajectories and evaluation results are saved.
- `server-hostname`: the hostname of the server that hosts all the web services. It can be `localhost` if you run the evaluation and the services on the same machine. If the services are hosted on a remote machine, you must use that machine's hostname rather than its IP address.
- `version`: the version of the task images to use. Currently, the only supported version is 1.0.0.
- `start-percentile`: the start percentile of the task split; must be an integer between 0 and 99.
- `end-percentile`: the end percentile of the task split; must be an integer between 1 and 100, and it must be greater than `start-percentile`.

The script is idempotent. If you run it again, it will resume from the last checkpoint. It would usually take a few days to finish evaluation.
The script is idempotent. If you run it again, it will resume from the last checkpoint. It usually takes about 2 days to finish evaluation if you run the whole task set.
To speed up evaluation, you can use `start-percentile` and `end-percentile` to split the tasks for higher parallelism,
provided the concurrent runs **target different servers**.
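
For instance, a hypothetical two-way split might look like the following (the hostnames and output paths below are placeholders, and the LLM config flags fall back to their documented defaults):

```bash
# First half of the task set, evaluated against server A
./evaluation/benchmarks/the_agent_company/scripts/run_infer.sh \
  --outputs-path outputs-first-half \
  --server-hostname server-a.example.com \
  --start-percentile 0 \
  --end-percentile 50

# Second half of the task set, evaluated against server B
./evaluation/benchmarks/the_agent_company/scripts/run_infer.sh \
  --outputs-path outputs-second-half \
  --server-hostname server-b.example.com \
  --start-percentile 50 \
  --end-percentile 100
```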

Note: the script automatically skips a task if it encounters an error. This usually happens when the OpenHands runtime dies due to an unexpected error. This means that even if the script finishes, it might not have evaluated all tasks. You can resume the evaluation manually by running the script again.
49 changes: 47 additions & 2 deletions evaluation/benchmarks/the_agent_company/scripts/run_infer.sh
100644 → 100755
@@ -56,6 +56,14 @@ while [[ $# -gt 0 ]]; do
VERSION="$2"
shift 2
;;
--start-percentile)
START_PERCENTILE="$2"
shift 2
;;
--end-percentile)
END_PERCENTILE="$2"
shift 2
;;
*)
echo "Unknown argument: $1"
exit 1
@@ -69,16 +77,53 @@ if [[ ! "$OUTPUTS_PATH" = /* ]]; then
OUTPUTS_PATH="$(cd "$(dirname "$OUTPUTS_PATH")" 2>/dev/null && pwd)/$(basename "$OUTPUTS_PATH")"
fi

: "${START_PERCENTILE:=0}" # Default to 0 percentile (first line)
: "${END_PERCENTILE:=100}" # Default to 100 percentile (last line)

# Validate the percentile range
if ! [[ "$START_PERCENTILE" =~ ^[0-9]+$ ]] || ! [[ "$END_PERCENTILE" =~ ^[0-9]+$ ]]; then
echo "Error: Percentiles must be integers"
exit 1
fi

if [ "$START_PERCENTILE" -ge "$END_PERCENTILE" ]; then
echo "Error: Start percentile must be less than end percentile"
exit 1
fi

if [ "$START_PERCENTILE" -lt 0 ] || [ "$END_PERCENTILE" -gt 100 ]; then
echo "Error: Percentiles must be between 0 and 100"
exit 1
fi

echo "Using agent LLM config: $AGENT_LLM_CONFIG"
echo "Using environment LLM config: $ENV_LLM_CONFIG"
echo "Outputs path: $OUTPUTS_PATH"
echo "Server hostname: $SERVER_HOSTNAME"
echo "Version: $VERSION"
echo "Start Percentile: $START_PERCENTILE"
echo "End Percentile: $END_PERCENTILE"

echo "Downloading tasks.md..."
rm -f tasks.md
wget https://github.com/TheAgentCompany/TheAgentCompany/releases/download/${VERSION}/tasks.md

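# tasks.md lists one Docker image per task; count the images to sanity-check the download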
total_lines=$(grep -c "ghcr.io/theagentcompany" tasks.md)
if [ "$total_lines" -ne 175 ]; then
echo "Error: Expected 175 tasks in tasks.md but found $total_lines lines"
exit 1
fi

# Calculate line numbers based on percentiles
start_line=$(echo "scale=0; ($total_lines * $START_PERCENTILE / 100) + 1" | bc)
end_line=$(echo "scale=0; $total_lines * $END_PERCENTILE / 100" | bc)
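# Example: with total_lines=175, --start-percentile 10 and --end-percentile 20
# give start_line=18 and end_line=35, i.e. tasks 18..35 (18 tasks).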

echo "Using tasks No. $start_line to $end_line (inclusive) out of 1-175 tasks"

# Create a temporary file with just the desired range
temp_file="tasks_${START_PERCENTILE}_${END_PERCENTILE}.md"
sed -n "${start_line},${end_line}p" tasks.md > "$temp_file"

while IFS= read -r task_image; do
docker pull $task_image

@@ -108,8 +153,8 @@ while IFS= read -r task_image; do
docker images "ghcr.io/all-hands-ai/runtime" -q | xargs -r docker rmi -f
docker volume prune -f
docker system prune -f
done < tasks.md
done < "$temp_file"

rm tasks.md
rm tasks.md "$temp_file"

echo "All evaluation completed successfully!"
3 changes: 2 additions & 1 deletion openhands/controller/agent_controller.py
@@ -711,6 +711,7 @@ async def _step(self) -> None:
action = self.agent.step(self.state)
if action is None:
raise LLMNoActionError('No action was returned')
action._source = EventSource.AGENT # type: ignore [attr-defined]
except (
LLMMalformedActionError,
LLMNoActionError,
@@ -769,7 +770,7 @@ async def _step(self) -> None:
== ActionConfirmationStatus.AWAITING_CONFIRMATION
):
await self.set_agent_state_to(AgentState.AWAITING_USER_CONFIRMATION)
self.event_stream.add_event(action, EventSource.AGENT)
self.event_stream.add_event(action, action._source) # type: ignore [attr-defined]
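# Using the action's own source here (instead of hard-coding EventSource.AGENT)
# lets replayed actions, e.g. user messages, keep their original source.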

await self.update_state_after_step()

29 changes: 26 additions & 3 deletions openhands/controller/replay.py
@@ -1,6 +1,8 @@
from openhands.core.logger import openhands_logger as logger
from openhands.events.action.action import Action
from openhands.events.action.message import MessageAction
from openhands.events.event import Event, EventSource
from openhands.events.observation.empty import NullObservation


class ReplayManager:
@@ -15,9 +17,31 @@ class ReplayManager:
initial state of the trajectory.
"""

def __init__(self, replay_events: list[Event] | None):
def __init__(self, events: list[Event] | None):
replay_events = []
for event in events or []:
if event.source == EventSource.ENVIRONMENT:
# ignore ENVIRONMENT events as they are not issued by
# the user or agent, and should not be replayed
continue
if isinstance(event, NullObservation):
# ignore NullObservation
continue
replay_events.append(event)

if replay_events:
logger.info(f'Replay logs loaded, events length = {len(replay_events)}')
logger.info(f'Replay events loaded, events length = {len(replay_events)}')
for index in range(len(replay_events) - 1):
event = replay_events[index]
if isinstance(event, MessageAction) and event.wait_for_response:
# For any message waiting for response that is not the last
# event, we override wait_for_response to False, as a response
# would have been included in the next event, and we don't
# want the user to interfere with the replay process
logger.info(
'Replay events contain a wait_for_response message action; ignoring wait_for_response'
)
event.wait_for_response = False
self.replay_events = replay_events
self.replay_mode = bool(replay_events)
self.replay_index = 0
@@ -27,7 +51,6 @@ def _replayable(self) -> bool:
self.replay_events is not None
and self.replay_index < len(self.replay_events)
and isinstance(self.replay_events[self.replay_index], Action)
and self.replay_events[self.replay_index].source != EventSource.USER
)

def should_replay(self) -> bool:
4 changes: 4 additions & 0 deletions openhands/core/main.py
@@ -231,6 +231,10 @@ def load_replay_log(trajectory_path: str) -> tuple[list[Event] | None, Action]:
events = []
for item in data:
event = event_from_dict(item)
if event.source == EventSource.ENVIRONMENT:
# ignore ENVIRONMENT events as they are not issued by
# the user or agent, and should not be replayed
continue
# cannot add an event with _id to event stream
event._id = None # type: ignore[attr-defined]
events.append(event)
10 changes: 6 additions & 4 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -14,7 +14,7 @@ packages = [
python = "^3.12"
datasets = "*"
pandas = "*"
litellm = "^1.55.4"
litellm = "^1.60.0"
google-generativeai = "*" # To use litellm with Gemini Pro API
google-api-python-client = "*" # For Google Sheets API
google-auth-httplib2 = "*" # For Google Sheets authentication
72 changes: 72 additions & 0 deletions tests/runtime/test_replay.py
@@ -10,6 +10,8 @@
from openhands.core.main import run_controller
from openhands.core.schema.agent import AgentState
from openhands.events.action.empty import NullAction
from openhands.events.action.message import MessageAction
from openhands.events.event import EventSource
from openhands.events.observation.commands import CmdOutputObservation


@@ -46,6 +48,36 @@ def test_simple_replay(temp_dir, runtime_cls, run_as_openhands):
_close_test_runtime(runtime)


def test_simple_gui_replay(temp_dir, runtime_cls, run_as_openhands):
"""
A simple replay test that involves simple terminal operations and edits
(writing a Vue.js app), using the default agent.
Note:
1. This trajectory is exported from GUI mode, meaning it has extra
environmental actions that don't appear in headless mode's trajectories
2. In GUI mode, agents typically don't finish; rather, they wait for the next
task from the user, so this exported trajectory ends with awaiting_user_input
"""
runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)

config = _get_config('basic_gui_mode')

state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=NullAction(),
runtime=runtime,
# exit on message, otherwise this would get stuck waiting for user input
exit_on_message=True,
)
)

assert state.agent_state == AgentState.FINISHED

_close_test_runtime(runtime)


def test_replay_wrong_initial_state(temp_dir, runtime_cls, run_as_openhands):
"""
Replay requires a consistent initial state to start with, otherwise it might
@@ -78,3 +110,43 @@ def test_replay_wrong_initial_state(temp_dir, runtime_cls, run_as_openhands):
assert has_error_in_action

_close_test_runtime(runtime)


def test_replay_basic_interactions(temp_dir, runtime_cls, run_as_openhands):
"""
Replay a trajectory that involves interactions, i.e. with user messages
in the middle. This tests two things:
1) The controller should be able to replay all actions without human
interference (no asking for user input).
2) The user messages in the trajectory should appear in the history.
"""
runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)

config = _get_config('basic_interactions')

state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=NullAction(),
runtime=runtime,
)
)

assert state.agent_state == AgentState.FINISHED

# all user messages appear in the history, so that after a replay (assuming
# the trajectory doesn't end with a `finish` action), the LLM knows all the
# context and can continue
user_messages = [
"what's 1+1?",
"No, I mean by Goldbach's conjecture!",
'Finish please',
]
i = 0
for event in state.history:
if isinstance(event, MessageAction) and event._source == EventSource.USER:
assert event.message == user_messages[i]
i += 1
assert i == len(user_messages)

_close_test_runtime(runtime)
