Merge branch 'main' into ALL-556/wait-for-events

All-Hands-AI · Nov 14, 2024 · 6abe70d
2 parents c26757d + fac5237
commit 6abe70d
Show file tree

Hide file tree

Showing 14 changed files with 88 additions and 19 deletions.
diff --git a/.github/workflows/openhands-resolver.yml b/.github/workflows/openhands-resolver.yml
@@ -150,6 +150,7 @@ jobs:
       - name: Attempt to resolve issue
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
           LLM_MODEL: ${{ secrets.LLM_MODEL }}
           LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
           LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
@@ -182,6 +183,7 @@ jobs:
       - name: Create draft PR or push branch
         env:
           GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
+          GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
           LLM_MODEL: ${{ secrets.LLM_MODEL }}
           LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
           LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}

diff --git a/evaluation/aider_bench/README.md b/evaluation/aider_bench/README.md
@@ -56,6 +56,20 @@ You can update the arguments in the script
 ./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"
 ```
 
+### Run Inference on `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over Slack if you want to try this out!
+
+```bash
+./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
+
+# Example - This runs evaluation on CodeActAgent for 133 instances on the aider_bench test set, with 2 workers running in parallel
+export ALLHANDS_API_KEY="YOUR-API-KEY"
+export RUNTIME=remote
+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
+./evaluation/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2
+```
+
 ## Summarize Results
 
 ```bash

diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py
@@ -58,6 +58,9 @@ def get_config(
             use_host_network=False,
             timeout=100,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=1800,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/miniwob/README.md b/evaluation/miniwob/README.md
@@ -16,6 +16,20 @@ Access with browser the above MiniWoB URLs and see if they load correctly.
 ./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval
 ```
 
+### Run Inference on `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over Slack if you want to try this out!
+
+```bash
+./evaluation/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers]
+
+# Example - This runs evaluation on BrowsingAgent for 125 instances on miniwob, with 2 workers running in parallel
+export ALLHANDS_API_KEY="YOUR-API-KEY"
+export RUNTIME=remote
+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
+./evaluation/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2
+```
+
 Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
 
 To calculate the average reward, run:

diff --git a/evaluation/miniwob/__init__.py b/evaluation/miniwob/__init__.py
diff --git a/evaluation/miniwob/get_avg_reward.py b/evaluation/miniwob/get_avg_reward.py
@@ -23,7 +23,7 @@
             data = json.loads(line)
             actual_num += 1
             total_cost += data['metrics']['accumulated_cost']
-            total_reward += data['test_result']
+            total_reward += data['test_result']['reward']
 
     avg_reward = total_reward / total_num
     print('Avg Reward: ', avg_reward)

diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
@@ -47,6 +47,7 @@
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
+    'BrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
 }
 
 
@@ -66,7 +67,9 @@ def get_config(
             browsergym_eval_env=env_id,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            remote_runtime_init_timeout=1800,
             keep_runtime_alive=False,
+            timeout=120,
         ),
         # do not mount workspace
         workspace_base=None,

diff --git a/evaluation/miniwob/scripts/run_infer.sh b/evaluation/miniwob/scripts/run_infer.sh
@@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
 
-COMMAND="poetry run python evaluation/miniwob/run_infer.py \
+COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
   --max-iterations 10 \

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -148,12 +148,14 @@ def get_action_message(
             action,
             (
                 AgentDelegateAction,
-                CmdRunAction,
                 IPythonRunCellAction,
                 FileEditAction,
                 BrowseInteractiveAction,
             ),
-        ) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
+        ) or (
+            isinstance(action, (AgentFinishAction, CmdRunAction))
+            and action.source == 'agent'
+        ):
             tool_metadata = action.tool_call_metadata
             assert tool_metadata is not None, (
                 'Tool call metadata should NOT be None when function calling is enabled. Action: '
@@ -184,6 +186,14 @@ def get_action_message(
                     content=content,
                 )
             ]
+        elif isinstance(action, CmdRunAction) and action.source == 'user':
+            content = [TextContent(text=f'User executed the command:\n{action.command}')]
+            return [
+                Message(
+                    role='user',
+                    content=content,
+                )
+            ]
         return []
 
     def get_observation_message(
@@ -219,9 +229,16 @@ def get_observation_message(
         message: Message
         max_message_chars = self.llm.config.max_message_chars
         if isinstance(obs, CmdOutputObservation):
-            text = truncate_content(
-                obs.content + obs.interpreter_details, max_message_chars
-            )
+            # if it doesn't have tool call metadata, it was triggered by a user action
+            if obs.tool_call_metadata is None:
+                text = truncate_content(
+                    f'\nObserved result of command executed by user:\n{obs.content}',
+                    max_message_chars,
+                )
+            else:
+                text = truncate_content(
+                    obs.content + obs.interpreter_details, max_message_chars
+                )
             text += f'\n[Command finished with exit code {obs.exit_code}]'
             message = Message(role='user', content=[TextContent(text=text)])
         elif isinstance(obs, IPythonRunCellObservation):

diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
@@ -65,6 +65,7 @@ class AgentController:
     parent: 'AgentController | None' = None
     delegate: 'AgentController | None' = None
     _pending_action: Action | None = None
+    _closed: bool = False
     filter_out: ClassVar[tuple[type[Event], ...]] = (
         NullAction,
         NullObservation,
@@ -160,6 +161,7 @@ async def close(self):
 
         # unsubscribe from the event stream
         self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER, self.id)
+        self._closed = True
 
     def log(self, level: str, message: str, extra: dict | None = None):
         """Logs a message to the agent controller's logger.
@@ -194,6 +196,8 @@ async def start_step_loop(self):
 
         self.log('info', 'Starting step loop...')
         while should_continue():
+            if self._closed:
+                break
             try:
                 await self._step()
             except asyncio.CancelledError:

diff --git a/openhands/runtime/impl/eventstream/eventstream_runtime.py b/openhands/runtime/impl/eventstream/eventstream_runtime.py
@@ -111,6 +111,9 @@ def __del__(self):
     def close(self, timeout: float = 5.0):
         self._stop_event.set()
         self.log_stream_thread.join(timeout)
+        # Close the log generator to release the file descriptor
+        if hasattr(self.log_generator, 'close'):
+            self.log_generator.close()
 
 
 class EventStreamRuntime(Runtime):
@@ -232,6 +235,8 @@ async def connect(self):
                 f'Container started: {self.container_name}. VSCode URL: {self.vscode_url}',
             )
 
+        self.log_buffer = LogBuffer(self.container, self.log)
+
         if not self.attach_to_existing:
             self.log('info', f'Waiting for client to become ready at {self.api_url}...')
             self.send_status_message('STATUS$WAITING_FOR_CLIENT')
@@ -358,7 +363,6 @@ def _init_container(self):
                 environment=environment,
                 volumes=volumes,
             )
-            self.log_buffer = LogBuffer(self.container, self.log)
             self.log('debug', f'Container started. Server url: {self.api_url}')
             self.send_status_message('STATUS$CONTAINER_STARTED')
         except docker.errors.APIError as e:
@@ -385,11 +389,9 @@ def _init_container(self):
             raise e
 
     def _attach_to_container(self):
-        container = self.docker_client.containers.get(self.container_name)
-        self.log_buffer = LogBuffer(container, self.log)
-        self.container = container
         self._container_port = 0
-        for port in container.attrs['NetworkSettings']['Ports']:
+        self.container = self.docker_client.containers.get(self.container_name)
+        for port in self.container.attrs['NetworkSettings']['Ports']:  # type: ignore
             self._container_port = int(port.split('/')[0])
             break
         self._host_port = self._container_port

diff --git a/openhands/runtime/utils/request.py b/openhands/runtime/utils/request.py
@@ -58,5 +58,9 @@ def send_request(
     **kwargs: Any,
 ) -> requests.Response:
     response = session.request(method, url, **kwargs)
-    response.raise_for_status()
+    try:
+        response.raise_for_status()
+    finally:
+        response.close()
+
     return response
diff --git a/openhands/server/github.py b/openhands/server/github.py
@@ -115,13 +115,15 @@ async def get_github_user(token: str) -> str:
         github handle of the user
     """
     logger.debug('Fetching GitHub user info from token')
+    g = Github(token)
     try:
-        g = Github(token)
         user = await call_sync_from_async(g.get_user)
-        login = user.login
-        logger.info(f'Successfully retrieved GitHub user: {login}')
-        return login
     except GithubException as e:
         logger.error(f'Error making request to GitHub API: {str(e)}')
         logger.error(e)
         raise
+    finally:
+        g.close()
+    login = user.login
+    logger.info(f'Successfully retrieved GitHub user: {login}')
+    return login
diff --git a/openhands/server/session/session.py b/openhands/server/session/session.py
@@ -51,7 +51,12 @@ def __init__(
 
     def close(self):
         self.is_alive = False
-        self.agent_session.close()
+        try:
+            if self.websocket is not None:
+                asyncio.run_coroutine_threadsafe(self.websocket.close(), self.loop)
+                self.websocket = None
+        finally:
+            self.agent_session.close()
 
     async def loop_recv(self):
         try:
@@ -107,7 +112,6 @@ async def _initialize_agent(self, data: dict):
         agent_config = self.config.get_agent_config(agent_cls)
         agent = Agent.get_cls(agent_cls)(llm, agent_config)
 
-        # Create the agent session
         try:
             await self.agent_session.start(
                 runtime_name=self.config.runtime,