Skip to content

Commit

Permalink
Merge branch 'main' into ALL-556/wait-for-events
Browse files Browse the repository at this point in the history
  • Loading branch information
mamoodi authored Nov 14, 2024
2 parents c26757d + fac5237 commit 6abe70d
Show file tree
Hide file tree
Showing 14 changed files with 88 additions and 19 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/openhands-resolver.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ jobs:
- name: Attempt to resolve issue
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
Expand Down Expand Up @@ -182,6 +183,7 @@ jobs:
- name: Create draft PR or push branch
env:
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
GITHUB_USERNAME: ${{ secrets.PAT_USERNAME }}
LLM_MODEL: ${{ secrets.LLM_MODEL }}
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
Expand Down
14 changes: 14 additions & 0 deletions evaluation/aider_bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,20 @@ You can update the arguments in the script
./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"
```

### Run Inference on `RemoteRuntime` (experimental)

This is in limited beta. Contact Xingyao over Slack if you want to try this out!

```bash
./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]

# Example - This runs evaluation on CodeActAgent for 133 instances on the aider_bench test set, with 2 workers running in parallel
export ALLHANDS_API_KEY="YOUR-API-KEY"
export RUNTIME=remote
export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
./evaluation/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2
```

## Summarize Results

```bash
Expand Down
3 changes: 3 additions & 0 deletions evaluation/aider_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ def get_config(
use_host_network=False,
timeout=100,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
keep_runtime_alive=False,
remote_runtime_init_timeout=1800,
),
# do not mount workspace
workspace_base=None,
Expand Down
14 changes: 14 additions & 0 deletions evaluation/miniwob/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,20 @@ Access with browser the above MiniWoB URLs and see if they load correctly.
./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval
```

### Run Inference on `RemoteRuntime` (experimental)

This is in limited beta. Contact Xingyao over Slack if you want to try this out!

```bash
./evaluation/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers]

# Example - This runs evaluation on BrowsingAgent for 125 instances on miniwob, with 2 workers running in parallel
export ALLHANDS_API_KEY="YOUR-API-KEY"
export RUNTIME=remote
export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
./evaluation/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2
```

Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`

To calculate the average reward, run:
Expand Down
Empty file removed evaluation/miniwob/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion evaluation/miniwob/get_avg_reward.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
data = json.loads(line)
actual_num += 1
total_cost += data['metrics']['accumulated_cost']
total_reward += data['test_result']
total_reward += data['test_result']['reward']

avg_reward = total_reward / total_num
print('Avg Reward: ', avg_reward)
Expand Down
3 changes: 3 additions & 0 deletions evaluation/miniwob/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
'BrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
}


Expand All @@ -66,7 +67,9 @@ def get_config(
browsergym_eval_env=env_id,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
remote_runtime_init_timeout=1800,
keep_runtime_alive=False,
timeout=120,
),
# do not mount workspace
workspace_base=None,
Expand Down
2 changes: 1 addition & 1 deletion evaluation/miniwob/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="${AGENT_VERSION}_${NOTE}"

COMMAND="poetry run python evaluation/miniwob/run_infer.py \
COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
Expand Down
27 changes: 22 additions & 5 deletions openhands/agenthub/codeact_agent/codeact_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,12 +148,14 @@ def get_action_message(
action,
(
AgentDelegateAction,
CmdRunAction,
IPythonRunCellAction,
FileEditAction,
BrowseInteractiveAction,
),
) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
) or (
isinstance(action, (AgentFinishAction, CmdRunAction))
and action.source == 'agent'
):
tool_metadata = action.tool_call_metadata
assert tool_metadata is not None, (
'Tool call metadata should NOT be None when function calling is enabled. Action: '
Expand Down Expand Up @@ -184,6 +186,14 @@ def get_action_message(
content=content,
)
]
elif isinstance(action, CmdRunAction) and action.source == 'user':
content = [TextContent(text=f'User executed the command:\n{action.command}')]
return [
Message(
role='user',
content=content,
)
]
return []

def get_observation_message(
Expand Down Expand Up @@ -219,9 +229,16 @@ def get_observation_message(
message: Message
max_message_chars = self.llm.config.max_message_chars
if isinstance(obs, CmdOutputObservation):
text = truncate_content(
obs.content + obs.interpreter_details, max_message_chars
)
# if it doesn't have tool call metadata, it was triggered by a user action
if obs.tool_call_metadata is None:
text = truncate_content(
f'\nObserved result of command executed by user:\n{obs.content}',
max_message_chars,
)
else:
text = truncate_content(
obs.content + obs.interpreter_details, max_message_chars
)
text += f'\n[Command finished with exit code {obs.exit_code}]'
message = Message(role='user', content=[TextContent(text=text)])
elif isinstance(obs, IPythonRunCellObservation):
Expand Down
4 changes: 4 additions & 0 deletions openhands/controller/agent_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class AgentController:
parent: 'AgentController | None' = None
delegate: 'AgentController | None' = None
_pending_action: Action | None = None
_closed: bool = False
filter_out: ClassVar[tuple[type[Event], ...]] = (
NullAction,
NullObservation,
Expand Down Expand Up @@ -160,6 +161,7 @@ async def close(self):

# unsubscribe from the event stream
self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER, self.id)
self._closed = True

def log(self, level: str, message: str, extra: dict | None = None):
"""Logs a message to the agent controller's logger.
Expand Down Expand Up @@ -194,6 +196,8 @@ async def start_step_loop(self):

self.log('info', 'Starting step loop...')
while should_continue():
if self._closed:
break
try:
await self._step()
except asyncio.CancelledError:
Expand Down
12 changes: 7 additions & 5 deletions openhands/runtime/impl/eventstream/eventstream_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ def __del__(self):
def close(self, timeout: float = 5.0):
self._stop_event.set()
self.log_stream_thread.join(timeout)
# Close the log generator to release the file descriptor
if hasattr(self.log_generator, 'close'):
self.log_generator.close()


class EventStreamRuntime(Runtime):
Expand Down Expand Up @@ -232,6 +235,8 @@ async def connect(self):
f'Container started: {self.container_name}. VSCode URL: {self.vscode_url}',
)

self.log_buffer = LogBuffer(self.container, self.log)

if not self.attach_to_existing:
self.log('info', f'Waiting for client to become ready at {self.api_url}...')
self.send_status_message('STATUS$WAITING_FOR_CLIENT')
Expand Down Expand Up @@ -358,7 +363,6 @@ def _init_container(self):
environment=environment,
volumes=volumes,
)
self.log_buffer = LogBuffer(self.container, self.log)
self.log('debug', f'Container started. Server url: {self.api_url}')
self.send_status_message('STATUS$CONTAINER_STARTED')
except docker.errors.APIError as e:
Expand All @@ -385,11 +389,9 @@ def _init_container(self):
raise e

def _attach_to_container(self):
container = self.docker_client.containers.get(self.container_name)
self.log_buffer = LogBuffer(container, self.log)
self.container = container
self._container_port = 0
for port in container.attrs['NetworkSettings']['Ports']:
self.container = self.docker_client.containers.get(self.container_name)
for port in self.container.attrs['NetworkSettings']['Ports']: # type: ignore
self._container_port = int(port.split('/')[0])
break
self._host_port = self._container_port
Expand Down
6 changes: 5 additions & 1 deletion openhands/runtime/utils/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,9 @@ def send_request(
**kwargs: Any,
) -> requests.Response:
response = session.request(method, url, **kwargs)
response.raise_for_status()
try:
response.raise_for_status()
finally:
response.close()

return response
10 changes: 6 additions & 4 deletions openhands/server/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,15 @@ async def get_github_user(token: str) -> str:
github handle of the user
"""
logger.debug('Fetching GitHub user info from token')
g = Github(token)
try:
g = Github(token)
user = await call_sync_from_async(g.get_user)
login = user.login
logger.info(f'Successfully retrieved GitHub user: {login}')
return login
except GithubException as e:
logger.error(f'Error making request to GitHub API: {str(e)}')
logger.error(e)
raise
finally:
g.close()
login = user.login
logger.info(f'Successfully retrieved GitHub user: {login}')
return login
8 changes: 6 additions & 2 deletions openhands/server/session/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,12 @@ def __init__(

def close(self):
self.is_alive = False
self.agent_session.close()
try:
if self.websocket is not None:
asyncio.run_coroutine_threadsafe(self.websocket.close(), self.loop)
self.websocket = None
finally:
self.agent_session.close()

async def loop_recv(self):
try:
Expand Down Expand Up @@ -107,7 +112,6 @@ async def _initialize_agent(self, data: dict):
agent_config = self.config.get_agent_config(agent_cls)
agent = Agent.get_cls(agent_cls)(llm, agent_config)

# Create the agent session
try:
await self.agent_session.start(
runtime_name=self.config.runtime,
Expand Down

0 comments on commit 6abe70d

Please sign in to comment.