Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/mle bench evaluation #5148

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions evaluation/mle_bench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# mle-bench

## Setup

### Kaggle API token

Install your Kaggle API token at `~/.kaggle/kaggle.json`.

### MLE-bench

`poetry add git+https://git@github.com:openai/mle-bench`

### Build images

```bash
docker build --platform=linux/amd64 \
--build-arg PYTHON_VERSION=3.12 \
--build-arg INSTALL_HEAVY_DEPENDENCIES=false \
-t mlebench-env -f environment/Dockerfile .
```
4 changes: 4 additions & 0 deletions evaluation/mle_bench/agents/.shared_env
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Directory layout shared between the harness and every agent image.
# These values must stay in sync with the ARG/ENV defaults declared in the
# agent Dockerfiles. Comments are full-line only: env-file parsers (e.g.
# docker --env-file) treat trailing "# ..." as part of the value.
# Where the agent writes its submission.csv
SUBMISSION_DIR=/home/submission
# Where the agent's logs are written
LOGS_DIR=/home/logs
# Where the agent writes its solution code
CODE_DIR=/home/code
# Where the agent's own code is installed
AGENT_DIR=/home/agent
Empty file.
50 changes: 50 additions & 0 deletions evaluation/mle_bench/agents/openhands/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# OpenHands agent image for MLE-bench. Builds on the shared mlebench-env base
# image (which provides the task data, the `agent` conda env, and the nonroot user
# — TODO confirm against environment/Dockerfile).
FROM --platform=linux/amd64 mlebench-env

# Do not modify the following block of arg-env pairs. The evaluation infrastructure will
# periodically download the contents of these directories for logging purposes.

# Where the agent should write its `submission.csv`
ARG SUBMISSION_DIR=/home/submission
ENV SUBMISSION_DIR=${SUBMISSION_DIR}

# Where the agent's logs should be written
ARG LOGS_DIR=/home/logs
ENV LOGS_DIR=${LOGS_DIR}

# Where the agent should write its code
ARG CODE_DIR=/home/code
ENV CODE_DIR=${CODE_DIR}

# Where the agent code should exist
ARG AGENT_DIR=/home/agent
ENV AGENT_DIR=${AGENT_DIR}

# Name of the pre-existing conda env to install into, and the OpenHands
# branch/tag to clone.
ARG CONDA_ENV_NAME=agent
ARG OH_VERSION=main
ENV DEBIAN_FRONTEND=noninteractive

# Install Docker inside the image (the container runs privileged so the agent
# can use docker-in-docker), grant the nonroot user access to the docker
# socket, and fetch the OpenHands sources.
RUN mkdir -p ${LOGS_DIR} ${AGENT_DIR} ${CODE_DIR} ${SUBMISSION_DIR} && \
    curl -fsSL https://get.docker.com -o /tmp/get-docker.sh && \
    chmod 700 /tmp/get-docker.sh && \
    /tmp/get-docker.sh && \
    sudo usermod -aG docker nonroot && \
    git clone --branch ${OH_VERSION} --single-branch https://github.com/All-Hands-AI/OpenHands.git ${AGENT_DIR}

WORKDIR ${AGENT_DIR}

# Assumes that the `agent` conda env already exists, which is created in the `mlebench-env` base image.
RUN conda install -y -n ${CONDA_ENV_NAME} -c conda-forge nodejs=18.17.1 python=3.12 poetry=1.8 && \
    conda run -n ${CONDA_ENV_NAME} poetry install && \
    conda clean -afy

# Harness glue scripts; entrypoint.sh wraps the base image's /entrypoint.sh.
COPY setup.py start.py templates.py utils.py start.sh build.sh ${AGENT_DIR}/
COPY entrypoint.sh /agent_entrypoint.sh

RUN chmod +x /agent_entrypoint.sh

# Allow the agent to do `sudo /build.sh`, but restrict it from reading or editing the file.
# (chmod 711: executable by all, readable/writable only by root.)
RUN echo "ALL ALL=NOPASSWD: ${AGENT_DIR}/build.sh" >> /etc/sudoers && \
    chmod 711 ${AGENT_DIR}/build.sh

# run the agent's entrypoint script instead (which also runs root entrypoint)
ENTRYPOINT ["/agent_entrypoint.sh"]
9 changes: 9 additions & 0 deletions evaluation/mle_bench/agents/openhands/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Build OpenHands inside the agent's conda environment.
#
# Invoked as `sudo build.sh` by the agent (see the sudoers rule in the
# Dockerfile), so paths are deliberately hardcoded rather than taken from the
# caller's environment.
#
# The final configuration for the agent happens here, otherwise the standard
# entrypoint hangs while all the build artifacts are being recursively chmod'ed.

# Fail fast: without -e, a failed `cd` would silently run `make build`
# from whatever directory the caller happened to be in.
set -euo pipefail

cd /home/agent

/opt/conda/bin/conda run -n agent --no-capture-output make build
# sudo -u nonroot /opt/conda/bin/conda run -n agent --no-capture-output playwright install
13 changes: 13 additions & 0 deletions evaluation/mle_bench/agents/openhands/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Registration of the OpenHands agent with the MLE-bench harness.
# (Nesting restored: the flat layout in the extracted diff is not valid YAML.)
openhands:
  start: openhands/start.sh
  dockerfile: openhands/Dockerfile
  # kwargs are forwarded to setup.py/start.py as argparse-style CLI flags.
  kwargs_type: argparse
  kwargs:
    agent: CodeActAgent
    model: gpt-4o
    max_time_in_hours: 24
    max_steps: 500
    shm_size: "100G"
  env_vars:
    # NOTE(review): ${{ secrets.* }} is GitHub-Actions-style interpolation —
    # confirm the harness actually substitutes this syntax.
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  # Privileged container: required for docker-in-docker inside the agent image.
  privileged: true
45 changes: 45 additions & 0 deletions evaluation/mle_bench/agents/openhands/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Agent-side container entrypoint: wraps the base image's /entrypoint.sh,
# optionally prepares docker + the NVIDIA container toolkit, then waits on
# the root entrypoint so the container stays alive.

# Print commands and their arguments as they are executed
set -x

# Run the root entrypoint in the background
/entrypoint.sh &

mkdir -p $LOGS_DIR
mkdir -p $AGENT_DIR
{
    # Check if Docker installed, and if so start the Docker daemon in the background.
    if [ -x "$(command -v docker)" ]; then

        # if CUDA is available, install the nvidia container toolkit
        if [ -x "$(command -v nvidia-smi)" ]; then
            # configure production repository
            curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
                && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
                | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
                | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
            # install the nvidia container toolkit
            sudo apt-get update
            sudo apt-get install -y nvidia-container-toolkit
            # configure the runtime
            sudo nvidia-ctk runtime configure --runtime=docker
        fi

        # (re)start the Docker daemon
        # NOTE(review): the dockerd launch below is commented out, so any
        # running daemon is killed but never restarted — confirm whether
        # disabling docker-in-docker here is intentional.
        if sudo pgrep dockerd > /dev/null; then
            sudo pkill dockerd
        fi
        # sudo dockerd > $LOGS_DIR/docker.log 2>&1 &
        sleep 5
    else
        echo "Docker not installed. Skipping Docker startup."
    fi

} 2>&1 | tee $LOGS_DIR/agent_entrypoint.log

# signal that the entrypoint has finished
touch $AGENT_DIR/entrypoint_done

# wait for root entrypoint (a server), need this otherwise the container exits
wait
49 changes: 49 additions & 0 deletions evaluation/mle_bench/agents/openhands/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Generate the OpenHands agent's config.toml and full task instructions.

Reads the kwargs forwarded by the MLE-bench harness as CLI flags, renders
the config/instructions templates, and writes:

* ``$AGENT_DIR/config.toml`` — the OpenHands app configuration, and
* ``/home/full_instructions.txt`` — the task instructions plus harness notes.
"""

import argparse
import os
from pathlib import Path

from templates import additional_notes_template, config_template
from utils import get_gpu_generation


def _required_env(name: str) -> str:
    """Return the value of env var *name*, failing with a clear error if unset."""
    value = os.getenv(name)
    if not value:
        # Path(None) would raise an opaque TypeError; fail loudly instead.
        raise RuntimeError(f'Required environment variable {name} is not set.')
    return value


# May legitimately be absent; the config template decides how to handle it.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
CODE_DIR = Path(_required_env('CODE_DIR'))
AGENT_DIR = Path(_required_env('AGENT_DIR'))

parser = argparse.ArgumentParser()
parser.add_argument('--agent', type=str, required=True)
parser.add_argument('--model', type=str, required=True)
parser.add_argument('--max_time_in_hours', type=float, required=True)
parser.add_argument('--max_steps', type=int, required=True)
parser.add_argument('--shm_size', type=str, required=True)
# parse_known_args: tolerate extra harness kwargs meant for other scripts.
args, other_args = parser.parse_known_args()

# Describe the available accelerator for the instructions template.
gpu_generation = get_gpu_generation()
type_of_processor = gpu_generation if gpu_generation else 'CPU'

config = config_template.substitute(
    workspace_base='/home',
    workspace_mount_path_in_sandbox='/home',
    max_steps=args.max_steps,
    model=args.model,
    api_key=OPENAI_API_KEY,
    agent=args.agent,
    shm_size=args.shm_size,
)

additional_notes = additional_notes_template.substitute(
    type_of_processor=type_of_processor,
    max_time_in_hours=args.max_time_in_hours,
    max_steps=args.max_steps,
    workspace=CODE_DIR,
)

with open(AGENT_DIR / 'config.toml', 'w', encoding='utf-8') as file:
    file.write(config.strip())

# Partial instructions are provided by the harness; append our notes.
with open('/home/instructions.txt', 'r', encoding='utf-8') as file:
    partial_instructions = file.read()

instructions = partial_instructions + additional_notes

with open('/home/full_instructions.txt', 'w', encoding='utf-8') as file:
    file.write(instructions.strip())
184 changes: 184 additions & 0 deletions evaluation/mle_bench/agents/openhands/start.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
Main entrypoint for the OpenHands agent.
"""

import argparse
import asyncio
import json
import os
from pathlib import Path
from typing import Callable, Iterable

from openhands.controller.state.state import State
from openhands.core.config import load_app_config
from openhands.core.main import create_runtime, run_controller
from openhands.events import Event, EventStreamSubscriber
from openhands.events.action import (
Action,
CmdRunAction,
IPythonRunCellAction,
MessageAction,
)
from openhands.events.observation import (
CmdOutputObservation,
IPythonRunCellObservation,
)

# pylint: disable=unspecified-encoding

# Only --max_time_in_hours is consumed here; any other kwargs the harness
# passes are tolerated and ignored (parse_known_args).
parser = argparse.ArgumentParser()
parser.add_argument('--max_time_in_hours', type=float, required=True)
args, other_args = parser.parse_known_args()

# Full event stream accumulated by on_event; guarded by the lock below.
global_events: list[Event] = []
global_events_lock = asyncio.Lock()

# Where the agent's solution snapshots (solution.ipynb / solution.py) go.
CODE_DIR = Path(os.getenv('CODE_DIR'))  # type: ignore
MAX_TIME_IN_SECONDS = args.max_time_in_hours * 60 * 60


def fake_user_response_fn(
    _state: State,
    _encapsulate_solution: bool = False,
    _try_parse: Callable[[Action], str] | None = None,
) -> str:
    """Stand-in for a human user: always tell the agent to keep working.

    All parameters are ignored; the controller only needs a canned reply
    so the run never blocks waiting for human input.
    """
    return (
        'Please continue working on the approach you think is most promising. '
        'You should complete the task without any human input.'
    )


async def on_event(event: Event):
    """Event-stream callback: mirror the agent's work into the code directory.

    Appends the new event to the shared event log (under the lock) and
    re-renders the full notebook and Python-script snapshots from scratch.
    """
    async with global_events_lock:
        global_events.append(event)

        notebook_path = CODE_DIR / 'solution.ipynb'
        script_path = CODE_DIR / 'solution.py'

        with open(notebook_path, 'w') as file:
            json.dump(get_notebook(global_events), file)

        with open(script_path, 'w') as file:
            file.write(get_python(global_events))


async def run(instructions: str) -> State:
    """Run the OpenHands controller on *instructions* and return its final state.

    Subscribes `on_event` to the runtime's event stream so the agent's
    notebook/script snapshots are written as it works, then drives the
    controller to completion under the harness time budget.

    Raises:
        asyncio.TimeoutError: if the run exceeds MAX_TIME_IN_SECONDS.
        AssertionError: if the controller produces no final state.
    """
    config = load_app_config()

    runtime = create_runtime(config)
    # Mirror every event into CODE_DIR via on_event.
    runtime.event_stream.subscribe(EventStreamSubscriber.TEST, on_event, 'mle-bench')
    await runtime.connect()

    # NOTE(review): on timeout, wait_for cancels the controller and raises,
    # so no partial state is returned — confirm this is intended.
    state: State | None = await asyncio.wait_for(
        run_controller(
            config=config,
            initial_user_action=MessageAction(content=instructions),
            runtime=runtime,
            exit_on_message=False,
            fake_user_response_fn=fake_user_response_fn,
        ),
        timeout=MAX_TIME_IN_SECONDS,
    )

    assert state, 'Controller produced no final state.'
    return state


def get_python(events: Iterable[Event]) -> str:
    """Render the agent's actions as one percent-format Python script.

    IPython cells contribute their code verbatim; shell commands become
    `!command` lines. Each chunk is followed by a `# %%` cell separator,
    and the script opens with one as well.
    """
    marker = '# %%'
    chunks = [f'{marker}\n\n']

    for event in events:
        if isinstance(event, IPythonRunCellAction):
            chunks.append(f'{event.code}\n\n{marker}\n\n')
        elif isinstance(event, CmdRunAction):
            chunks.append(f'!{event.command}\n\n{marker}\n\n')

    return ''.join(chunks)


def get_notebook(events: Iterable[Event]) -> dict:
    """Assemble a Jupyter notebook (nbformat 4) from the agent's event stream.

    Only agent-sourced events are included. An IPython cell action becomes a
    markdown cell (the agent's reasoning) followed by a code cell; a shell
    command becomes a `!`-prefixed code cell; observations are appended as
    stdout stream output on the most recent code cell.
    """

    def markdown_cell(text):
        # Agent reasoning rendered above the code it produced.
        return {'cell_type': 'markdown', 'metadata': {}, 'source': [text]}

    def code_cell(source_lines):
        return {
            'cell_type': 'code',
            'metadata': {},
            'source': source_lines,
            'outputs': [],
            'execution_count': None,
        }

    def stream_output(text):
        return {
            'name': 'stdout',
            'output_type': 'stream',
            'text': text.split('\n'),
        }

    cells = []

    for event in events:
        if event.source != 'agent':
            continue

        if isinstance(event, IPythonRunCellAction):
            cells.append(markdown_cell(event.thought))
            cells.append(code_cell(event.code.split('\n')))
        elif isinstance(event, CmdRunAction):
            cells.append(code_cell([f'!{event.command}']))
        elif isinstance(event, (IPythonRunCellObservation, CmdOutputObservation)):
            # Observations attach to the cell that produced them, so a code
            # cell must already exist.
            assert cells
            assert cells[-1]['cell_type'] == 'code'
            cells[-1]['outputs'].append(stream_output(event.content))

    return {
        'cells': cells,
        'metadata': {
            'kernelspec': {
                'display_name': 'Python 3',
                'language': 'python',
                'name': 'python3',
            },
            'language_info': {
                'codemirror_mode': {'name': 'ipython', 'version': 3},
                'file_extension': '.py',
                'mimetype': 'text/x-python',
                'name': 'python',
                'nbconvert_exporter': 'python',
                'pygments_lexer': 'ipython3',
                'version': '3.11.0',
            },
        },
        'nbformat': 4,
        'nbformat_minor': 4,
    }


if __name__ == '__main__':
    # full_instructions.txt (task instructions + harness notes) is rendered
    # by setup.py before this script runs.
    with open('/home/full_instructions.txt', 'r') as file:
        instructions = file.read()

    asyncio.run(run(instructions))
Loading
Loading