Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/mle bench evaluation #5148

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions evaluation/mle_bench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# mle-bench

## Setup

### Kaggle API token

Install your Kaggle API token at `~/.kaggle/kaggle.json`.

### MLE-bench

`poetry add git+https://git@github.com:openai/mle-bench`

### Build images

```bash
docker build --platform=linux/amd64 \
--build-arg PYTHON_VERSION=3.12 \
--build-arg INSTALL_HEAVY_DEPENDENCIES=false \
-t mlebench-env -f environment/Dockerfile .
```
4 changes: 4 additions & 0 deletions evaluation/mle_bench/agents/.shared_env
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Directory layout shared between the harness and every agent image.
# These values must stay in sync with the ARG/ENV defaults declared in the
# agent Dockerfiles. Comments are full-line only: env-file parsers (e.g.
# docker --env-file) treat trailing "# ..." as part of the value.
# Where the agent writes its submission.csv
SUBMISSION_DIR=/home/submission
# Where the agent's logs are written
LOGS_DIR=/home/logs
# Where the agent writes its solution code
CODE_DIR=/home/code
# Where the agent's own code is installed
AGENT_DIR=/home/agent
Empty file.
50 changes: 50 additions & 0 deletions evaluation/mle_bench/agents/openhands/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# OpenHands agent image for MLE-bench. Builds on the shared mlebench-env base
# image (which provides the task data, the `agent` conda env, and the nonroot user
# — TODO confirm against environment/Dockerfile).
FROM --platform=linux/amd64 mlebench-env

# Do not modify the following block of arg-env pairs. The evaluation infrastructure will
# periodically download the contents of these directories for logging purposes.

# Where the agent should write its `submission.csv`
ARG SUBMISSION_DIR=/home/submission
ENV SUBMISSION_DIR=${SUBMISSION_DIR}

# Where the agent's logs should be written
ARG LOGS_DIR=/home/logs
ENV LOGS_DIR=${LOGS_DIR}

# Where the agent should write its code
ARG CODE_DIR=/home/code
ENV CODE_DIR=${CODE_DIR}

# Where the agent code should exist
ARG AGENT_DIR=/home/agent
ENV AGENT_DIR=${AGENT_DIR}

# Name of the pre-existing conda env to install into, and the OpenHands
# branch/tag to clone.
ARG CONDA_ENV_NAME=agent
ARG OH_VERSION=main
ENV DEBIAN_FRONTEND=noninteractive

# Install Docker inside the image (the container runs privileged so the agent
# can use docker-in-docker), grant the nonroot user access to the docker
# socket, and fetch the OpenHands sources.
RUN mkdir -p ${LOGS_DIR} ${AGENT_DIR} ${CODE_DIR} ${SUBMISSION_DIR} && \
    curl -fsSL https://get.docker.com -o /tmp/get-docker.sh && \
    chmod 700 /tmp/get-docker.sh && \
    /tmp/get-docker.sh && \
    sudo usermod -aG docker nonroot && \
    git clone --branch ${OH_VERSION} --single-branch https://github.com/All-Hands-AI/OpenHands.git ${AGENT_DIR}

WORKDIR ${AGENT_DIR}

# Assumes that the `agent` conda env already exists, which is created in the `mlebench-env` base image.
RUN conda install -y -n ${CONDA_ENV_NAME} -c conda-forge nodejs=18.17.1 python=3.12 poetry=1.8 && \
    conda run -n ${CONDA_ENV_NAME} poetry install && \
    conda clean -afy

# Harness glue scripts; entrypoint.sh wraps the base image's /entrypoint.sh.
COPY setup.py start.py templates.py utils.py start.sh build.sh ${AGENT_DIR}/
COPY entrypoint.sh /agent_entrypoint.sh

RUN chmod +x /agent_entrypoint.sh

# Allow the agent to do `sudo /build.sh`, but restrict it from reading or editing the file.
# (chmod 711: executable by all, readable/writable only by root.)
RUN echo "ALL ALL=NOPASSWD: ${AGENT_DIR}/build.sh" >> /etc/sudoers && \
    chmod 711 ${AGENT_DIR}/build.sh

# run the agent's entrypoint script instead (which also runs root entrypoint)
ENTRYPOINT ["/agent_entrypoint.sh"]
9 changes: 9 additions & 0 deletions evaluation/mle_bench/agents/openhands/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Build OpenHands inside the agent's conda environment.
#
# Invoked as `sudo build.sh` by the agent (see the sudoers rule in the
# Dockerfile), so paths are deliberately hardcoded rather than taken from the
# caller's environment.
#
# The final configuration for the agent happens here, otherwise the standard
# entrypoint hangs while all the build artifacts are being recursively chmod'ed.

# Fail fast: without -e, a failed `cd` would silently run `make build`
# from whatever directory the caller happened to be in.
set -euo pipefail

cd /home/agent

/opt/conda/bin/conda run -n agent --no-capture-output make build
# sudo -u nonroot /opt/conda/bin/conda run -n agent --no-capture-output playwright install
13 changes: 13 additions & 0 deletions evaluation/mle_bench/agents/openhands/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Registration of the OpenHands agent with the MLE-bench harness.
# (Nesting restored: the flat layout in the extracted diff is not valid YAML.)
openhands:
  start: openhands/start.sh
  dockerfile: openhands/Dockerfile
  # kwargs are forwarded to setup.py/start.py as argparse-style CLI flags.
  kwargs_type: argparse
  kwargs:
    agent: CodeActAgent
    model: gpt-4o
    max_time_in_hours: 24
    max_steps: 500
    shm_size: "100G"
  env_vars:
    # NOTE(review): ${{ secrets.* }} is GitHub-Actions-style interpolation —
    # confirm the harness actually substitutes this syntax.
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  # Privileged container: required for docker-in-docker inside the agent image.
  privileged: true
45 changes: 45 additions & 0 deletions evaluation/mle_bench/agents/openhands/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Agent-side container entrypoint: wraps the base image's /entrypoint.sh,
# optionally prepares docker + the NVIDIA container toolkit, then waits on
# the root entrypoint so the container stays alive.

# Print commands and their arguments as they are executed
set -x

# Run the root entrypoint in the background
/entrypoint.sh &

mkdir -p $LOGS_DIR
mkdir -p $AGENT_DIR
{
    # Check if Docker installed, and if so start the Docker daemon in the background.
    if [ -x "$(command -v docker)" ]; then

        # if CUDA is available, install the nvidia container toolkit
        if [ -x "$(command -v nvidia-smi)" ]; then
            # configure production repository
            curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
                && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
                | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
                | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
            # install the nvidia container toolkit
            sudo apt-get update
            sudo apt-get install -y nvidia-container-toolkit
            # configure the runtime
            sudo nvidia-ctk runtime configure --runtime=docker
        fi

        # (re)start the Docker daemon
        # NOTE(review): the dockerd launch below is commented out, so any
        # running daemon is killed but never restarted — confirm whether
        # disabling docker-in-docker here is intentional.
        if sudo pgrep dockerd > /dev/null; then
            sudo pkill dockerd
        fi
        # sudo dockerd > $LOGS_DIR/docker.log 2>&1 &
        sleep 5
    else
        echo "Docker not installed. Skipping Docker startup."
    fi

} 2>&1 | tee $LOGS_DIR/agent_entrypoint.log

# signal that the entrypoint has finished
touch $AGENT_DIR/entrypoint_done

# wait for root entrypoint (a server), need this otherwise the container exits
wait
49 changes: 49 additions & 0 deletions evaluation/mle_bench/agents/openhands/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Generate the OpenHands agent's config.toml and full task instructions.

Reads the kwargs forwarded by the MLE-bench harness as CLI flags, renders
the config/instructions templates, and writes:

* ``$AGENT_DIR/config.toml`` — the OpenHands app configuration, and
* ``/home/full_instructions.txt`` — the task instructions plus harness notes.
"""

import argparse
import os
from pathlib import Path

from templates import additional_notes_template, config_template
from utils import get_gpu_generation


def _required_env(name: str) -> str:
    """Return the value of env var *name*, failing with a clear error if unset."""
    value = os.getenv(name)
    if not value:
        # Path(None) would raise an opaque TypeError; fail loudly instead.
        raise RuntimeError(f'Required environment variable {name} is not set.')
    return value


# May legitimately be absent; the config template decides how to handle it.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
CODE_DIR = Path(_required_env('CODE_DIR'))
AGENT_DIR = Path(_required_env('AGENT_DIR'))

parser = argparse.ArgumentParser()
parser.add_argument('--agent', type=str, required=True)
parser.add_argument('--model', type=str, required=True)
parser.add_argument('--max_time_in_hours', type=float, required=True)
parser.add_argument('--max_steps', type=int, required=True)
parser.add_argument('--shm_size', type=str, required=True)
# parse_known_args: tolerate extra harness kwargs meant for other scripts.
args, other_args = parser.parse_known_args()

# Describe the available accelerator for the instructions template.
gpu_generation = get_gpu_generation()
type_of_processor = gpu_generation if gpu_generation else 'CPU'

config = config_template.substitute(
    workspace_base='/home',
    workspace_mount_path_in_sandbox='/home',
    max_steps=args.max_steps,
    model=args.model,
    api_key=OPENAI_API_KEY,
    agent=args.agent,
    shm_size=args.shm_size,
)

additional_notes = additional_notes_template.substitute(
    type_of_processor=type_of_processor,
    max_time_in_hours=args.max_time_in_hours,
    max_steps=args.max_steps,
    workspace=CODE_DIR,
)

with open(AGENT_DIR / 'config.toml', 'w', encoding='utf-8') as file:
    file.write(config.strip())

# Partial instructions are provided by the harness; append our notes.
with open('/home/instructions.txt', 'r', encoding='utf-8') as file:
    partial_instructions = file.read()

instructions = partial_instructions + additional_notes

with open('/home/full_instructions.txt', 'w', encoding='utf-8') as file:
    file.write(instructions.strip())
184 changes: 184 additions & 0 deletions evaluation/mle_bench/agents/openhands/start.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
Main entrypoint for the OpenHands agent.
"""

import argparse
import asyncio
import json
import os
from pathlib import Path
from typing import Callable, Iterable

from openhands.controller.state.state import State
from openhands.core.config import load_app_config
from openhands.core.main import create_runtime, run_controller
from openhands.events import Event, EventStreamSubscriber
from openhands.events.action import (
Action,
CmdRunAction,
IPythonRunCellAction,
MessageAction,
)
from openhands.events.observation import (
CmdOutputObservation,
IPythonRunCellObservation,
)

# pylint: disable=unspecified-encoding

# Only --max_time_in_hours is consumed here; any other kwargs the harness
# passes are tolerated and ignored (parse_known_args).
parser = argparse.ArgumentParser()
parser.add_argument('--max_time_in_hours', type=float, required=True)
args, other_args = parser.parse_known_args()

# Full event stream accumulated by on_event; guarded by the lock below.
global_events: list[Event] = []
global_events_lock = asyncio.Lock()

# Where the agent's solution snapshots (solution.ipynb / solution.py) go.
CODE_DIR = Path(os.getenv('CODE_DIR'))  # type: ignore
MAX_TIME_IN_SECONDS = args.max_time_in_hours * 60 * 60


def fake_user_response_fn(
    _state: State,
    _encapsulate_solution: bool = False,
    _try_parse: Callable[[Action], str] | None = None,
) -> str:
    """Stand-in for a human user: always tell the agent to keep working.

    All parameters are ignored; the controller only needs a canned reply
    so the run never blocks waiting for human input.
    """
    return (
        'Please continue working on the approach you think is most promising. '
        'You should complete the task without any human input.'
    )


async def on_event(event: Event):
    """Event-stream callback: mirror the agent's work into the code directory.

    Appends the new event to the shared event log (under the lock) and
    re-renders the full notebook and Python-script snapshots from scratch.
    """
    async with global_events_lock:
        global_events.append(event)

        notebook_path = CODE_DIR / 'solution.ipynb'
        script_path = CODE_DIR / 'solution.py'

        with open(notebook_path, 'w') as file:
            json.dump(get_notebook(global_events), file)

        with open(script_path, 'w') as file:
            file.write(get_python(global_events))


async def run(instructions: str) -> State:
    """Run the OpenHands controller on *instructions* and return its final state.

    Subscribes `on_event` to the runtime's event stream so the agent's
    notebook/script snapshots are written as it works, then drives the
    controller to completion under the harness time budget.

    Raises:
        asyncio.TimeoutError: if the run exceeds MAX_TIME_IN_SECONDS.
        AssertionError: if the controller produces no final state.
    """
    config = load_app_config()

    runtime = create_runtime(config)
    # Mirror every event into CODE_DIR via on_event.
    runtime.event_stream.subscribe(EventStreamSubscriber.TEST, on_event, 'mle-bench')
    await runtime.connect()

    # NOTE(review): on timeout, wait_for cancels the controller and raises,
    # so no partial state is returned — confirm this is intended.
    state: State | None = await asyncio.wait_for(
        run_controller(
            config=config,
            initial_user_action=MessageAction(content=instructions),
            runtime=runtime,
            exit_on_message=False,
            fake_user_response_fn=fake_user_response_fn,
        ),
        timeout=MAX_TIME_IN_SECONDS,
    )

    assert state, 'Controller produced no final state.'
    return state


def get_python(events: Iterable[Event]) -> str:
    """Render the agent's actions as one percent-format Python script.

    IPython cells contribute their code verbatim; shell commands become
    `!command` lines. Each chunk is followed by a `# %%` cell separator,
    and the script opens with one as well.
    """
    marker = '# %%'
    chunks = [f'{marker}\n\n']

    for event in events:
        if isinstance(event, IPythonRunCellAction):
            chunks.append(f'{event.code}\n\n{marker}\n\n')
        elif isinstance(event, CmdRunAction):
            chunks.append(f'!{event.command}\n\n{marker}\n\n')

    return ''.join(chunks)


def get_notebook(events: Iterable[Event]) -> dict:
    """Assemble a Jupyter notebook (nbformat 4) from the agent's event stream.

    Only agent-sourced events are included. An IPython cell action becomes a
    markdown cell (the agent's reasoning) followed by a code cell; a shell
    command becomes a `!`-prefixed code cell; observations are appended as
    stdout stream output on the most recent code cell.
    """

    def markdown_cell(text):
        # Agent reasoning rendered above the code it produced.
        return {'cell_type': 'markdown', 'metadata': {}, 'source': [text]}

    def code_cell(source_lines):
        return {
            'cell_type': 'code',
            'metadata': {},
            'source': source_lines,
            'outputs': [],
            'execution_count': None,
        }

    def stream_output(text):
        return {
            'name': 'stdout',
            'output_type': 'stream',
            'text': text.split('\n'),
        }

    cells = []

    for event in events:
        if event.source != 'agent':
            continue

        if isinstance(event, IPythonRunCellAction):
            cells.append(markdown_cell(event.thought))
            cells.append(code_cell(event.code.split('\n')))
        elif isinstance(event, CmdRunAction):
            cells.append(code_cell([f'!{event.command}']))
        elif isinstance(event, (IPythonRunCellObservation, CmdOutputObservation)):
            # Observations attach to the cell that produced them, so a code
            # cell must already exist.
            assert cells
            assert cells[-1]['cell_type'] == 'code'
            cells[-1]['outputs'].append(stream_output(event.content))

    return {
        'cells': cells,
        'metadata': {
            'kernelspec': {
                'display_name': 'Python 3',
                'language': 'python',
                'name': 'python3',
            },
            'language_info': {
                'codemirror_mode': {'name': 'ipython', 'version': 3},
                'file_extension': '.py',
                'mimetype': 'text/x-python',
                'name': 'python',
                'nbconvert_exporter': 'python',
                'pygments_lexer': 'ipython3',
                'version': '3.11.0',
            },
        },
        'nbformat': 4,
        'nbformat_minor': 4,
    }


if __name__ == '__main__':
    # full_instructions.txt (task instructions + harness notes) is rendered
    # by setup.py before this script runs.
    with open('/home/full_instructions.txt', 'r') as file:
        instructions = file.read()

    asyncio.run(run(instructions))
Loading
Loading