From e15e42faa8d99ad51dca03c2603679d309761d91 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Mon, 18 Nov 2024 16:06:43 -0700 Subject: [PATCH 1/4] initial lift of relevant mle-bench code --- evaluation/mle_bench/README.md | 4 + evaluation/mle_bench/agents/__init__.py | 0 evaluation/mle_bench/agents/dummy/Dockerfile | 29 +++ evaluation/mle_bench/agents/dummy/config.yaml | 3 + evaluation/mle_bench/agents/dummy/main.py | 34 +++ evaluation/mle_bench/agents/dummy/start.sh | 21 ++ .../mle_bench/agents/opendevin/Dockerfile | 53 ++++ .../mle_bench/agents/opendevin/build.sh | 5 + .../mle_bench/agents/opendevin/config.yaml | 13 + .../mle_bench/agents/opendevin/entrypoint.sh | 45 ++++ .../mle_bench/agents/opendevin/setup.py | 49 ++++ .../mle_bench/agents/opendevin/start.py | 168 ++++++++++++ .../mle_bench/agents/opendevin/start.sh | 36 +++ .../mle_bench/agents/opendevin/templates.py | 37 +++ .../mle_bench/agents/opendevin/utils.py | 26 ++ evaluation/mle_bench/agents/registry.py | 113 ++++++++ evaluation/mle_bench/agents/run.py | 158 ++++++++++++ evaluation/mle_bench/agents/utils.py | 43 ++++ evaluation/mle_bench/environment/Dockerfile | 87 +++++++ evaluation/mle_bench/environment/__init__.py | 0 .../config/container_configs/default.json | 6 + evaluation/mle_bench/environment/defaults.py | 5 + .../mle_bench/environment/entrypoint.sh | 19 ++ .../mle_bench/environment/grading_server.py | 46 ++++ .../mle_bench/environment/instructions.txt | 18 ++ .../environment/instructions_obfuscated.txt | 18 ++ evaluation/mle_bench/environment/utils.py | 164 ++++++++++++ .../environment/validate_submission.sh | 21 ++ .../experiments/competition_categories.csv | 3 + .../mle_bench/experiments/make_submission.py | 101 ++++++++ .../mle_bench/experiments/splits/all.txt | 75 ++++++ .../mle_bench/experiments/splits/dev.txt | 7 + .../mle_bench/experiments/splits/high.txt | 15 ++ .../mle_bench/experiments/splits/low.txt | 22 ++ .../mle_bench/experiments/splits/medium.txt | 38 +++ .../experiments/splits/spaceship-titanic.txt | 1 + evaluation/mle_bench/run_infer.py | 241 ++++++++++++++++++ 37 files changed, 1724 insertions(+) create mode 100644 evaluation/mle_bench/README.md create mode 100644 evaluation/mle_bench/agents/__init__.py create mode 100644 evaluation/mle_bench/agents/dummy/Dockerfile create mode 100644 evaluation/mle_bench/agents/dummy/config.yaml create mode 100644 evaluation/mle_bench/agents/dummy/main.py create mode 100644 evaluation/mle_bench/agents/dummy/start.sh create mode 100644 evaluation/mle_bench/agents/opendevin/Dockerfile create mode 100644 evaluation/mle_bench/agents/opendevin/build.sh create mode 100644 evaluation/mle_bench/agents/opendevin/config.yaml create mode 100644 evaluation/mle_bench/agents/opendevin/entrypoint.sh create mode 100644 evaluation/mle_bench/agents/opendevin/setup.py create mode 100644 evaluation/mle_bench/agents/opendevin/start.py create mode 100644 evaluation/mle_bench/agents/opendevin/start.sh create mode 100644 evaluation/mle_bench/agents/opendevin/templates.py create mode 100644 evaluation/mle_bench/agents/opendevin/utils.py create mode 100644 evaluation/mle_bench/agents/registry.py create mode 100644 evaluation/mle_bench/agents/run.py create mode 100644 evaluation/mle_bench/agents/utils.py create mode 100644 evaluation/mle_bench/environment/Dockerfile create mode 100644 evaluation/mle_bench/environment/__init__.py create mode 100644 evaluation/mle_bench/environment/config/container_configs/default.json create mode 100644 evaluation/mle_bench/environment/defaults.py create 
mode 100644 evaluation/mle_bench/environment/entrypoint.sh create mode 100644 evaluation/mle_bench/environment/grading_server.py create mode 100644 evaluation/mle_bench/environment/instructions.txt create mode 100644 evaluation/mle_bench/environment/instructions_obfuscated.txt create mode 100644 evaluation/mle_bench/environment/utils.py create mode 100644 evaluation/mle_bench/environment/validate_submission.sh create mode 100644 evaluation/mle_bench/experiments/competition_categories.csv create mode 100644 evaluation/mle_bench/experiments/make_submission.py create mode 100644 evaluation/mle_bench/experiments/splits/all.txt create mode 100644 evaluation/mle_bench/experiments/splits/dev.txt create mode 100644 evaluation/mle_bench/experiments/splits/high.txt create mode 100644 evaluation/mle_bench/experiments/splits/low.txt create mode 100644 evaluation/mle_bench/experiments/splits/medium.txt create mode 100644 evaluation/mle_bench/experiments/splits/spaceship-titanic.txt create mode 100644 evaluation/mle_bench/run_infer.py diff --git a/evaluation/mle_bench/README.md b/evaluation/mle_bench/README.md new file mode 100644 index 000000000000..4d5a83dbc0c9 --- /dev/null +++ b/evaluation/mle_bench/README.md @@ -0,0 +1,4 @@ +# mle-bench + +1. `poetry add git+https://org-14957082@github.com:openai/mle-bench` +2. Install your Kaggle API token at `~/.kaggle/kaggle.json`. diff --git a/evaluation/mle_bench/agents/__init__.py b/evaluation/mle_bench/agents/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/mle_bench/agents/dummy/Dockerfile b/evaluation/mle_bench/agents/dummy/Dockerfile new file mode 100644 index 000000000000..4a825206bda1 --- /dev/null +++ b/evaluation/mle_bench/agents/dummy/Dockerfile @@ -0,0 +1,29 @@ +FROM mlebench-env + +# where to put submission.csv, will be extracted +ARG SUBMISSION_DIR +ENV SUBMISSION_DIR=${SUBMISSION_DIR} +# where to put any logs, will be extracted +ARG LOGS_DIR +ENV LOGS_DIR=${LOGS_DIR} +# where to put any code, will be extracted +ARG CODE_DIR +ENV CODE_DIR=${CODE_DIR} +# where to put any other agent-specific files, will not be necessarily extracted +ARG AGENT_DIR +ENV AGENT_DIR=${AGENT_DIR} + +RUN mkdir -p ${LOGS_DIR} ${CODE_DIR} ${AGENT_DIR} + +ARG CONDA_ENV_NAME=agent +ARG REQUIREMENTS=${AGENT_DIR}/requirements.txt + +# copy just the requirements file, so that we can cache conda separately from the agent files +COPY requirements.txt ${AGENT_DIR}/requirements.txt + +# create conda environment and install the requirements to it +RUN conda run -n ${CONDA_ENV_NAME} pip install -r ${AGENT_DIR}/requirements.txt && \ + conda clean -afy + +# put all the agent files in the expected location +COPY . ${AGENT_DIR} diff --git a/evaluation/mle_bench/agents/dummy/config.yaml b/evaluation/mle_bench/agents/dummy/config.yaml new file mode 100644 index 000000000000..f5ac0a2d01c3 --- /dev/null +++ b/evaluation/mle_bench/agents/dummy/config.yaml @@ -0,0 +1,3 @@ +dummy: + start: dummy/start.sh + dockerfile: dummy/Dockerfile diff --git a/evaluation/mle_bench/agents/dummy/main.py b/evaluation/mle_bench/agents/dummy/main.py new file mode 100644 index 000000000000..9584c42e2155 --- /dev/null +++ b/evaluation/mle_bench/agents/dummy/main.py @@ -0,0 +1,34 @@ +""" +A dummy agent which copies the `sample_submission.csv` from the workspace/data directory +and uses that as its `submission.csv`. 
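+It also prints the runtime user and the active Python interpreter as a basic
+sanity check of the container environment before copying the file.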
+""" + +import getpass +import os +import shutil +import sys +from pathlib import Path + +# Get the current user's username +username = getpass.getuser() + +# Check if the current user ID is 0 (root user ID on Unix-like systems) +if os.getuid() == 0: + print(f"You are running this script as root. Your username is '{username}'.") +else: + print(f'You do not have root access. Your username is {username}.') + +print('The script is being run with the following python interpreter:') +print(sys.executable) + +cwd = Path(__file__).parent +workspace_data_dir = cwd.parent / 'data' + +print('Copying sample submission...') + +shutil.copy( + workspace_data_dir / 'sample_submission.csv', + cwd.parent / 'submission' / 'submission.csv', +) + +print(f"Sample submission copied to {cwd.parent / 'submission' / 'submission.csv'}.") diff --git a/evaluation/mle_bench/agents/dummy/start.sh b/evaluation/mle_bench/agents/dummy/start.sh new file mode 100644 index 000000000000..51fa6c041fac --- /dev/null +++ b/evaluation/mle_bench/agents/dummy/start.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -x # Print commands and their arguments as they are executed + +eval "$(conda shell.bash hook)" # make conda available to the shell +conda activate agent + +python ${AGENT_DIR}/main.py + +# Check that the agent doesn't have permissions to read private dir +ls /private +# ls: cannot open directory '/private': Permission denied + +# Check that the agent does have permissions to read/write everything in /home +ls /home/data +touch $CODE_DIR/code.py +touch $LOGS_DIR/run.log +touch $AGENT_DIR/agent_was_here.txt +cat /home/instructions.txt + +# Use the environment-provided grading server to validate our submission +bash /home/validate_submission.sh /home/submission/submission.csv diff --git a/evaluation/mle_bench/agents/opendevin/Dockerfile b/evaluation/mle_bench/agents/opendevin/Dockerfile new file mode 100644 index 000000000000..55c86233ec40 --- /dev/null +++ b/evaluation/mle_bench/agents/opendevin/Dockerfile @@ -0,0 +1,53 @@ +FROM --platform=linux/amd64 mlebench-env + +# Do not modify the following block of arg-env pairs. The evaluation infrastructure will +# periodically download the contents of these directories for logging purposes. + +# Where the agent should write its `submission.csv` +ARG SUBMISSION_DIR=/home/submission +ENV SUBMISSION_DIR=${SUBMISSION_DIR} + +# Where the agent's logs should be written +ARG LOGS_DIR=/home/logs +ENV LOGS_DIR=${LOGS_DIR} + +# Where the agent should write its code +ARG CODE_DIR=/home/code +ENV CODE_DIR=${CODE_DIR} + +# Where the agent code should exist +ARG AGENT_DIR=/home/agent +ENV AGENT_DIR=${AGENT_DIR} + +ARG CONDA_ENV_NAME=agent +ARG PYTHON_VERSION=3.11 +ARG OD_VERSION=v2.1.0 +ENV DEBIAN_FRONTEND=noninteractive + +RUN mkdir -p ${LOGS_DIR} ${AGENT_DIR} ${CODE_DIR} ${SUBMISSION_DIR} && \ + curl -fsSL https://get.docker.com -o /tmp/get-docker.sh && \ + chmod 700 /tmp/get-docker.sh && \ + /tmp/get-docker.sh && \ + sudo usermod -aG docker nonroot && \ + git clone --branch ${OD_VERSION} --single-branch https://github.com/thesofakillers/OpenHands.git ${AGENT_DIR} + +WORKDIR ${AGENT_DIR} + +# Assumes that the `agent` conda env already exists, which is created in the `mlebench-env` base image. 
+RUN conda install -y -n ${CONDA_ENV_NAME} -c conda-forge nodejs=18.17.1 && \ + conda run -n ${CONDA_ENV_NAME} pip install poetry>=1.8 && \ + conda run -n ${CONDA_ENV_NAME} poetry install && \ + conda run -n ${CONDA_ENV_NAME} poetry add build && \ + conda clean -afy + +COPY setup.py start.py templates.py utils.py start.sh build.sh ${AGENT_DIR}/ +COPY entrypoint.sh /agent_entrypoint.sh + +RUN chmod +x /agent_entrypoint.sh + +# Allow the agent to do `sudo /build.sh`, but restrict it from reading or editing the file. +RUN echo "ALL ALL=NOPASSWD: ${AGENT_DIR}/build.sh" >> /etc/sudoers && \ + chmod 711 ${AGENT_DIR}/build.sh + +# run the agent's entrypoint script instead (which also runs root entrypoint) +ENTRYPOINT ["/agent_entrypoint.sh"] diff --git a/evaluation/mle_bench/agents/opendevin/build.sh b/evaluation/mle_bench/agents/opendevin/build.sh new file mode 100644 index 000000000000..93b2060fb94d --- /dev/null +++ b/evaluation/mle_bench/agents/opendevin/build.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +cd /home/agent +/opt/conda/bin/conda run -n agent --no-capture-output make build +sudo -u nonroot /opt/conda/bin/conda run -n agent --no-capture-output playwright install diff --git a/evaluation/mle_bench/agents/opendevin/config.yaml b/evaluation/mle_bench/agents/opendevin/config.yaml new file mode 100644 index 000000000000..46ef2d1e7a1e --- /dev/null +++ b/evaluation/mle_bench/agents/opendevin/config.yaml @@ -0,0 +1,13 @@ +opendevin: + start: opendevin/start.sh + dockerfile: opendevin/Dockerfile + kwargs_type: argparse + kwargs: + agent: CodeActAgent + model: gpt-4o + max_time_in_hours: 24 + max_steps: 500 + shm_size: "100G" + env_vars: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + privileged: true diff --git a/evaluation/mle_bench/agents/opendevin/entrypoint.sh b/evaluation/mle_bench/agents/opendevin/entrypoint.sh new file mode 100644 index 000000000000..509a37b198db --- /dev/null +++ b/evaluation/mle_bench/agents/opendevin/entrypoint.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Print commands and their arguments as they are executed +set -x + +# run the root entrypoint in the background +/entrypoint.sh & + +mkdir -p $LOGS_DIR +mkdir -p $AGENT_DIR +{ + # Check if Docker installed, and if so start the Docker daemon in the background. + if [ -x "$(command -v docker)" ]; then + + # if CUDA is available, install the nvidia container toolkit + if [ -x "$(command -v nvidia-smi)" ]; then + # configure production repository + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + # install the nvidia container toolkit + sudo apt-get update + sudo apt-get install -y nvidia-container-toolkit + # configure the runtime + sudo nvidia-ctk runtime configure --runtime=docker + fi + + # (re)start the Docker daemon + if sudo pgrep dockerd > /dev/null; then + sudo pkill dockerd + fi + sudo dockerd > $LOGS_DIR/docker.log 2>&1 & + sleep 5 + else + echo "Docker not installed. Skipping Docker startup." 
+ fi + +} 2>&1 | tee $LOGS_DIR/agent_entrypoint.log + +# signal that the entrypoint has finished +touch $AGENT_DIR/entrypoint_done + +# wait for root entrypoint (a server), need this otherwise the container exits +wait diff --git a/evaluation/mle_bench/agents/opendevin/setup.py b/evaluation/mle_bench/agents/opendevin/setup.py new file mode 100644 index 000000000000..d431beafea9e --- /dev/null +++ b/evaluation/mle_bench/agents/opendevin/setup.py @@ -0,0 +1,49 @@ +import argparse +import os +from pathlib import Path + +from templates import additional_notes_template, config_template +from utils import get_gpu_generation + +OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') +CODE_DIR = Path(os.getenv('CODE_DIR')) +AGENT_DIR = Path(os.getenv('AGENT_DIR')) + +parser = argparse.ArgumentParser() +parser.add_argument('--agent', type=str, required=True) +parser.add_argument('--model', type=str, required=True) +parser.add_argument('--max_time_in_hours', type=float, required=True) +parser.add_argument('--max_steps', type=int, required=True) +parser.add_argument('--shm_size', type=str, required=True) +args, other_args = parser.parse_known_args() + +gpu_generation = get_gpu_generation() +type_of_processor = gpu_generation if gpu_generation else 'CPU' + +config = config_template.substitute( + workspace_base='/home', + workspace_mount_path_in_sandbox='/home', + max_steps=args.max_steps, + model=args.model, + api_key=OPENAI_API_KEY, + agent=args.agent, + shm_size=args.shm_size, +) + +additional_notes = additional_notes_template.substitute( + type_of_processor=type_of_processor, + max_time_in_hours=args.max_time_in_hours, + max_steps=args.max_steps, + workspace=CODE_DIR, +) + +with open(AGENT_DIR / 'config.toml', 'w') as file: + file.write(config.strip()) + +with open('/home/instructions.txt', 'r') as file: + partial_instructions = file.read() + +instructions = partial_instructions + additional_notes + +with open('/home/full_instructions.txt', 'w') as file: + file.write(instructions.strip()) diff --git a/evaluation/mle_bench/agents/opendevin/start.py b/evaluation/mle_bench/agents/opendevin/start.py new file mode 100644 index 000000000000..021eb7fa11a5 --- /dev/null +++ b/evaluation/mle_bench/agents/opendevin/start.py @@ -0,0 +1,168 @@ +import argparse +import asyncio +import json +import os +import random +from pathlib import Path +from typing import Iterable + +from openhands.controller.state.state import State # type: ignore +from openhands.core.config import load_app_config # type: ignore +from openhands.core.main import create_runtime, run_controller # type: ignore +from openhands.events.action import CmdRunAction, IPythonRunCellAction # type: ignore +from openhands.events.event import Event # type: ignore +from openhands.events.observation import ( # type: ignore + CmdOutputObservation, + IPythonRunCellObservation, +) + +parser = argparse.ArgumentParser() +parser.add_argument('--max_time_in_hours', type=float, required=True) +args, other_args = parser.parse_known_args() + +global_events = [] +global_events_lock = asyncio.Lock() + +CODE_DIR = Path(os.getenv('CODE_DIR')) +MAX_TIME_IN_SECONDS = args.max_time_in_hours * 60 * 60 + + +def fake_user_response_fn(state: State) -> str: + return 'Please continue working on the approach you think is most promising. You should complete the task without any human input.' 
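+
+# The callback below mirrors the agent's event stream into CODE_DIR, re-rendering
+# it after every event as a Jupyter notebook (solution.ipynb) and a flat Python
+# script (solution.py) via get_notebook/get_python, so a snapshot of the work done
+# so far is always kept on disk.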
+ + +async def on_event(event: Event): + """Used to stream the agent's Jupyter notebook and Python code to the code directory.""" + + async with global_events_lock: + global_events.append(event) + notebook = get_notebook(global_events) + python = get_python(global_events) + + with open(CODE_DIR / 'solution.ipynb', 'w') as file: + json.dump(notebook, file) + + with open(CODE_DIR / 'solution.py', 'w') as file: + file.write(python) + + +async def run(instructions: str) -> State: + config = load_app_config() + runtime = await create_runtime(config) + sid = random.randint(10_000, 100_000) + event_stream = runtime.event_stream + event_stream.subscribe(sid, on_event) + state = await asyncio.wait_for( + run_controller( + config=config, + task_str=instructions, + runtime=runtime, + exit_on_message=False, + fake_user_response_fn=fake_user_response_fn, + ), + timeout=MAX_TIME_IN_SECONDS, + ) + + return state + + +def get_python(events: Iterable[Event]) -> str: + code = '# %%\n\n' + sep = '\n\n# %%\n\n' + + for event in events: + if isinstance(event, IPythonRunCellAction): + code += event.code + sep + elif isinstance(event, CmdRunAction): + code += f'!{event.command}' + sep + + return code + + +def get_notebook(events: Iterable[Event]) -> dict: + cells = [] + + for event in events: + if event.source != 'agent': + continue + + if isinstance(event, IPythonRunCellAction): + cells.append( + { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': [event.thought], + } + ) + cells.append( + { + 'cell_type': 'code', + 'metadata': {}, + 'source': event.code.split('\n'), + 'outputs': [], + 'execution_count': None, + } + ) + elif isinstance(event, IPythonRunCellObservation): + assert cells + assert cells[-1]['cell_type'] == 'code' + + cells[-1]['outputs'].append( + { + 'name': 'stdout', + 'output_type': 'stream', + 'text': event.content.split('\n'), + } + ) + elif isinstance(event, CmdRunAction): + cells.append( + { + 'cell_type': 'code', + 'metadata': {}, + 'source': [f'!{event.command}'], + 'outputs': [], + 'execution_count': None, + } + ) + elif isinstance(event, CmdOutputObservation): + assert cells + assert cells[-1]['cell_type'] == 'code' + + cells[-1]['outputs'].append( + { + 'name': 'stdout', + 'output_type': 'stream', + 'text': event.content.split('\n'), + } + ) + + notebook = { + 'cells': cells, + 'metadata': { + 'kernelspec': { + 'display_name': 'Python 3', + 'language': 'python', + 'name': 'python3', + }, + 'language_info': { + 'codemirror_mode': {'name': 'ipython', 'version': 3}, + 'file_extension': '.py', + 'mimetype': 'text/x-python', + 'name': 'python', + 'nbconvert_exporter': 'python', + 'pygments_lexer': 'ipython3', + 'version': '3.11.0', + }, + }, + 'nbformat': 4, + 'nbformat_minor': 4, + } + + return notebook + + +if __name__ == '__main__': + with open('/home/full_instructions.txt', 'r') as file: + instructions = file.read() + + asyncio.run(run(instructions)) diff --git a/evaluation/mle_bench/agents/opendevin/start.sh b/evaluation/mle_bench/agents/opendevin/start.sh new file mode 100644 index 000000000000..58efe55e79cd --- /dev/null +++ b/evaluation/mle_bench/agents/opendevin/start.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Print commands and their arguments as they are executed +set -x + +{ + # Check that we can use the GPU in PyTorch + conda run -n agent python -c "import torch; print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'WARNING: No GPU')" + # Check that we can use the GPU in TensorFlow + conda run -n agent python -c "import tensorflow as tf; print('GPUs Available: 
', tf.config.list_physical_devices('GPU'))" + + # wait for agent entrypoint to finish with timeout + timeout=300 # 5 minutes in seconds + elapsed=0 + interval=5 # Check every 5 seconds + while [ ! -f "$AGENT_DIR/entrypoint_done" ]; do + if [ $elapsed -ge $timeout ]; then + echo "Error: Agent entrypoint did not finish within $timeout seconds." + exit 1 + fi + sleep $interval + elapsed=$((elapsed + interval)) + done + echo "Agent entrypoint finished!" + + if ! docker info &>/dev/null; then + echo "Error: Docker is required but is either not running or not installed." + exit 1 + fi + + source /opt/conda/bin/activate agent + + sudo ./build.sh \ + && conda run -n agent --no-capture-output python setup.py "$@" \ + && conda run -n agent --no-capture-output python start.py "$@" +} 2>&1 | tee $LOGS_DIR/agent.log diff --git a/evaluation/mle_bench/agents/opendevin/templates.py b/evaluation/mle_bench/agents/opendevin/templates.py new file mode 100644 index 000000000000..6708472f34f2 --- /dev/null +++ b/evaluation/mle_bench/agents/opendevin/templates.py @@ -0,0 +1,37 @@ +from string import Template + +config_template = Template( + """ +[core] +workspace_base="${workspace_base}" +workspace_mount_path="${workspace_base}" +workspace_mount_path_in_sandbox="${workspace_mount_path_in_sandbox}" +default_agent="${agent}" +max_iterations=${max_steps} +persist_sandbox=false + +[llm] +model="${model}" +api_key="${api_key}" +embedding_model="" + +[sandbox] +user_id=1000 +use_host_network=true +timeout=86400 +shm_size="${shm_size}" +""" +) + +additional_notes_template = Template( + """ +ADDITIONAL NOTES +------ +- **Compute**: You have access to a ${type_of_processor} with the appropriate drivers installed. +- **Total Runtime**: You have a maximum of ${max_time_in_hours} hours to attempt this task. You may iteratively develop your solution within this time frame, but bear in mind that program runtime also counts towards this limit. +- **Total Steps**: You have a maximum of ${max_steps} steps to submit your solution. +- You only have read and write permissions to the files that you create. You therefore cannot use the helper script `/home/validate_submission.sh`. Instead, use the HTTP endpoint (e.g. via `curl -X POST -F "file=@/path/to/your/file" http://localhost:5000/validate`) to validate your submission. +- You must write all of the code necessary to produce the `submission.csv` in `${workspace}`. You must not write to any other location. +- You must use all the time available to you, unless you've achieved the maximum possible score for the competition. Do not stop until you've reached the maximum possible score. 
+""" +) diff --git a/evaluation/mle_bench/agents/opendevin/utils.py b/evaluation/mle_bench/agents/opendevin/utils.py new file mode 100644 index 000000000000..0df89c78efed --- /dev/null +++ b/evaluation/mle_bench/agents/opendevin/utils.py @@ -0,0 +1,26 @@ +import subprocess +from typing import Optional + + +def get_gpu_generation() -> Optional[str]: + """Returns the GPU generation, if available.""" + + try: + result = subprocess.run( + ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + except Exception: + return None + + if result.returncode != 0: + return None + + generation = result.stdout.strip().split('\n') + + if not generation: + return None + + return ', '.join([info.strip() for info in generation]) diff --git a/evaluation/mle_bench/agents/registry.py b/evaluation/mle_bench/agents/registry.py new file mode 100644 index 000000000000..794b75bef146 --- /dev/null +++ b/evaluation/mle_bench/agents/registry.py @@ -0,0 +1,113 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +import yaml +from mlebench.utils import get_logger + +from agents.utils import parse_env_var_values + +logger = get_logger(__name__) + + +@dataclass(frozen=True) +class Agent: + id: str + name: str + agents_dir: Path + start: Path + dockerfile: Path + kwargs: dict + env_vars: dict + privileged: bool = False + kwargs_type: Optional[str] = None + + def __post_init__(self): + assert isinstance( + self.start, Path + ), 'Agent start script must be a pathlib.Path object.' + assert isinstance( + self.dockerfile, Path + ), 'Agent dockerfile must be a pathlib.Path object.' + assert isinstance(self.kwargs, dict), 'Agent kwargs must be a dictionary.' + assert isinstance(self.privileged, bool), 'Agent privileged must be a boolean.' + + if self.kwargs_type is not None: + assert isinstance( + self.kwargs_type, str + ), 'Agent kwargs_type must be a string.' + else: # i.e., self.kwargs_type is None + assert ( + self.kwargs == {} + ), 'Agent kwargs_type must be set if kwargs are provided.' + + assert isinstance(self.env_vars, dict), 'Agent env_vars must be a dictionary.' + + assert self.start.exists(), f'start script {self.start} does not exist.' + assert self.dockerfile.exists(), f'dockerfile {self.dockerfile} does not exist.' 
+ + @staticmethod + def from_dict(data: dict) -> 'Agent': + agents_dir = Path(data['agents_dir']) + try: + return Agent( + id=data['id'], + name=data['name'], + agents_dir=agents_dir, + start=agents_dir / data['start'], + dockerfile=agents_dir / data['dockerfile'], + kwargs=data.get('kwargs', {}), + kwargs_type=data.get('kwargs_type', None), + env_vars=data.get('env_vars', {}), + privileged=data.get('privileged', False), + ) + except KeyError as e: + raise ValueError(f'Missing key {e} in agent config!') + + +class Registry: + def get_agents_dir(self) -> Path: + """Retrieves the agents directory within the registry.""" + + return Path(__file__).parent + + def get_agent(self, agent_id: str) -> Agent: + """Fetch the agent from the registry.""" + + agents_dir = self.get_agents_dir() + + for fpath in agents_dir.glob('**/config.yaml'): + with open(fpath, 'r') as f: + contents = yaml.safe_load(f) + + if agent_id not in contents: + continue + + logger.debug(f'Fetching {fpath}') + + kwargs = contents[agent_id].get('kwargs', {}) + kwargs_type = contents[agent_id].get('kwargs_type', None) + env_vars = contents[agent_id].get('env_vars', {}) + privileged = contents[agent_id].get('privileged', False) + + # env vars can be used both in kwargs and env_vars + kwargs = parse_env_var_values(kwargs) + env_vars = parse_env_var_values(env_vars) + + return Agent.from_dict( + { + **contents[agent_id], + 'id': agent_id, + 'name': fpath.parent.name, + 'agents_dir': agents_dir, + 'kwargs': kwargs, + 'kwargs_type': kwargs_type, + 'env_vars': env_vars, + 'privileged': privileged, + } + ) + + raise ValueError(f'Agent with id {agent_id} not found') + + +registry = Registry() diff --git a/evaluation/mle_bench/agents/run.py b/evaluation/mle_bench/agents/run.py new file mode 100644 index 000000000000..befa95f4ce99 --- /dev/null +++ b/evaluation/mle_bench/agents/run.py @@ -0,0 +1,158 @@ +import logging +import time +from pathlib import Path + +import docker +from docker.models.containers import Container +from dotenv import dotenv_values +from environment.utils import ( + create_competition_container, + extract_from_container, + extract_from_container_sysbox, +) +from mlebench.registry import Competition +from mlebench.utils import purple + +from agents.registry import Agent + +CONSTANTS = dotenv_values(Path(__file__).parent.resolve() / '.shared_env') + + +def save_output(container: Container, save_dir: Path, container_config: dict) -> Path: + """ + Extracts the submission, logs, and code directories from the container + + and saves them to the specified directory. + + Args: + container: The Docker container. + save_dir: The directory where the output file will be saved. + container_config: The container configuration. + Returns: + Path to the output directory. + """ + if 'runtime' in container_config and container_config['runtime'] == 'sysbox-runc': + extraction_fn = extract_from_container_sysbox + else: + extraction_fn = extract_from_container + + for dir_type in ['SUBMISSION_DIR', 'LOGS_DIR', 'CODE_DIR']: + container_dir = CONSTANTS[dir_type] + extraction_fn(container, container_dir, save_dir) + + return save_dir + + +def execute_agent(container: Container, agent: Agent, logger: logging.Logger): + """ + Initiates the agent via its start script inside the container. 
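+
+    Agent kwargs are forwarded either as argparse-style flags (e.g. `--model gpt-4o`)
+    or as `key=value` OmegaConf overrides, depending on `agent.kwargs_type`.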
+ """ + cmd = ['bash', f"{CONSTANTS['AGENT_DIR']}/start.sh"] + + if agent.kwargs_type == 'argparse': + for key, value in agent.kwargs.items(): + cmd += [f'--{key}', str(value)] + + if agent.kwargs_type == 'omegaconf': + cmd += [f'{key}={value}' for key, value in agent.kwargs.items()] + + logger.info('Running agent...') + exit_code, output = container.exec_run(cmd, stream=True, user='nonroot') + + for chunk in output: + logger.info(f"[Container] {chunk.decode('utf-8').strip()}") + + +def clean_up( + container: Container, logger: logging.Logger, retain: bool = False +) -> bool: + """ + Stops and removes the container. + + Returns: + True if successful, False otherwise. + """ + logger.info(f'Cleaning up container: {container.name}') + try: + container.stop() + if not retain: + container.remove() + logger.info(f'Container {container.name} stopped and removed.') + return True + except Exception as e: + logger.error( + f'Error cleaning up: {e}. You may wish to manually check the status of the {container.name} container.' + ) + return False + + +def run_in_container( + client: docker.DockerClient, + competition: Competition, + agent: Agent, + image: str, + container_config: dict, + retain_container: bool, + run_dir: Path, + logger: logging.Logger, +) -> Path: + """ + Runs environment containing the competition and agent for a set maximum amount of time. + + Args: + client: Docker client. + competition: The competition to run. + agent: The agent to run. + image: The Docker image to use. Assumes the image is built. + container_config: Configuration for the Docker container. + retain_container: Whether to retain the container after the run instead of removing it. + run_dir: Path to the directory where all assets associated with the run are stored. + logger: Logger for the run. + + Returns: + Path to the output file. + """ + volumes_config = { + competition.public_dir.resolve().as_posix(): { + 'bind': '/home/data', + 'mode': 'ro', + }, + competition.private_dir.resolve().as_posix(): { + 'bind': f'/private/data/{competition.id}/prepared/private/', + 'mode': 'ro', + }, + } + + container = create_competition_container( + client=client, + competition=competition, + container_config=container_config, + volumes_config=volumes_config, + env_vars={ + 'COMPETITION_ID': competition.id, + **agent.env_vars, + }, + container_image=image, + privileged=agent.privileged, + ) + + logger.info(purple(f'Run started: {run_dir}')) + try: + time_start = time.monotonic() + container.start() + exit_code, _ = container.exec_run( + 'timeout 60s sh -c "while ! curl -s http://localhost:5000/health > /dev/null; do sleep 1; done"' + ) + if exit_code != 0: + raise RuntimeError( + 'The grading server failed to start within 60 seconds. This is likely due to an error in `entrypoint.sh`; check the logs.' 
+ ) + execute_agent(container, agent, logger) + save_output(container, run_dir, container_config) + time_end = time.monotonic() + logger.info(f'Run completed in {time_end - time_start:.2f} seconds.') + return run_dir + except Exception as e: + raise e + finally: + clean_up(container, logger, retain_container) diff --git a/evaluation/mle_bench/agents/utils.py b/evaluation/mle_bench/agents/utils.py new file mode 100644 index 000000000000..874e9f04a580 --- /dev/null +++ b/evaluation/mle_bench/agents/utils.py @@ -0,0 +1,43 @@ +import os +import re +from typing import Optional + + +def get_env_var(value: str) -> Optional[str]: + """Returns the name of the environment variable in the format `${secrets.}`.""" + + if not isinstance(value, str): + return None + + env_var_pattern = r'\$\{\{\s*secrets\.(\w+)\s*\}\}' + match = re.match(env_var_pattern, value) + + if not match: + return None + + return match.group(1) + + +def is_env_var(value: str) -> bool: + """Checks if the value is an environment variable.""" + + return get_env_var(value) is not None + + +def parse_env_var_values(dictionary: dict) -> dict: + """ + Parses any values in the dictionary that match the ${{ secrets.ENV_VAR }} pattern and replaces + them with the value of the ENV_VAR environment variable. + """ + for key, value in dictionary.items(): + if not is_env_var(value): + continue + + env_var = get_env_var(value) + + if os.getenv(env_var) is None: + raise ValueError(f'Environment variable `{env_var}` is not set!') + + dictionary[key] = os.getenv(env_var) + + return dictionary diff --git a/evaluation/mle_bench/environment/Dockerfile b/evaluation/mle_bench/environment/Dockerfile new file mode 100644 index 000000000000..c0f20b3cc91f --- /dev/null +++ b/evaluation/mle_bench/environment/Dockerfile @@ -0,0 +1,87 @@ +FROM ubuntu:20.04 + +# Avoid interactive dialog from apt-get and other packages requiring configuration +ENV DEBIAN_FRONTEND=noninteractive + +# install basic packages +RUN apt-get update && apt-get install -y \ + curl \ + wget \ + git \ + vim \ + nano \ + unzip \ + zip \ + p7zip-full \ + python3 \ + python3-pip \ + python3-venv \ + python3-dev \ + python-is-python3 \ + build-essential \ + openssh-server \ + tmux \ + gettext \ + sudo \ + ffmpeg \ + libsm6 \ + libxext6 \ + && pip install jupyter \ + && rm -rf /var/lib/apt/lists/* # removes cache + +RUN pip install virtualenv \ + && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh \ + && bash /tmp/miniconda.sh -b -p /opt/conda \ + && rm /tmp/miniconda.sh \ + && /opt/conda/bin/conda init + +ARG CONDA_ENV_NAME=agent +ARG PYTHON_VERSION=3.11 +ARG REQUIREMENTS=/tmp/requirements.txt + +COPY environment/requirements.txt ${REQUIREMENTS} + +# create conda environment and optionally install the requirements to it +RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -y +ARG INSTALL_HEAVY_DEPENDENCIES=true +ENV INSTALL_HEAVY_DEPENDENCIES=${INSTALL_HEAVY_DEPENDENCIES} + +# The rest of your Dockerfile +RUN if [ "$INSTALL_HEAVY_DEPENDENCIES" = "true" ]; then \ + /opt/conda/bin/conda run -n ${CONDA_ENV_NAME} pip install -r /tmp/requirements.txt && \ + /opt/conda/bin/conda run -n ${CONDA_ENV_NAME} pip install tensorflow[and-cuda]==2.17 && \ + /opt/conda/bin/conda run -n ${CONDA_ENV_NAME} pip install torch==2.2.0 torchaudio==2.2.0 torchtext==0.17.0 torchvision==0.17.0 && \ + /opt/conda/bin/conda clean -afy ; fi + +ENV PATH="/opt/conda/bin:${PATH}" + +# Installs from here onward go into the conda base env; previous was 
installed to /usr/bin/python + +# Install stuff for the grading server: mlebench and flask +COPY . /mlebench +RUN pip install flask && pip install -e /mlebench + +# Reset DEBIAN_FRONTEND +ENV DEBIAN_FRONTEND= + +# Make private directory (root) owner-only. Grading server will be added here, later in the build +# The test set answers will be added here separately via a mounted docker volume +RUN mkdir /private && chmod 700 /private + +# Copy over relevant files +COPY environment/grading_server.py /private/grading_server.py +COPY environment/instructions.txt /home/instructions.txt +COPY environment/instructions_obfuscated.txt /home/instructions_obfuscated.txt +COPY environment/validate_submission.sh /home/validate_submission.sh +COPY environment/entrypoint.sh /entrypoint.sh + +# Create nonroot user; make entrypoint executable +RUN useradd -m nonroot \ + && mkdir /home/submission \ + && chmod +x /entrypoint.sh + +WORKDIR /home + +# IMPORTANT: This needs to run as root! Downstream Dockerfiles must not change the default USER for when the container starts. +# Entrypoint script is in charge of setting up the user environment and running the grading server +ENTRYPOINT ["/entrypoint.sh"] diff --git a/evaluation/mle_bench/environment/__init__.py b/evaluation/mle_bench/environment/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/evaluation/mle_bench/environment/config/container_configs/default.json b/evaluation/mle_bench/environment/config/container_configs/default.json new file mode 100644 index 000000000000..3200d2c01105 --- /dev/null +++ b/evaluation/mle_bench/environment/config/container_configs/default.json @@ -0,0 +1,6 @@ +{ + "mem_limit": null, + "shm_size": "4G", + "nano_cpus": 4e9, + "runtime": "sysbox-runc" +} diff --git a/evaluation/mle_bench/environment/defaults.py b/evaluation/mle_bench/environment/defaults.py new file mode 100644 index 000000000000..926cbe5f19b7 --- /dev/null +++ b/evaluation/mle_bench/environment/defaults.py @@ -0,0 +1,5 @@ +from pathlib import Path + +DEFAULT_CONTAINER_CONFIG_PATH = ( + Path(__file__).parent / 'config' / 'container_configs' / 'default.json' +) diff --git a/evaluation/mle_bench/environment/entrypoint.sh b/evaluation/mle_bench/environment/entrypoint.sh new file mode 100644 index 000000000000..95d988168ec0 --- /dev/null +++ b/evaluation/mle_bench/environment/entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Print commands and their arguments as they are executed +set -x + +{ + # log into /home/logs + LOGS_DIR=/home/logs + mkdir -p $LOGS_DIR + + # chmod the /home directory such that nonroot users can work on everything within it. We do this at container start + # time so that anything added later in agent-specific Dockerfiles will also receive the correct permissions. + # (this command does `chmod a+rw /home` but with the exception of /home/data, which is a read-only volume) + find /home -path /home/data -prune -o -exec chmod a+rw {} \; + ls -l /home + + # Launch grading server, stays alive throughout container lifetime to service agent requests. 
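+    # The server (environment/grading_server.py) listens on http://localhost:5000 and
+    # exposes /validate and /health; agents reach it via /home/validate_submission.sh.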
+ /opt/conda/bin/python /private/grading_server.py +} 2>&1 | tee $LOGS_DIR/entrypoint.log diff --git a/evaluation/mle_bench/environment/grading_server.py b/evaluation/mle_bench/environment/grading_server.py new file mode 100644 index 000000000000..6c694c3fc8ea --- /dev/null +++ b/evaluation/mle_bench/environment/grading_server.py @@ -0,0 +1,46 @@ +import os +from pathlib import Path + +from flask import Flask, jsonify, request +from mlebench.grade import validate_submission +from mlebench.registry import registry + +app = Flask(__name__) + +PRIVATE_DATA_DIR = '/private/data' +COMPETITION_ID = os.getenv( + 'COMPETITION_ID' +) # This is populated for us at container runtime + + +def run_validation(submission: Path) -> str: + new_registry = registry.set_data_dir(Path(PRIVATE_DATA_DIR)) + competition = new_registry.get_competition(COMPETITION_ID) + is_valid, message = validate_submission(submission, competition) + return message + + +@app.route('/validate', methods=['POST']) +def validate(): + submission_file = request.files['file'] + submission_path = Path('/tmp/submission_to_validate.csv') + submission_file.save(submission_path) + + try: + result = run_validation(submission_path) + except Exception as e: + # Server error + return jsonify( + {'error': 'An unexpected error occurred.', 'details': str(e)} + ), 500 + + return jsonify({'result': result}) + + +@app.route('/health', methods=['GET']) +def health(): + return jsonify({'status': 'running'}), 200 + + +if __name__ == '__main__': + app.run(host='0.0.0.0', port=5000) diff --git a/evaluation/mle_bench/environment/instructions.txt b/evaluation/mle_bench/environment/instructions.txt new file mode 100644 index 000000000000..5725a2899cfd --- /dev/null +++ b/evaluation/mle_bench/environment/instructions.txt @@ -0,0 +1,18 @@ +You are participating in *MLE-bench*, an offline version of Kaggle competitions adapted to test the coding ability of agents like yourself. + +You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file. + +BENCHMARK INSTRUCTIONS +------ +First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`. + +- **Competition-specific instructions**: The instructions for the competition are available in `/home/data/description.md`. Read this file to understand the problem setup for the competition you are currently solving. +- **Dataset**: The dataset files for the competition are available in `/home/data/`. +- **Submission**: We have simplified all competition types to use a simple CSV submission format. Regardless of what the competition instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt. +- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each competition. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the competition instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the competition task on the train/test splits available to you. 
+- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server. +- **Cheating**: You are not allowed to view others' solutions to this competition. If we detect that your submission is plagiarized from others, the submission will be disqualified. + +Your submission will be graded based on the metric in the competition-specific instructions. Your goal is to achieve the best score possible on the competition task. + +Wherever there is a conflict between the points above and the competition-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup. diff --git a/evaluation/mle_bench/environment/instructions_obfuscated.txt b/evaluation/mle_bench/environment/instructions_obfuscated.txt new file mode 100644 index 000000000000..95353de44a56 --- /dev/null +++ b/evaluation/mle_bench/environment/instructions_obfuscated.txt @@ -0,0 +1,18 @@ +You are participating in *MLE-bench*, a benchmark designed to test the coding ability of agents like yourself. + +You will be given a machine learning task. You must solve the task by training a model and running the model on the test set to produce a submission file. You may not hand-label the data to produce a submission file. + +BENCHMARK INSTRUCTIONS +------ +First, please read the instructions below which are specific to our benchmark setup. Note, these instructions can be found again in `/home/instructions.txt`. + +- **Task-specific instructions**: The instructions for the task are available in `/home/data/description.md`. Read this file to understand the problem setup for the task you are currently solving. +- **Dataset**: The dataset files for the task are available in `/home/data/`. +- **Submission**: Regardless of what the task instructions say, you MUST produce a submission file at `/home/submission/submission.csv`. We will only evaluate submissions found at this path; failure to submit a `submission.csv` at this path will result in a failed attempt. +- **Test set**: In general, since we do not have access to the original test labels, we have constructed our own train/test split for each task. As such, the dataset you see in `/home/data/` may differ slightly from what is described in the task instructions. Nevertheless the files available to you should otherwise be similar to the original and you should proceed with the task on the train/test splits available to you. +- **Submission checker**: We provide a tool for checking the validity of your submission file, in the form of a server endpoint at `http://localhost:5000/validate`. If you post a submission file to the server (e.g. `curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL}`), you will receive a response message telling you if your submission is valid or not (note that this does NOT give you a score). For your convenience, we provide a script `/home/validate_submission.sh` which can help you make this submission to the validation server. +- **Cheating**: You are not allowed to view others' solutions to this task. 
If we detect that your submission is plagiarized from others, the submission will be disqualified. + +Your submission will be graded based on the metric in the task-specific instructions. Your goal is to achieve the best score possible on the task. + +Wherever there is a conflict between the points above and the task-specific instructions in `/home/data/description.md`, the above points should always take priority since these are always true for our offline benchmark setup. diff --git a/evaluation/mle_bench/environment/utils.py b/evaluation/mle_bench/environment/utils.py new file mode 100644 index 000000000000..e20eeb45c5d0 --- /dev/null +++ b/evaluation/mle_bench/environment/utils.py @@ -0,0 +1,164 @@ +import json +import os +import tarfile +import uuid +from pathlib import Path +from typing import Optional + +import docker +from docker import DockerClient +from docker.models.containers import Container +from mlebench.registry import Competition +from mlebench.utils import get_logger, get_timestamp + +logger = get_logger(__name__) + + +def parse_container_config(raw_config: dict) -> dict: + """ + Parses raw configuration for container. + Mostly necessary for handling GPU configuration. + """ + new_config = {k: v for k, v in raw_config.items() if k != 'gpus'} + + # handle GPU configuration + if 'gpus' in raw_config and raw_config['gpus'] != 0: + gpu_count = raw_config['gpus'] + new_config['device_requests'] = [ + docker.types.DeviceRequest(count=gpu_count, capabilities=[['gpu']]) + ] + + # cast nano_cpus to int + new_config['nano_cpus'] = ( + int(new_config['nano_cpus']) if 'nano_cpus' in new_config else None + ) + + return new_config + + +def reconcile_args( + config_args: Optional[str] = None, dict_args: Optional[dict] = None +) -> dict: + """ + Reconcile the args specified by config file and args specified in a dictionary + + In case of duplicates, config file's args take precedence + """ + reconciled_args = {} + if dict_args: + reconciled_args.update(dict_args) + if config_args: + reconciled_args = json.loads(config_args) + return reconciled_args + + +def extract_from_container_sysbox( + container: Container, container_file_path: str, local_dir: Path +): + """ + Extracts a file or directory from a container to a specified local directory. 
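+
+    Unlike `extract_from_container`, this variant builds a tar archive inside the
+    container via `exec_run` and streams it out; `save_output` in agents/run.py
+    selects it when the container runtime is sysbox-runc.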
+ """ + try: + # Get the directory and base name of the file or directory to extract + dir_name = os.path.dirname(container_file_path) + base_name = os.path.basename(container_file_path) + + # Construct the tar command to tar the base_name and output to stdout + command = ['tar', 'cf', '-', '-C', dir_name, base_name] + exec_result = container.exec_run( + cmd=command, stdout=True, stderr=True, stream=True + ) + + # Create a file-like object from the output generator + import io + + class StreamWrapper(io.RawIOBase): + def __init__(self, generator): + self.generator = generator + self.leftover = b'' + + def readinto(self, b): + try: + chunk = next(self.generator) + except StopIteration: + return 0 # EOF + n = len(chunk) + b[:n] = chunk + return n + + stream_wrapper = StreamWrapper(exec_result.output) + + # Extract the tar stream + with tarfile.open(fileobj=stream_wrapper, mode='r|') as tar: + tar.extractall(path=local_dir) + + except FileNotFoundError: + logger.warning(f'Nothing found in container at {container_file_path}.') + except Exception as e: + logger.error(f'Error extracting output file: {e}') + + +def extract_from_container( + container: Container, container_file_path: str, local_dir: Path +): + """ + Extracts a file or directory from a container to a specified local directory. + """ + try: + stream, _ = container.get_archive(container_file_path) + tmp_tar_path = local_dir / 'tmp.tar' + + with open(tmp_tar_path, 'wb') as f: + for chunk in stream: + f.write(chunk) + + # extracts the original file(s) from the tar file + with tarfile.open(tmp_tar_path, 'r') as tar: + tar.extractall(path=local_dir) + + tmp_tar_path.unlink() + except FileNotFoundError: + logger.warning(f'Nothing found in container at {container_file_path}.') + except Exception as e: + logger.error(f'Error extracting output file: {e}') + + +def create_competition_container( + client: DockerClient, + competition: Competition, + container_config: dict, + volumes_config: dict, + env_vars: dict, + container_image: str = 'mlebench-env', + privileged: bool = False, +) -> Container: + """ + Creates a container for the given competition, mounting the competition data and agent volumes. + + Args: + client: Docker client to interact with Docker. + competition: Competition object + container_config: Docker configuration for the container. + volumes_config: Docker bind-mount configuration for the container. + env_vars: Environment variables to pass to the container. + container_image: Docker image to use for the container. + privileged: Whether to run the container in privileged mode. Default is False. + + Returns: + Created Docker container. + """ + unique_id = str(uuid.uuid4().hex) + timestamp = get_timestamp() + + container = client.containers.create( + image=container_image, + name=f'competition-{competition.id}-{timestamp}-{unique_id}', + detach=True, + **parse_container_config(container_config), + volumes=volumes_config, + environment=env_vars, + privileged=privileged, + ) + + logger.info(f'Container created: {container.name}') + return container diff --git a/evaluation/mle_bench/environment/validate_submission.sh b/evaluation/mle_bench/environment/validate_submission.sh new file mode 100644 index 000000000000..2a062542b794 --- /dev/null +++ b/evaluation/mle_bench/environment/validate_submission.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# This script is a tool for agents to submit a file to the grading server for validation. 
+ +# validate_submission.sh +# Usage: ./validate_submission.sh user_submission.csv + +SUBMISSION_FILE=$1 +SERVER_URL="http://localhost:5000/validate" + +if [ -z "$SUBMISSION_FILE" ]; then + echo "Usage: $0 " + exit 1 +fi + +if [ ! -f "$SUBMISSION_FILE" ]; then + echo "File $SUBMISSION_FILE does not exist." + exit 1 +fi + +curl -X POST -F "file=@${SUBMISSION_FILE}" ${SERVER_URL} diff --git a/evaluation/mle_bench/experiments/competition_categories.csv b/evaluation/mle_bench/experiments/competition_categories.csv new file mode 100644 index 000000000000..143f5249c31d --- /dev/null +++ b/evaluation/mle_bench/experiments/competition_categories.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b6967e944e6b105f54f943cd13042158f49fec5206c0ff536d16d42cf39b634 +size 6322 diff --git a/evaluation/mle_bench/experiments/make_submission.py b/evaluation/mle_bench/experiments/make_submission.py new file mode 100644 index 000000000000..580b78e5bd0b --- /dev/null +++ b/evaluation/mle_bench/experiments/make_submission.py @@ -0,0 +1,101 @@ +import argparse +import json +from logging import Logger +from pathlib import Path + +import pandas as pd +from mlebench.utils import get_logger + + +def main( + metadata_path: Path, + output: Path, + rel_log_path: Path, + rel_code_path: Path, + logger: Logger, +): + run_statuses = [] + submission_lines = [] + + with open(metadata_path, 'r') as f: + metadata = json.load(f) + + for run_id in metadata['runs']: + run_dir = metadata_path.parent / run_id + # run_id is something like f"{comp_id}_bfa0c73d" + comp_id = run_id.split('_')[0] + + log_path = run_dir / rel_log_path + has_log = log_path.exists() + code_path = run_dir / rel_code_path + has_code = code_path.exists() + submission_path = run_dir / 'submission/submission.csv' + submitted = submission_path.exists() + + submission_lines.append( + { + 'competition_id': comp_id, + 'submission_path': submission_path.as_posix() if submitted else None, + 'logs_path': log_path.as_posix() if has_log else None, + 'code_path': code_path.as_posix() if has_code else None, + } + ) + + run_status = { + 'competition_id': comp_id[:20], + 'has_log': has_log, + 'has_code': has_code, + 'submitted': submitted, + } + run_statuses.append(run_status) + + status_df = pd.DataFrame(run_statuses) + logger.info(f'All runs:\n{status_df.to_string()}') + + # Create submission.jsonl + with open(output, 'w') as f: + for line in submission_lines: + f.write(f'{json.dumps(line)}\n') + logger.info(f'Written sorted submission to {output}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Makes a submission.jsonl for mlebench grade from a mlebench run group' + ) + + parser.add_argument( + '--metadata', + type=str, + help='Path to metadata.json file', + required=True, + ) + parser.add_argument( + '--output', + type=str, + help='Path to the output jsonl file which can be used for `mlebench grade`', + default='submission.jsonl', + ) + parser.add_argument( + '--rel-log-path', + type=str, + help='Path to logfile for analysis, relative to a run checkpoint. For example, if your logs are at `{runs_dir}/{run_id}/{checkpoint}/logs/agent.log`, this should be `logs/agent.log`.', + default='logs/agent.log', + ) + parser.add_argument( + '--rel-code-path', + type=str, + help='Path to code file for analysis, relative to a run checkpoint. 
For example, if your code is at `{runs_dir}/{run_id}/{checkpoint}/code/train.py`, this should be `code/train.py`.', + default='code/train.py', + ) + + args = parser.parse_args() + logger = get_logger(__name__) + + main( + metadata_path=Path(args.metadata), + output=Path(args.output), + rel_log_path=Path(args.rel_log_path), + rel_code_path=Path(args.rel_code_path), + logger=logger, + ) diff --git a/evaluation/mle_bench/experiments/splits/all.txt b/evaluation/mle_bench/experiments/splits/all.txt new file mode 100644 index 000000000000..bcc876aa0298 --- /dev/null +++ b/evaluation/mle_bench/experiments/splits/all.txt @@ -0,0 +1,75 @@ +3d-object-detection-for-autonomous-vehicles +AI4Code +aerial-cactus-identification +alaska2-image-steganalysis +aptos2019-blindness-detection +billion-word-imputation +bms-molecular-translation +cassava-leaf-disease-classification +cdiscount-image-classification-challenge +chaii-hindi-and-tamil-question-answering +champs-scalar-coupling +denoising-dirty-documents +detecting-insults-in-social-commentary +dog-breed-identification +dogs-vs-cats-redux-kernels-edition +facebook-recruiting-iii-keyword-extraction +freesound-audio-tagging-2019 +google-quest-challenge +google-research-identify-contrails-reduce-global-warming +h-and-m-personalized-fashion-recommendations +herbarium-2020-fgvc7 +herbarium-2021-fgvc8 +herbarium-2022-fgvc9 +histopathologic-cancer-detection +hms-harmful-brain-activity-classification +hotel-id-2021-fgvc8 +hubmap-kidney-segmentation +icecube-neutrinos-in-deep-ice +imet-2020-fgvc7 +inaturalist-2019-fgvc6 +iwildcam-2019-fgvc6 +iwildcam-2020-fgvc7 +jigsaw-toxic-comment-classification-challenge +jigsaw-unintended-bias-in-toxicity-classification +kuzushiji-recognition +leaf-classification +learning-agency-lab-automated-essay-scoring-2 +lmsys-chatbot-arena +mlsp-2013-birds +multi-modal-gesture-recognition +new-york-city-taxi-fare-prediction +nfl-player-contact-detection +nomad2018-predict-transparent-conductors +osic-pulmonary-fibrosis-progression +petfinder-pawpularity-score +plant-pathology-2020-fgvc7 +plant-pathology-2021-fgvc8 +predict-volcanic-eruptions-ingv-oe +random-acts-of-pizza +ranzcr-clip-catheter-line-classification +rsna-2022-cervical-spine-fracture-detection +rsna-breast-cancer-detection +rsna-miccai-brain-tumor-radiogenomic-classification +seti-breakthrough-listen +siim-covid19-detection +siim-isic-melanoma-classification +smartphone-decimeter-2022 +spooky-author-identification +stanford-covid-vaccine +statoil-iceberg-classifier-challenge +tabular-playground-series-dec-2021 +tabular-playground-series-may-2022 +tensorflow-speech-recognition-challenge +tensorflow2-question-answering +text-normalization-challenge-english-language +text-normalization-challenge-russian-language +tgs-salt-identification-challenge +the-icml-2013-whale-challenge-right-whale-redux +tweet-sentiment-extraction +us-patent-phrase-to-phrase-matching +uw-madison-gi-tract-image-segmentation +ventilator-pressure-prediction +vesuvius-challenge-ink-detection +vinbigdata-chest-xray-abnormalities-detection +whale-categorization-playground diff --git a/evaluation/mle_bench/experiments/splits/dev.txt b/evaluation/mle_bench/experiments/splits/dev.txt new file mode 100644 index 000000000000..8b69e5978e81 --- /dev/null +++ b/evaluation/mle_bench/experiments/splits/dev.txt @@ -0,0 +1,7 @@ +invasive-species-monitoring +ml2021spring-hw2 +movie-review-sentiment-analysis-kernels-only +paddy-disease-classification +plant-seedlings-classification +playground-series-s3e18 
+spaceship-titanic diff --git a/evaluation/mle_bench/experiments/splits/high.txt b/evaluation/mle_bench/experiments/splits/high.txt new file mode 100644 index 000000000000..f430ee5c94be --- /dev/null +++ b/evaluation/mle_bench/experiments/splits/high.txt @@ -0,0 +1,15 @@ +3d-object-detection-for-autonomous-vehicles +bms-molecular-translation +google-research-identify-contrails-reduce-global-warming +hms-harmful-brain-activity-classification +iwildcam-2019-fgvc6 +nfl-player-contact-detection +predict-volcanic-eruptions-ingv-oe +rsna-2022-cervical-spine-fracture-detection +rsna-breast-cancer-detection +rsna-miccai-brain-tumor-radiogenomic-classification +siim-covid19-detection +smartphone-decimeter-2022 +stanford-covid-vaccine +vesuvius-challenge-ink-detection +vinbigdata-chest-xray-abnormalities-detection diff --git a/evaluation/mle_bench/experiments/splits/low.txt b/evaluation/mle_bench/experiments/splits/low.txt new file mode 100644 index 000000000000..a4e315866b64 --- /dev/null +++ b/evaluation/mle_bench/experiments/splits/low.txt @@ -0,0 +1,22 @@ +aerial-cactus-identification +aptos2019-blindness-detection +denoising-dirty-documents +detecting-insults-in-social-commentary +dog-breed-identification +dogs-vs-cats-redux-kernels-edition +histopathologic-cancer-detection +jigsaw-toxic-comment-classification-challenge +leaf-classification +mlsp-2013-birds +new-york-city-taxi-fare-prediction +nomad2018-predict-transparent-conductors +plant-pathology-2020-fgvc7 +random-acts-of-pizza +ranzcr-clip-catheter-line-classification +siim-isic-melanoma-classification +spooky-author-identification +tabular-playground-series-dec-2021 +tabular-playground-series-may-2022 +text-normalization-challenge-english-language +text-normalization-challenge-russian-language +the-icml-2013-whale-challenge-right-whale-redux diff --git a/evaluation/mle_bench/experiments/splits/medium.txt b/evaluation/mle_bench/experiments/splits/medium.txt new file mode 100644 index 000000000000..9f1a21d852de --- /dev/null +++ b/evaluation/mle_bench/experiments/splits/medium.txt @@ -0,0 +1,38 @@ +AI4Code +alaska2-image-steganalysis +billion-word-imputation +cassava-leaf-disease-classification +cdiscount-image-classification-challenge +chaii-hindi-and-tamil-question-answering +champs-scalar-coupling +facebook-recruiting-iii-keyword-extraction +freesound-audio-tagging-2019 +google-quest-challenge +h-and-m-personalized-fashion-recommendations +herbarium-2020-fgvc7 +herbarium-2021-fgvc8 +herbarium-2022-fgvc9 +hotel-id-2021-fgvc8 +hubmap-kidney-segmentation +icecube-neutrinos-in-deep-ice +imet-2020-fgvc7 +inaturalist-2019-fgvc6 +iwildcam-2020-fgvc7 +jigsaw-unintended-bias-in-toxicity-classification +kuzushiji-recognition +learning-agency-lab-automated-essay-scoring-2 +lmsys-chatbot-arena +multi-modal-gesture-recognition +osic-pulmonary-fibrosis-progression +petfinder-pawpularity-score +plant-pathology-2021-fgvc8 +seti-breakthrough-listen +statoil-iceberg-classifier-challenge +tensorflow-speech-recognition-challenge +tensorflow2-question-answering +tgs-salt-identification-challenge +tweet-sentiment-extraction +us-patent-phrase-to-phrase-matching +uw-madison-gi-tract-image-segmentation +ventilator-pressure-prediction +whale-categorization-playground diff --git a/evaluation/mle_bench/experiments/splits/spaceship-titanic.txt b/evaluation/mle_bench/experiments/splits/spaceship-titanic.txt new file mode 100644 index 000000000000..ec46c5cb5dc3 --- /dev/null +++ b/evaluation/mle_bench/experiments/splits/spaceship-titanic.txt @@ -0,0 +1 @@ 
+spaceship-titanic diff --git a/evaluation/mle_bench/run_infer.py b/evaluation/mle_bench/run_infer.py new file mode 100644 index 000000000000..eaff038c0c76 --- /dev/null +++ b/evaluation/mle_bench/run_infer.py @@ -0,0 +1,241 @@ +import argparse +import asyncio +import json +import logging +import os +import time +import traceback +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import docker +from agents.registry import Agent +from agents.registry import registry as agent_registry +from agents.run import run_in_container +from environment.defaults import DEFAULT_CONTAINER_CONFIG_PATH +from mlebench.data import is_dataset_prepared +from mlebench.registry import Competition, registry +from mlebench.utils import create_run_dir, get_logger, get_timestamp + +logger = get_logger(__name__) + + +@dataclass(frozen=True) +class Task: + run_id: str + seed: int + image: str + path_to_run_group: Path + path_to_run: Path + agent: Agent + competition: Competition + container_config: dict[str, Any] + + +async def worker( + idx: int, + queue: asyncio.Queue[Task], + client: docker.DockerClient, + tasks_outputs: dict[str, dict[str, Any]], +) -> None: + while True: + task = await queue.get() + + # Create logger for the run + run_logger = get_logger(str(task.path_to_run)) + log_file_handler = logging.FileHandler(task.path_to_run / 'run.log') + log_file_handler.setFormatter( + logging.getLogger().handlers[0].formatter + ) # match the formatting we have + run_logger.addHandler(log_file_handler) + run_logger.propagate = False + + run_logger.info( + f'[Worker {idx}] Running seed {task.seed} for {task.competition.id} and agent {task.agent.name}' + ) + + task_output = {} + try: + await asyncio.to_thread( + run_in_container, + client=client, + competition=task.competition, + agent=task.agent, + image=task.agent.name, + container_config=task.container_config, + retain_container=args.retain, + run_dir=task.path_to_run, + logger=run_logger, + ) + task_output['success'] = True + + run_logger.info( + f'[Worker {idx}] Finished running seed {task.seed} for {task.competition.id} and agent {task.agent.name}' + ) + except Exception as e: + stack_trace = traceback.format_exc() + run_logger.error(type(e)) + run_logger.error(stack_trace) + run_logger.error( + f'Run failed for seed {task.seed}, agent {task.agent.id} and competition ' + f'{task.competition.id}' + ) + task_output['success'] = False + finally: + tasks_outputs[task.run_id] = task_output + queue.task_done() + + +async def main(args): + client = docker.from_env() + global registry + registry = registry.set_data_dir(Path(args.data_dir)) + + agent = agent_registry.get_agent(args.agent_id) + if agent.privileged and os.environ.get( + 'I_ACCEPT_RUNNING_PRIVILEGED_CONTAINERS', 'False' + ).lower() not in ('true', '1', 't'): + raise ValueError( + 'Agent requires running in a privileged container, but the environment variable `I_ACCEPT_RUNNING_PRIVILEGED_CONTAINERS` is not set to `True`! ' + 'Carefully consider if you wish to run this agent before continuing. See agents/README.md for more details.' 
+ ) + + run_group = f'{get_timestamp()}_run-group_{agent.name}' + + # Load competition ids and check all are prepared + with open(args.competition_set, 'r') as f: + competition_ids = [ + line.strip() for line in f.read().splitlines() if line.strip() + ] + for competition_id in competition_ids: + competition = registry.get_competition(competition_id) + if not is_dataset_prepared(competition): + raise ValueError( + f'Dataset for competition `{competition.id}` is not prepared! ' + f'Please run `mlebench prepare -c {competition.id}` to prepare the dataset.' + ) + + with open(args.container_config, 'r') as f: + container_config = json.load(f) + + # Create tasks for each (competition * seed) + logger.info(f'Launching run group: {run_group}') + tasks = [] + for seed in range(args.n_seeds): + for competition_id in competition_ids: + competition = registry.get_competition(competition_id) + run_dir = create_run_dir(competition.id, agent.id, run_group) + run_id = run_dir.stem + task = Task( + run_id=run_id, + seed=seed, + image=agent.name, + agent=agent, + competition=competition, + path_to_run_group=run_dir.parent, + path_to_run=run_dir, + container_config=container_config, + ) + tasks.append(task) + + logger.info(f'Creating {args.n_workers} workers to serve {len(tasks)} tasks...') + + # Create queue of tasks, and assign workers to run them + queue = asyncio.Queue() + for task in tasks: + queue.put_nowait(task) + workers = [] + tasks_outputs = {} + for idx in range(args.n_workers): + w = asyncio.create_task(worker(idx, queue, client, tasks_outputs)) + workers.append(w) + + # Wait for all tasks to be completed and collect results + started_at = time.monotonic() + await queue.join() + time_taken = time.monotonic() - started_at + + for w in workers: + w.cancel() # Cancel all workers now that the queue is empty + + await asyncio.gather(*workers, return_exceptions=True) + + # Generate metadata.json + metadata = { + 'run_group': run_group, + 'created_at': get_timestamp(), + 'runs': tasks_outputs, + } + run_group_dir = os.path.join(os.path.curdir, 'runs', run_group) + + if not os.path.exists(run_group_dir): + os.mkdir(run_group_dir) + + with open(os.path.join(run_group_dir, 'metadata.json'), 'w') as f: + logger.info(f"Writing results to {run_group_dir + '/metadata.json'}") + json.dump(metadata, f, indent=4, sort_keys=False, default=str) + logger.info(f'{args.n_workers} workers ran for {time_taken:.2f} seconds in total') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Run an agent on a set of competitions in a Docker container.' 
+ ) + parser.add_argument( + '--agent-id', + help='Agent ID of the agent to run.', + type=str, + ) + parser.add_argument( + '--competition-set', + type=str, + required=True, + help='Path to a text file with a single competition ID on each line', + ) + parser.add_argument( + '--n-workers', + type=int, + required=False, + default=1, + help='Number of workers to run in parallel', + ) + parser.add_argument( + '--n-seeds', + type=int, + required=False, + default=1, + help='Number of seeds to run for each competition', + ) + parser.add_argument( + '--container-config', + help='Path to a JSON file with an environment configuration; these args will be passed to `docker.from_env().containers.create`', + type=str, + required=False, + default=DEFAULT_CONTAINER_CONFIG_PATH, + ) + parser.add_argument( + '--retain', + help='Whether to retain the container after the run instead of removing it.', + action='store_true', + required=False, + default=False, + ) + parser.add_argument( + '--run-dir', + help='Path to the directory where all assets associated with the run are stored.', + type=str, + required=False, + default=None, + ) + parser.add_argument( + '--data-dir', + help='Path to the directory containing the competition data.', + type=str, + required=False, + default=registry.get_data_dir(), + ) + args = parser.parse_args() + logger = get_logger(__name__) + + asyncio.run(main(args)) From e2bf1f7858f9e8deaab4d13b86fa1ed86410234b Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 19 Nov 2024 14:31:14 -0700 Subject: [PATCH 2/4] builds, fails to build runtime --- evaluation/mle_bench/README.md | 20 +++++++- evaluation/mle_bench/agents/.shared_env | 4 ++ evaluation/mle_bench/agents/dummy/Dockerfile | 29 ----------- evaluation/mle_bench/agents/dummy/config.yaml | 3 -- evaluation/mle_bench/agents/dummy/main.py | 34 ------------- evaluation/mle_bench/agents/dummy/start.sh | 21 -------- .../{opendevin => openhands}/Dockerfile | 13 +++-- .../agents/{opendevin => openhands}/build.sh | 4 ++ .../{opendevin => openhands}/config.yaml | 6 +-- .../{opendevin => openhands}/entrypoint.sh | 5 +- .../agents/{opendevin => openhands}/setup.py | 0 .../agents/{opendevin => openhands}/start.py | 50 +++++++++++------- .../agents/{opendevin => openhands}/start.sh | 5 -- .../{opendevin => openhands}/templates.py | 2 - .../agents/{opendevin => openhands}/utils.py | 0 evaluation/mle_bench/agents/run.py | 24 ++++++++- .../config/container_configs/default.json | 3 +- evaluation/mle_bench/run_infer.py | 51 +++++++++++++++++-- 18 files changed, 143 insertions(+), 131 deletions(-) create mode 100644 evaluation/mle_bench/agents/.shared_env delete mode 100644 evaluation/mle_bench/agents/dummy/Dockerfile delete mode 100644 evaluation/mle_bench/agents/dummy/config.yaml delete mode 100644 evaluation/mle_bench/agents/dummy/main.py delete mode 100644 evaluation/mle_bench/agents/dummy/start.sh rename evaluation/mle_bench/agents/{opendevin => openhands}/Dockerfile (86%) rename evaluation/mle_bench/agents/{opendevin => openhands}/build.sh (53%) rename evaluation/mle_bench/agents/{opendevin => openhands}/config.yaml (75%) rename evaluation/mle_bench/agents/{opendevin => openhands}/entrypoint.sh (93%) rename evaluation/mle_bench/agents/{opendevin => openhands}/setup.py (100%) rename evaluation/mle_bench/agents/{opendevin => openhands}/start.py (80%) rename evaluation/mle_bench/agents/{opendevin => openhands}/start.sh (71%) rename evaluation/mle_bench/agents/{opendevin => openhands}/templates.py (97%) rename evaluation/mle_bench/agents/{opendevin 
=> openhands}/utils.py (100%) diff --git a/evaluation/mle_bench/README.md b/evaluation/mle_bench/README.md index 4d5a83dbc0c9..abd159c6d734 100644 --- a/evaluation/mle_bench/README.md +++ b/evaluation/mle_bench/README.md @@ -1,4 +1,20 @@ # mle-bench -1. `poetry add git+https://org-14957082@github.com:openai/mle-bench` -2. Install your Kaggle API token at `~/.kaggle/kaggle.json`. +## Setup + +### Kaggle API token + +Install your Kaggle API token at `~/.kaggle/kaggle.json`. + +### MLE-bench + +`poetry add git+https://org-14957082@github.com:openai/mle-bench` + +### Build images + +```bash +docker build --platform=linux/amd64 \ +--build-arg PYTHON_VERSION=3.12 \ +--build-arg INSTALL_HEAVY_DEPENDENCIES=false \ +-t mlebench-env -f environment/Dockerfile . +``` diff --git a/evaluation/mle_bench/agents/.shared_env b/evaluation/mle_bench/agents/.shared_env new file mode 100644 index 000000000000..66ac96cd52a8 --- /dev/null +++ b/evaluation/mle_bench/agents/.shared_env @@ -0,0 +1,4 @@ +SUBMISSION_DIR=/home/submission +LOGS_DIR=/home/logs +CODE_DIR=/home/code +AGENT_DIR=/home/agent diff --git a/evaluation/mle_bench/agents/dummy/Dockerfile b/evaluation/mle_bench/agents/dummy/Dockerfile deleted file mode 100644 index 4a825206bda1..000000000000 --- a/evaluation/mle_bench/agents/dummy/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -FROM mlebench-env - -# where to put submission.csv, will be extracted -ARG SUBMISSION_DIR -ENV SUBMISSION_DIR=${SUBMISSION_DIR} -# where to put any logs, will be extracted -ARG LOGS_DIR -ENV LOGS_DIR=${LOGS_DIR} -# where to put any code, will be extracted -ARG CODE_DIR -ENV CODE_DIR=${CODE_DIR} -# where to put any other agent-specific files, will not be necessarily extracted -ARG AGENT_DIR -ENV AGENT_DIR=${AGENT_DIR} - -RUN mkdir -p ${LOGS_DIR} ${CODE_DIR} ${AGENT_DIR} - -ARG CONDA_ENV_NAME=agent -ARG REQUIREMENTS=${AGENT_DIR}/requirements.txt - -# copy just the requirements file, so that we can cache conda separately from the agent files -COPY requirements.txt ${AGENT_DIR}/requirements.txt - -# create conda environment and install the requirements to it -RUN conda run -n ${CONDA_ENV_NAME} pip install -r ${AGENT_DIR}/requirements.txt && \ - conda clean -afy - -# put all the agent files in the expected location -COPY . ${AGENT_DIR} diff --git a/evaluation/mle_bench/agents/dummy/config.yaml b/evaluation/mle_bench/agents/dummy/config.yaml deleted file mode 100644 index f5ac0a2d01c3..000000000000 --- a/evaluation/mle_bench/agents/dummy/config.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dummy: - start: dummy/start.sh - dockerfile: dummy/Dockerfile diff --git a/evaluation/mle_bench/agents/dummy/main.py b/evaluation/mle_bench/agents/dummy/main.py deleted file mode 100644 index 9584c42e2155..000000000000 --- a/evaluation/mle_bench/agents/dummy/main.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -A dummy agent which copies the `sample_submission.csv` from the workspace/data directory -and uses that as its `submission.csv`. -""" - -import getpass -import os -import shutil -import sys -from pathlib import Path - -# Get the current user's username -username = getpass.getuser() - -# Check if the current user ID is 0 (root user ID on Unix-like systems) -if os.getuid() == 0: - print(f"You are running this script as root. Your username is '{username}'.") -else: - print(f'You do not have root access. 
Your username is {username}.') - -print('The script is being run with the following python interpreter:') -print(sys.executable) - -cwd = Path(__file__).parent -workspace_data_dir = cwd.parent / 'data' - -print('Copying sample submission...') - -shutil.copy( - workspace_data_dir / 'sample_submission.csv', - cwd.parent / 'submission' / 'submission.csv', -) - -print(f"Sample submission copied to {cwd.parent / 'submission' / 'submission.csv'}.") diff --git a/evaluation/mle_bench/agents/dummy/start.sh b/evaluation/mle_bench/agents/dummy/start.sh deleted file mode 100644 index 51fa6c041fac..000000000000 --- a/evaluation/mle_bench/agents/dummy/start.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -set -x # Print commands and their arguments as they are executed - -eval "$(conda shell.bash hook)" # make conda available to the shell -conda activate agent - -python ${AGENT_DIR}/main.py - -# Check that the agent doesn't have permissions to read private dir -ls /private -# ls: cannot open directory '/private': Permission denied - -# Check that the agent does have permissions to read/write everything in /home -ls /home/data -touch $CODE_DIR/code.py -touch $LOGS_DIR/run.log -touch $AGENT_DIR/agent_was_here.txt -cat /home/instructions.txt - -# Use the environment-provided grading server to validate our submission -bash /home/validate_submission.sh /home/submission/submission.csv diff --git a/evaluation/mle_bench/agents/opendevin/Dockerfile b/evaluation/mle_bench/agents/openhands/Dockerfile similarity index 86% rename from evaluation/mle_bench/agents/opendevin/Dockerfile rename to evaluation/mle_bench/agents/openhands/Dockerfile index 55c86233ec40..ad47b84b21d7 100644 --- a/evaluation/mle_bench/agents/opendevin/Dockerfile +++ b/evaluation/mle_bench/agents/openhands/Dockerfile @@ -20,8 +20,7 @@ ARG AGENT_DIR=/home/agent ENV AGENT_DIR=${AGENT_DIR} ARG CONDA_ENV_NAME=agent -ARG PYTHON_VERSION=3.11 -ARG OD_VERSION=v2.1.0 +ARG OH_VERSION=main ENV DEBIAN_FRONTEND=noninteractive RUN mkdir -p ${LOGS_DIR} ${AGENT_DIR} ${CODE_DIR} ${SUBMISSION_DIR} && \ @@ -29,7 +28,10 @@ RUN mkdir -p ${LOGS_DIR} ${AGENT_DIR} ${CODE_DIR} ${SUBMISSION_DIR} && \ chmod 700 /tmp/get-docker.sh && \ /tmp/get-docker.sh && \ sudo usermod -aG docker nonroot && \ - git clone --branch ${OD_VERSION} --single-branch https://github.com/thesofakillers/OpenHands.git ${AGENT_DIR} + git clone --branch ${OH_VERSION} --single-branch https://github.com/All-Hands-AI/OpenHands.git ${AGENT_DIR} + +# We need fuse for docker. 
+RUN sudo apt-get install -y fuse-overlayfs WORKDIR ${AGENT_DIR} @@ -37,8 +39,9 @@ WORKDIR ${AGENT_DIR} RUN conda install -y -n ${CONDA_ENV_NAME} -c conda-forge nodejs=18.17.1 && \ conda run -n ${CONDA_ENV_NAME} pip install poetry>=1.8 && \ conda run -n ${CONDA_ENV_NAME} poetry install && \ - conda run -n ${CONDA_ENV_NAME} poetry add build && \ - conda clean -afy + conda run -n ${CONDA_ENV_NAME} poetry add build + +RUN conda clean -afy COPY setup.py start.py templates.py utils.py start.sh build.sh ${AGENT_DIR}/ COPY entrypoint.sh /agent_entrypoint.sh diff --git a/evaluation/mle_bench/agents/opendevin/build.sh b/evaluation/mle_bench/agents/openhands/build.sh similarity index 53% rename from evaluation/mle_bench/agents/opendevin/build.sh rename to evaluation/mle_bench/agents/openhands/build.sh index 93b2060fb94d..4eec5f938531 100644 --- a/evaluation/mle_bench/agents/opendevin/build.sh +++ b/evaluation/mle_bench/agents/openhands/build.sh @@ -1,5 +1,9 @@ #!/bin/bash cd /home/agent + +# The final configuration for the agent happens here, otherwise the standard entrypoint hangs while +# all the build artifacts are being recursively chmod'ed. + /opt/conda/bin/conda run -n agent --no-capture-output make build sudo -u nonroot /opt/conda/bin/conda run -n agent --no-capture-output playwright install diff --git a/evaluation/mle_bench/agents/opendevin/config.yaml b/evaluation/mle_bench/agents/openhands/config.yaml similarity index 75% rename from evaluation/mle_bench/agents/opendevin/config.yaml rename to evaluation/mle_bench/agents/openhands/config.yaml index 46ef2d1e7a1e..f64074206763 100644 --- a/evaluation/mle_bench/agents/opendevin/config.yaml +++ b/evaluation/mle_bench/agents/openhands/config.yaml @@ -1,6 +1,6 @@ -opendevin: - start: opendevin/start.sh - dockerfile: opendevin/Dockerfile +openhands: + start: openhands/start.sh + dockerfile: openhands/Dockerfile kwargs_type: argparse kwargs: agent: CodeActAgent diff --git a/evaluation/mle_bench/agents/opendevin/entrypoint.sh b/evaluation/mle_bench/agents/openhands/entrypoint.sh similarity index 93% rename from evaluation/mle_bench/agents/opendevin/entrypoint.sh rename to evaluation/mle_bench/agents/openhands/entrypoint.sh index 509a37b198db..2c58dc4dc292 100644 --- a/evaluation/mle_bench/agents/opendevin/entrypoint.sh +++ b/evaluation/mle_bench/agents/openhands/entrypoint.sh @@ -3,9 +3,12 @@ # Print commands and their arguments as they are executed set -x -# run the root entrypoint in the background +# Run the root entrypoint in the background /entrypoint.sh & +# Start the docker daemon in the background +sudo dockerd & + mkdir -p $LOGS_DIR mkdir -p $AGENT_DIR { diff --git a/evaluation/mle_bench/agents/opendevin/setup.py b/evaluation/mle_bench/agents/openhands/setup.py similarity index 100% rename from evaluation/mle_bench/agents/opendevin/setup.py rename to evaluation/mle_bench/agents/openhands/setup.py diff --git a/evaluation/mle_bench/agents/opendevin/start.py b/evaluation/mle_bench/agents/openhands/start.py similarity index 80% rename from evaluation/mle_bench/agents/opendevin/start.py rename to evaluation/mle_bench/agents/openhands/start.py index 021eb7fa11a5..bba4e12cb877 100644 --- a/evaluation/mle_bench/agents/opendevin/start.py +++ b/evaluation/mle_bench/agents/openhands/start.py @@ -1,21 +1,31 @@ +""" +Main entrypoint for the OpenHands agent. 
+""" + import argparse import asyncio import json import os -import random from pathlib import Path -from typing import Iterable - -from openhands.controller.state.state import State # type: ignore -from openhands.core.config import load_app_config # type: ignore -from openhands.core.main import create_runtime, run_controller # type: ignore -from openhands.events.action import CmdRunAction, IPythonRunCellAction # type: ignore -from openhands.events.event import Event # type: ignore -from openhands.events.observation import ( # type: ignore +from typing import Callable, Iterable + +from openhands.controller.state.state import State +from openhands.core.config import load_app_config +from openhands.core.main import create_runtime, run_controller +from openhands.events import Event, EventStreamSubscriber +from openhands.events.action import ( + Action, + CmdRunAction, + IPythonRunCellAction, + MessageAction, +) +from openhands.events.observation import ( CmdOutputObservation, IPythonRunCellObservation, ) +# pylint: disable=unspecified-encoding + parser = argparse.ArgumentParser() parser.add_argument('--max_time_in_hours', type=float, required=True) args, other_args = parser.parse_known_args() @@ -23,11 +33,15 @@ global_events = [] global_events_lock = asyncio.Lock() -CODE_DIR = Path(os.getenv('CODE_DIR')) +CODE_DIR = Path(os.getenv('CODE_DIR')) # type: ignore MAX_TIME_IN_SECONDS = args.max_time_in_hours * 60 * 60 -def fake_user_response_fn(state: State) -> str: +def fake_user_response_fn( + _state: State, + _encapsulate_solution: bool = False, + _try_parse: Callable[[Action], str] | None = None, +) -> str: return 'Please continue working on the approach you think is most promising. You should complete the task without any human input.' @@ -48,14 +62,15 @@ async def on_event(event: Event): async def run(instructions: str) -> State: config = load_app_config() - runtime = await create_runtime(config) - sid = random.randint(10_000, 100_000) - event_stream = runtime.event_stream - event_stream.subscribe(sid, on_event) - state = await asyncio.wait_for( + + runtime = create_runtime(config) + runtime.event_stream.subscribe(EventStreamSubscriber.TEST, on_event, 'mle-bench') + await runtime.connect() + + state: State | None = await asyncio.wait_for( run_controller( config=config, - task_str=instructions, + initial_user_action=MessageAction(content=instructions), runtime=runtime, exit_on_message=False, fake_user_response_fn=fake_user_response_fn, @@ -63,6 +78,7 @@ async def run(instructions: str) -> State: timeout=MAX_TIME_IN_SECONDS, ) + assert state, 'Controller produced no final state.' 
return state diff --git a/evaluation/mle_bench/agents/opendevin/start.sh b/evaluation/mle_bench/agents/openhands/start.sh similarity index 71% rename from evaluation/mle_bench/agents/opendevin/start.sh rename to evaluation/mle_bench/agents/openhands/start.sh index 58efe55e79cd..cdf54030c5fc 100644 --- a/evaluation/mle_bench/agents/opendevin/start.sh +++ b/evaluation/mle_bench/agents/openhands/start.sh @@ -4,11 +4,6 @@ set -x { - # Check that we can use the GPU in PyTorch - conda run -n agent python -c "import torch; print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'WARNING: No GPU')" - # Check that we can use the GPU in TensorFlow - conda run -n agent python -c "import tensorflow as tf; print('GPUs Available: ', tf.config.list_physical_devices('GPU'))" - # wait for agent entrypoint to finish with timeout timeout=300 # 5 minutes in seconds elapsed=0 diff --git a/evaluation/mle_bench/agents/opendevin/templates.py b/evaluation/mle_bench/agents/openhands/templates.py similarity index 97% rename from evaluation/mle_bench/agents/opendevin/templates.py rename to evaluation/mle_bench/agents/openhands/templates.py index 6708472f34f2..fc45504cb291 100644 --- a/evaluation/mle_bench/agents/opendevin/templates.py +++ b/evaluation/mle_bench/agents/openhands/templates.py @@ -8,7 +8,6 @@ workspace_mount_path_in_sandbox="${workspace_mount_path_in_sandbox}" default_agent="${agent}" max_iterations=${max_steps} -persist_sandbox=false [llm] model="${model}" @@ -19,7 +18,6 @@ user_id=1000 use_host_network=true timeout=86400 -shm_size="${shm_size}" """ ) diff --git a/evaluation/mle_bench/agents/opendevin/utils.py b/evaluation/mle_bench/agents/openhands/utils.py similarity index 100% rename from evaluation/mle_bench/agents/opendevin/utils.py rename to evaluation/mle_bench/agents/openhands/utils.py diff --git a/evaluation/mle_bench/agents/run.py b/evaluation/mle_bench/agents/run.py index befa95f4ce99..b935e649b830 100644 --- a/evaluation/mle_bench/agents/run.py +++ b/evaluation/mle_bench/agents/run.py @@ -57,7 +57,7 @@ def execute_agent(container: Container, agent: Agent, logger: logging.Logger): cmd += [f'{key}={value}' for key, value in agent.kwargs.items()] logger.info('Running agent...') - exit_code, output = container.exec_run(cmd, stream=True, user='nonroot') + _exit_code, output = container.exec_run(cmd, stream=True, user='nonroot') for chunk in output: logger.info(f"[Container] {chunk.decode('utf-8').strip()}") @@ -140,13 +140,33 @@ def run_in_container( try: time_start = time.monotonic() container.start() + print('TEST RUN>PY') exit_code, _ = container.exec_run( - 'timeout 60s sh -c "while ! curl -s http://localhost:5000/health > /dev/null; do sleep 1; done"' + 'timeout 60s sh -c "while ! curl -s http://localhost:5000/health > /dev/null; do sleep 1; done"', ) + if exit_code != 0: raise RuntimeError( 'The grading server failed to start within 60 seconds. This is likely due to an error in `entrypoint.sh`; check the logs.' 
) + + exit_code, output = container.exec_run( + ' '.join( + [ + 'docker', + 'buildx', + 'build', + '--progress=plain', + '--build-arg=OPENHANDS_RUNTIME_VERSION=0.14.0', + '--build-arg=OPENHANDS_RUNTIME_BUILD_TIME=2024-11-19T21:01:38.353125', + '--tag=ghcr.io/all-hands-ai/runtime:oh_v0.14.0_4jgqg088oqwrl1d8_943wj498fiupdhg1', + ] + ) + ) + + print(exit_code) + print(output.decode('utf-8')) + execute_agent(container, agent, logger) save_output(container, run_dir, container_config) time_end = time.monotonic() diff --git a/evaluation/mle_bench/environment/config/container_configs/default.json b/evaluation/mle_bench/environment/config/container_configs/default.json index 3200d2c01105..1955f2946ea0 100644 --- a/evaluation/mle_bench/environment/config/container_configs/default.json +++ b/evaluation/mle_bench/environment/config/container_configs/default.json @@ -1,6 +1,5 @@ { "mem_limit": null, "shm_size": "4G", - "nano_cpus": 4e9, - "runtime": "sysbox-runc" + "nano_cpus": 4e9 } diff --git a/evaluation/mle_bench/run_infer.py b/evaluation/mle_bench/run_infer.py index eaff038c0c76..46d4e3e444a9 100644 --- a/evaluation/mle_bench/run_infer.py +++ b/evaluation/mle_bench/run_infer.py @@ -5,9 +5,10 @@ import os import time import traceback +import uuid from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, Optional import docker from agents.registry import Agent @@ -16,11 +17,52 @@ from environment.defaults import DEFAULT_CONTAINER_CONFIG_PATH from mlebench.data import is_dataset_prepared from mlebench.registry import Competition, registry -from mlebench.utils import create_run_dir, get_logger, get_timestamp +from mlebench.utils import generate_run_id, get_logger, get_timestamp logger = get_logger(__name__) +def get_runs_dir(): + return Path(os.path.join(os.path.curdir, 'runs')) + + +def create_run_dir( + competition_id: Optional[str] = None, + agent_id: Optional[str] = None, + run_group: Optional[str] = None, +) -> Path: + """Creates a directory for the run.""" + + assert competition_id is None or isinstance( + competition_id, str + ), f'Expected a string or None, but got `{type(competition_id).__name__}`.' + + assert agent_id is None or isinstance( + agent_id, str + ), f'Expected a string or None, but got `{type(agent_id).__name__}`.' + + assert run_group is None or isinstance( + run_group, str + ), f'Expected a string or None, but got `{type(run_group).__name__}`.' + + run_id = str(uuid.uuid4()) + + if competition_id and agent_id: + run_id = generate_run_id(competition_id, agent_id, run_group) + + run_dir = get_runs_dir() / run_id + + if run_group: + run_dir = get_runs_dir() / run_group / run_id + + run_dir.mkdir(parents=True, exist_ok=False) + + assert isinstance(run_dir, Path), f'Expected a `Path`, but got `{type(run_dir)}`.' + assert run_dir.is_dir(), f'Expected a directory, but got `{run_dir}`.' 
+ + return run_dir + + @dataclass(frozen=True) class Task: run_id: str @@ -167,13 +209,12 @@ async def main(args): 'created_at': get_timestamp(), 'runs': tasks_outputs, } - run_group_dir = os.path.join(os.path.curdir, 'runs', run_group) + run_group_dir = get_runs_dir() / run_group if not os.path.exists(run_group_dir): os.mkdir(run_group_dir) - with open(os.path.join(run_group_dir, 'metadata.json'), 'w') as f: - logger.info(f"Writing results to {run_group_dir + '/metadata.json'}") + with open(run_group_dir / 'metadata.json', 'w') as f: json.dump(metadata, f, indent=4, sort_keys=False, default=str) logger.info(f'{args.n_workers} workers ran for {time_taken:.2f} seconds in total') From f2d1645eb7983d36661ca5ec8778b57df9198911 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Thu, 21 Nov 2024 11:37:35 -0700 Subject: [PATCH 3/4] fixing dockerfile and build scripts, configuration broken --- .../mle_bench/agents/openhands/Dockerfile | 10 ++-------- .../mle_bench/agents/openhands/build.sh | 2 +- .../mle_bench/agents/openhands/entrypoint.sh | 3 --- evaluation/mle_bench/agents/run.py | 19 +------------------ 4 files changed, 4 insertions(+), 30 deletions(-) diff --git a/evaluation/mle_bench/agents/openhands/Dockerfile b/evaluation/mle_bench/agents/openhands/Dockerfile index ad47b84b21d7..8f39f55d6e1b 100644 --- a/evaluation/mle_bench/agents/openhands/Dockerfile +++ b/evaluation/mle_bench/agents/openhands/Dockerfile @@ -30,18 +30,12 @@ RUN mkdir -p ${LOGS_DIR} ${AGENT_DIR} ${CODE_DIR} ${SUBMISSION_DIR} && \ sudo usermod -aG docker nonroot && \ git clone --branch ${OH_VERSION} --single-branch https://github.com/All-Hands-AI/OpenHands.git ${AGENT_DIR} -# We need fuse for docker. -RUN sudo apt-get install -y fuse-overlayfs - WORKDIR ${AGENT_DIR} # Assumes that the `agent` conda env already exists, which is created in the `mlebench-env` base image. -RUN conda install -y -n ${CONDA_ENV_NAME} -c conda-forge nodejs=18.17.1 && \ - conda run -n ${CONDA_ENV_NAME} pip install poetry>=1.8 && \ +RUN conda install -y -n ${CONDA_ENV_NAME} -c conda-forge nodejs=18.17.1 python=3.12 poetry=1.8 && \ conda run -n ${CONDA_ENV_NAME} poetry install && \ - conda run -n ${CONDA_ENV_NAME} poetry add build - -RUN conda clean -afy + conda clean -afy COPY setup.py start.py templates.py utils.py start.sh build.sh ${AGENT_DIR}/ COPY entrypoint.sh /agent_entrypoint.sh diff --git a/evaluation/mle_bench/agents/openhands/build.sh b/evaluation/mle_bench/agents/openhands/build.sh index 4eec5f938531..b12322d28033 100644 --- a/evaluation/mle_bench/agents/openhands/build.sh +++ b/evaluation/mle_bench/agents/openhands/build.sh @@ -6,4 +6,4 @@ cd /home/agent # all the build artifacts are being recursively chmod'ed. 
/opt/conda/bin/conda run -n agent --no-capture-output make build -sudo -u nonroot /opt/conda/bin/conda run -n agent --no-capture-output playwright install +# sudo -u nonroot /opt/conda/bin/conda run -n agent --no-capture-output playwright install diff --git a/evaluation/mle_bench/agents/openhands/entrypoint.sh b/evaluation/mle_bench/agents/openhands/entrypoint.sh index 2c58dc4dc292..773af7d71585 100644 --- a/evaluation/mle_bench/agents/openhands/entrypoint.sh +++ b/evaluation/mle_bench/agents/openhands/entrypoint.sh @@ -6,9 +6,6 @@ set -x # Run the root entrypoint in the background /entrypoint.sh & -# Start the docker daemon in the background -sudo dockerd & - mkdir -p $LOGS_DIR mkdir -p $AGENT_DIR { diff --git a/evaluation/mle_bench/agents/run.py b/evaluation/mle_bench/agents/run.py index b935e649b830..3e55feca8e28 100644 --- a/evaluation/mle_bench/agents/run.py +++ b/evaluation/mle_bench/agents/run.py @@ -121,6 +121,7 @@ def run_in_container( 'bind': f'/private/data/{competition.id}/prepared/private/', 'mode': 'ro', }, + # "/var/run/docker.sock": {"bind": "/var/run/docker.sock", "mode": "rw"}, } container = create_competition_container( @@ -140,7 +141,6 @@ def run_in_container( try: time_start = time.monotonic() container.start() - print('TEST RUN>PY') exit_code, _ = container.exec_run( 'timeout 60s sh -c "while ! curl -s http://localhost:5000/health > /dev/null; do sleep 1; done"', ) @@ -150,23 +150,6 @@ def run_in_container( 'The grading server failed to start within 60 seconds. This is likely due to an error in `entrypoint.sh`; check the logs.' ) - exit_code, output = container.exec_run( - ' '.join( - [ - 'docker', - 'buildx', - 'build', - '--progress=plain', - '--build-arg=OPENHANDS_RUNTIME_VERSION=0.14.0', - '--build-arg=OPENHANDS_RUNTIME_BUILD_TIME=2024-11-19T21:01:38.353125', - '--tag=ghcr.io/all-hands-ai/runtime:oh_v0.14.0_4jgqg088oqwrl1d8_943wj498fiupdhg1', - ] - ) - ) - - print(exit_code) - print(output.decode('utf-8')) - execute_agent(container, agent, logger) save_output(container, run_dir, container_config) time_end = time.monotonic() From 99a373d7cf1f2fdf33bee38ec63f17b9b842f1ff Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Fri, 22 Nov 2024 13:49:37 -0700 Subject: [PATCH 4/4] docker-in-docker using host socket --- evaluation/mle_bench/agents/openhands/entrypoint.sh | 2 +- evaluation/mle_bench/agents/run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation/mle_bench/agents/openhands/entrypoint.sh b/evaluation/mle_bench/agents/openhands/entrypoint.sh index 773af7d71585..23b0b5c0ba16 100644 --- a/evaluation/mle_bench/agents/openhands/entrypoint.sh +++ b/evaluation/mle_bench/agents/openhands/entrypoint.sh @@ -30,7 +30,7 @@ mkdir -p $AGENT_DIR if sudo pgrep dockerd > /dev/null; then sudo pkill dockerd fi - sudo dockerd > $LOGS_DIR/docker.log 2>&1 & + # sudo dockerd > $LOGS_DIR/docker.log 2>&1 & sleep 5 else echo "Docker not installed. Skipping Docker startup." diff --git a/evaluation/mle_bench/agents/run.py b/evaluation/mle_bench/agents/run.py index 3e55feca8e28..ae0cec15dabc 100644 --- a/evaluation/mle_bench/agents/run.py +++ b/evaluation/mle_bench/agents/run.py @@ -121,7 +121,7 @@ def run_in_container( 'bind': f'/private/data/{competition.id}/prepared/private/', 'mode': 'ro', }, - # "/var/run/docker.sock": {"bind": "/var/run/docker.sock", "mode": "rw"}, + '/var/run/docker.sock': {'bind': '/var/run/docker.sock', 'mode': 'rw'}, } container = create_competition_container(