Add polyglot benchmark implementation

AlexCuadron · Feb 26, 2025 · afbf10f · afbf10f
1 parent 3a2b167
commit afbf10f
Show file tree

Hide file tree

Showing 10 changed files with 912 additions and 0 deletions.
diff --git a/evaluation/benchmarks/polyglot_benchmark/Dockerfile b/evaluation/benchmarks/polyglot_benchmark/Dockerfile
@@ -0,0 +1,63 @@
+FROM ubuntu:22.04
+
+# Avoid prompts from apt; run RUN steps under bash with pipefail so `curl ... | sh` fails when the download fails
+ENV DEBIAN_FRONTEND=noninteractive
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install common dependencies (unzip is required to extract the Gradle distribution below)
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    python3 \
+    python3-pip \
+    python3-dev \
+    python3-venv \
+    wget \
+    unzip \
+    software-properties-common \
+    apt-transport-https \
+    ca-certificates \
+    gnupg \
+    lsb-release \
+    libboost-all-dev \
+    cmake \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+RUN pip3 install --no-cache-dir pytest pytest-timeout
+
+# Install Node.js and npm (the NodeSource setup script registers the apt repository and refreshes the package index)
+RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
+    && apt-get install -y nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust (rustup installs into /root/.cargo, which is then put on PATH)
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install Go
+RUN wget https://go.dev/dl/go1.20.5.linux-amd64.tar.gz \
+    && tar -C /usr/local -xzf go1.20.5.linux-amd64.tar.gz \
+    && rm go1.20.5.linux-amd64.tar.gz
+ENV PATH="/usr/local/go/bin:${PATH}"
+
+# Install Java
+RUN apt-get update && apt-get install -y openjdk-17-jdk \
+    && rm -rf /var/lib/apt/lists/*
+ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
+
+# Install Gradle
+RUN wget https://services.gradle.org/distributions/gradle-7.6-bin.zip \
+    && mkdir /opt/gradle \
+    && unzip -d /opt/gradle gradle-7.6-bin.zip \
+    && rm gradle-7.6-bin.zip
+ENV PATH="/opt/gradle/gradle-7.6/bin:${PATH}"
+
+# Workspace directory (WORKDIR creates it if it does not already exist)
+WORKDIR /workspace
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8
+
+CMD ["/bin/bash"]
diff --git a/evaluation/benchmarks/polyglot_benchmark/README.md b/evaluation/benchmarks/polyglot_benchmark/README.md
@@ -0,0 +1,90 @@
+# Polyglot Benchmark
+
+This benchmark is based on the [Aider Polyglot Benchmark](https://github.com/Aider-AI/polyglot-benchmark), which evaluates how effectively an agent can translate natural language coding requests into executable code that passes unit tests across multiple programming languages.
+
+## Features
+
+- Supports multiple programming languages (Python, JavaScript, Rust, Go, C++, Java)
+- End-to-end evaluation of code editing capabilities
+- Automated test execution and validation
+- Parallel evaluation with multiple workers
+- Detailed metrics and logging
+
+## Setup
+
+1. Clone the polyglot-benchmark repository:
+   ```bash
+   git clone https://github.com/Aider-AI/polyglot-benchmark.git /workspace/polyglot-benchmark
+   ```
+
+2. Build the Docker image for the benchmark:
+   ```bash
+   ./evaluation/benchmarks/polyglot_benchmark/scripts/build_docker.sh
+   ```
+
+## Usage
+
+1. Make sure you have the required dependencies installed:
+   ```bash
+   pip install -e ".[dev]"
+   ```
+
+2. Run the benchmark:
+   ```bash
+   ./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh <model_config> <git-version> <agent> <eval_limit> <eval-num-workers> <eval_ids> <eval_languages>
+   ```
+
+### Command Line Arguments
+
+- `model_config`: The LLM configuration to use (e.g., `eval_gpt4_1106_preview`)
+- `git-version`: Git commit or note to append to output directory (e.g., `HEAD`)
+- `agent`: Agent class name (e.g., `CodeActAgent`)
+- `eval_limit`: Limit the number of examples to evaluate (default: `-1` for all)
+- `eval-num-workers`: Number of parallel workers (default: `1`)
+- `eval_ids`: Comma-separated list of specific test IDs to run (e.g., `"1,3,10"`); pass `""` to apply no ID filter, as in the example below
+- `eval_languages`: Comma-separated list of languages to test (e.g., `"python,javascript,rust"`)
+
+### Environment Variables
+
+You can also set the following environment variables:
+
+```bash
+export POLYGLOT_BENCHMARK_PATH="/path/to/polyglot-benchmark"  # Path to the polyglot-benchmark repository
+export USE_UNIT_TESTS="true"  # Whether to run unit tests (default: true)
+```
+
+### Example
+
+```bash
+# Run evaluation on CodeActAgent for all Python instances with 2 workers
+export POLYGLOT_BENCHMARK_PATH="/workspace/polyglot-benchmark"
+./evaluation/benchmarks/polyglot_benchmark/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent -1 2 "" "python"
+```
+
+## Summarize Results
+
+After running the benchmark, you can summarize the results:
+
+```bash
+poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py <path_to_output_jsonl_file>
+```
+
+Example:
+
+```bash
+poetry run python ./evaluation/benchmarks/polyglot_benchmark/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/PolyglotBenchmark/CodeActAgent/gpt-4-1106-preview_maxiter_30/output.jsonl
+```
+
+## Supported Languages
+
+The benchmark supports the following languages and test frameworks:
+- Python: pytest
+- JavaScript: npm test
+- Rust: cargo test
+- Go: go test
+- C++: make test
+- Java: gradle test
+
+## Docker Support
+
+The benchmark runs in a Docker container to safely execute untrusted code. The container image includes all necessary language toolchains and test frameworks.
diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py b/evaluation/benchmarks/polyglot_benchmark/helper/__init__.py
diff --git a/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py b/evaluation/benchmarks/polyglot_benchmark/helper/prompts.py
@@ -0,0 +1,28 @@
+"""Prompt templates and per-agent lookup tables used in the polyglot benchmark."""
+
+# Template text telling the agent which files it must modify and that it must
+# not create new files. `{file_list}` is a placeholder; presumably the caller
+# fills it via str.format and appends the result to the task instruction
+# (TODO confirm at the call site).
+INSTRUCTIONS_ADDENDUM = """
+I've provided the following files that need to be modified:
+{file_list}
+
+Please help me implement the necessary changes to meet the requirements.
+You should ONLY modify these files, and NOT create any new files.
+"""
+
+# Template text reporting that the tests failed and repeating which files may
+# be modified. Uses the same `{file_list}` placeholder as INSTRUCTIONS_ADDENDUM.
+TEST_FAILURES = """
+The tests failed. Please fix the issues and try again.
+Remember to only modify the following files:
+{file_list}
+"""
+
+# Dictionary mapping agent class names to their specific instruction suffixes.
+# Only 'CodeActAgent' has an entry in this module.
+INST_SUFFIXES = {
+    'CodeActAgent': (
+        'REMEMBER: All edits must be made directly in the files. Do NOT send'
+        ' the edited file as output to the user.\n'
+    )
+}
+
+# Dictionary mapping agent class names to their fake response functions.
+# The entry defined here is a placeholder: it accepts one argument, ignores it,
+# and returns None.
+FAKE_RESPONSES = {
+    'CodeActAgent': lambda _: None,  # Will be replaced with codeact_user_response from shared.py
+}