Add benchmarks from FC4RLLM repository
AlexCuadron committed Feb 26, 2025
1 parent 013ff2d commit 4eb5a22
Showing 18 changed files with 1,665 additions and 0 deletions.
5 changes: 5 additions & 0 deletions evaluation/README.md
@@ -60,6 +60,7 @@ The OpenHands evaluation harness supports a wide variety of benchmarks across [s
- AiderBench: [`evaluation/benchmarks/aider_bench`](./benchmarks/aider_bench/)
- Commit0: [`evaluation/benchmarks/commit0_bench`](./benchmarks/commit0_bench/)
- DiscoveryBench: [`evaluation/benchmarks/discoverybench`](./benchmarks/discoverybench/)
- APPS: [`evaluation/benchmarks/apps`](./benchmarks/apps/)

### Web Browsing

@@ -76,6 +77,10 @@ The OpenHands evaluation harness supports a wide variety of benchmarks across [s
- Entity deduction Arena (EDA): [`evaluation/benchmarks/EDA`](./benchmarks/EDA)
- ProofWriter: [`evaluation/benchmarks/logic_reasoning`](./benchmarks/logic_reasoning)
- ScienceAgentBench: [`evaluation/benchmarks/scienceagentbench`](./benchmarks/scienceagentbench)
- MATH: [`evaluation/benchmarks/math`](./benchmarks/math/)
- HotpotQA: [`evaluation/benchmarks/hotpotqa`](./benchmarks/hotpotqa/)
- WikiTableQuestion: [`evaluation/benchmarks/wiki_table_question`](./benchmarks/wiki_table_question/)
- AlfWorld: [`evaluation/benchmarks/alfworld`](./benchmarks/alfworld/)

### Real World

Empty file.
45 changes: 45 additions & 0 deletions evaluation/benchmarks/apps/README.md
@@ -0,0 +1,45 @@
# APPS Benchmark Evaluation

This folder contains the evaluation harness for evaluating agents on the [APPS benchmark](https://huggingface.co/datasets/codeparrot/apps).

APPS is a benchmark for code generation that consists of 10,000 problems, ranging from introductory exercises to competition-level problems. Each problem includes a natural language description, canonical solutions, and test cases.
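
For a quick look at the raw data, the dataset can be loaded directly from the Hugging Face Hub. This is a minimal sketch (the `question` field is the one used by this harness; other field names may vary by dataset revision):

```python
# Minimal sketch: inspect the APPS test split used by this benchmark.
# Assumes the `datasets` library is installed; fields other than
# `question` are illustrative and may differ by dataset revision.
from datasets import load_dataset

dataset = load_dataset('codeparrot/apps', split='test')
print(len(dataset))               # number of problems in the test split
sample = dataset[0]
print(sample['question'][:500])   # natural language problem statement
```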

## Setup Environment and LLM Configuration

Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Start the evaluation

```bash
./evaluation/benchmarks/apps/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation]
```

- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would like to evaluate. It could also be a release tag like `0.9.0`.
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting to `CodeActAgent`.
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, the script evaluates the entire test set.
- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the given IDs (comma separated).
- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes.

The following are example commands to start the evaluation:

```bash
# Run benchmark without evaluation
./evaluation/benchmarks/apps/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10"

# Run benchmark with automatic evaluation
./evaluation/benchmarks/apps/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10" eval
```

## Summarize Results

```bash
poetry run python ./evaluation/benchmarks/apps/scripts/summarize_results.py [path_to_output_jsonl_file]
```

Full example:

```bash
poetry run python ./evaluation/benchmarks/apps/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/APPS/CodeActAgent/gpt-4o-2024-05-13@20240620_maxiter_30_N_v1.9/output.jsonl
```
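
The summarizer script itself is not shown in this diff. As a rough, hypothetical sketch of what it could compute from the output of `run_infer.py` (whose `test_result` field records `exit_code` and `test_output`, see `complete_runtime` below), one could count the instances whose test run exited with code 0:

```python
# Hypothetical sketch, not the actual summarize_results.py from this commit:
# count how many instances in an output.jsonl passed their test run
# (exit code 0), based on the test_result fields written by run_infer.py.
import json
import sys


def summarize(path: str) -> None:
    total = passed = 0
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            total += 1
            if record.get('test_result', {}).get('exit_code') == 0:
                passed += 1
    print(f'{passed}/{total} instances passed ({passed / max(total, 1):.1%})')


if __name__ == '__main__':
    summarize(sys.argv[1])
```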
Empty file.
319 changes: 319 additions & 0 deletions evaluation/benchmarks/apps/run_infer.py
@@ -0,0 +1,319 @@
import asyncio
import copy
import os
import tempfile
from typing import Any

import pandas as pd
from datasets import load_dataset

from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
get_default_sandbox_config_for_eval,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
run_evaluation,
update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
AppConfig,
get_llm_config_arg,
load_from_toml,
parse_arguments,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

# Optional environment variable: SKIP_NUM skips the first N dataset instances
# when set to a non-negative integer.
SKIP_NUM = os.environ.get('SKIP_NUM')
SKIP_NUM = (
int(SKIP_NUM) if SKIP_NUM and SKIP_NUM.isdigit() and int(SKIP_NUM) >= 0 else None
)


def get_config(
instance: pd.Series,
metadata: EvalMetadata,
) -> AppConfig:
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = 'python:3.11-bookworm'
config = AppConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
runtime=os.environ.get('RUNTIME', 'docker'),
max_iterations=metadata.max_iterations,
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
# Update llm_config to enable completions logging
llm_config = update_llm_config_for_completions_logging(
metadata.llm_config,
metadata.eval_output_dir,
str(instance.instance_id)
)
config.set_llm_config(llm_config)
agent_config = config.get_agent_config(metadata.agent_class)
agent_config.enable_prompt_extensions = False

# copy 'draft_editor' config if exists
config_copy = copy.deepcopy(config)
load_from_toml(config_copy)
if 'draft_editor' in config_copy.llms:
config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor')

return config


def initialize_runtime(
runtime: Runtime,
instance: pd.Series,
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
"""
logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n")
obs: CmdOutputObservation

# Set up workspace
action = CmdRunAction(command='mkdir -p /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
assert obs.exit_code == 0

action = CmdRunAction(command='cd /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
assert obs.exit_code == 0

# Create problem file
with tempfile.TemporaryDirectory() as tmpdir:
file_path = os.path.join(tmpdir, 'problem.txt')
with open(file_path, 'w') as f:
f.write(instance.problem)
runtime.copy_to(
file_path,
'/workspace',
)

# Create test cases file
file_path = os.path.join(tmpdir, 'test_cases.py')
with open(file_path, 'w') as f:
f.write(instance.test_cases)
runtime.copy_to(
file_path,
'/workspace',
)

logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n")


def complete_runtime(
runtime: Runtime,
instance: pd.Series,
) -> dict[str, Any]:
"""Complete the runtime for the agent.
This function is called after the agent has run.
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n")
obs: CmdOutputObservation

# Check if solution.py exists
action = CmdRunAction(command='ls -la /workspace')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})

# Run test cases
action = CmdRunAction(command='python3 /workspace/test_cases.py')
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})

exit_code = 1
if isinstance(obs, CmdOutputObservation):
exit_code = obs.exit_code

logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n")

runtime.close()

return {
'test_output': obs.content,
'exit_code': exit_code,
}


def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
config = get_config(instance, metadata)

# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
else:
logger.info(
f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
)

# =============================================
# build instruction
# =============================================

# Prepare instruction
logger.info(instance)
instruction = f"""You are given a programming problem to solve. The problem description is in the file 'problem.txt'.
Please read the problem carefully and implement a solution in Python. Save your solution in a file named 'solution.py'.
After implementing your solution, you can test it by running 'python3 test_cases.py'. This will execute your solution against a set of test cases.
IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.
"""

# =============================================
# create sandbox and run the agent
# =============================================

runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

initialize_runtime(runtime, instance=instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=MessageAction(content=instruction),
runtime=runtime,
)
)
if state is None:
raise ValueError('State should not be None.')

# =============================================
# result evaluation
# =============================================

return_val = complete_runtime(runtime, instance)
exit_code = return_val['exit_code']
test_output = return_val['test_output']

test_result = {
'exit_code': exit_code,
'test_output': test_output,
}

# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = compatibility_for_eval_history_pairs(state.history)
metrics = state.metrics.get() if state.metrics else None

# Save the output
output = EvalOutput(
instance_id=str(instance.instance_id),
instance=instance.to_dict(),
instruction=instruction,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result=test_result,
)
return output


def prepare_apps_dataset():
"""Prepare the APPS dataset for evaluation."""
# Load the APPS dataset
dataset = load_dataset('codeparrot/apps', split='test')

# Convert to pandas DataFrame
df = dataset.to_pandas()

# Add instance_id column
df['instance_id'] = df.index

# Rename columns to match expected format
df = df.rename(columns={
'question': 'problem',
'test': 'test_cases',
})

return df


if __name__ == '__main__':
args = parse_arguments()

# Prepare the APPS dataset
apps_dataset = prepare_apps_dataset()

llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
# modify_params must be False for evaluation, to ensure reproducibility and accuracy of results
llm_config.modify_params = False

if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

# Create details dictionary with agent configuration
agent_details = {
"agent_config": {
"codeact_enable_jupyter": False,
"codeact_enable_browsing": False,
"codeact_enable_llm_editor": False,
}
}

metadata = make_metadata(
llm_config,
'APPS',
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
details=agent_details,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

# Parse dataset IDs if provided
eval_ids = None
if args.eval_ids:
eval_ids = str(args.eval_ids).split(',')
logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')

instances = prepare_dataset(
apps_dataset,
output_file,
args.eval_n_limit,
eval_ids=eval_ids,
skip_num=SKIP_NUM,
)

run_evaluation(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
)