forked from All-Hands-AI/OpenHands
Add benchmarks from FC4RLLM repository
Commit 4eb5a22 (1 parent: 013ff2d)
Showing 18 changed files with 1,665 additions and 0 deletions.
Empty file.
@@ -0,0 +1,45 @@
# APPS Benchmark Evaluation

This folder contains the evaluation harness for evaluating agents on the [APPS benchmark](https://huggingface.co/datasets/codeparrot/apps).

APPS is a code-generation benchmark of 10,000 problems, ranging from introductory programming exercises to competition-level tasks. Each problem comes with a natural language description, canonical solutions, and test cases.
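For a quick look at the data, here is a minimal sketch (not part of the shipped harness) that loads the dataset with the `datasets` library, mirroring the `load_dataset` call in `run_infer.py` below; the `question` field name comes from the Hugging Face dataset:

```python
from datasets import load_dataset

# Load the APPS test split, the same split run_infer.py evaluates on.
dataset = load_dataset('codeparrot/apps', split='test')

# Peek at one instance; 'question' holds the natural language problem statement.
print(dataset[0]['question'][:300])
```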
## Setup Environment and LLM Configuration

Please follow the instructions [here](../../README.md#setup) to set up your local development environment and LLM.

## Start the Evaluation

```bash
./evaluation/benchmarks/apps/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids] [run_evaluation]
```

- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would like to evaluate. It can also be a release tag like `0.9.0`.
- `agent`, e.g. `CodeActAgent`, is the name of the agent to benchmark; defaults to `CodeActAgent`.
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, the script evaluates the entire test set.
- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the given IDs (comma separated).
- `run_evaluation`: set to `eval` to automatically run evaluation after the benchmark completes.

Following is the basic command to start the evaluation:

```bash
# Run the benchmark without evaluation
./evaluation/benchmarks/apps/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10"

# Run the benchmark with automatic evaluation
./evaluation/benchmarks/apps/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 10 1 "1,3,10" eval
```

## Summarize Results

```bash
poetry run python ./evaluation/benchmarks/apps/scripts/summarize_results.py [path_to_output_jsonl_file]
```

Full example:

```bash
poetry run python ./evaluation/benchmarks/apps/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/APPS/CodeActAgent/gpt-4o-2024-05-13@20240620_maxiter_30_N_v1.9/output.jsonl
```
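The summarizer script itself is not shown in this excerpt; as a rough sketch of the kind of aggregation it could perform over `output.jsonl` (assuming the `test_result` format written by `run_infer.py` below, where `exit_code == 0` means the test cases passed):

```python
import json
import sys

# Sketch: count instances whose test run exited successfully (exit_code == 0).
passed = 0
total = 0
with open(sys.argv[1]) as f:
    for line in f:
        record = json.loads(line)
        total += 1
        if record.get('test_result', {}).get('exit_code') == 0:
            passed += 1

if total:
    print(f'{passed}/{total} instances passed ({passed / total:.1%})')
else:
    print('No records found.')
```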
Empty file.
@@ -0,0 +1,319 @@
import asyncio
import copy
import os
import tempfile
from typing import Any

import pandas as pd
from datasets import load_dataset

from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
    AppConfig,
    get_llm_config_arg,
    load_from_toml,
    parse_arguments,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

# Configure any environment variables
SKIP_NUM = os.environ.get('SKIP_NUM')
SKIP_NUM = (
    int(SKIP_NUM) if SKIP_NUM and SKIP_NUM.isdigit() and int(SKIP_NUM) >= 0 else None
)
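# SKIP_NUM is optional: when set to a non-negative integer it is forwarded to
# prepare_dataset(..., skip_num=SKIP_NUM) in __main__ below, skipping that many
# leading dataset instances.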
def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
) -> AppConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = 'python:3.11-bookworm'
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'docker'),
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    # Update llm_config to enable completions logging
    llm_config = update_llm_config_for_completions_logging(
        metadata.llm_config,
        metadata.eval_output_dir,
        str(instance.instance_id),
    )
    config.set_llm_config(llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False

    # copy 'draft_editor' config if exists
    config_copy = copy.deepcopy(config)
    load_from_toml(config_copy)
    if 'draft_editor' in config_copy.llms:
        config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor')

    return config
def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
    """
    logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n")
    obs: CmdOutputObservation

    # Set up workspace
    action = CmdRunAction(command='mkdir -p /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    action = CmdRunAction(command='cd /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    # Create problem file
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, 'problem.txt')
        with open(file_path, 'w') as f:
            f.write(instance.problem)
        runtime.copy_to(
            file_path,
            '/workspace',
        )

        # Create test cases file
        file_path = os.path.join(tmpdir, 'test_cases.py')
        with open(file_path, 'w') as f:
            f.write(instance.test_cases)
        runtime.copy_to(
            file_path,
            '/workspace',
        )

    logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n")
def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,
) -> dict[str, Any]:
    """Complete the runtime for the agent.

    This function is called after the agent has run.
    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.
    """
    logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n")
    obs: CmdOutputObservation

    # Check if solution.py exists
    action = CmdRunAction(command='ls -la /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    # Run test cases
    action = CmdRunAction(command='python3 /workspace/test_cases.py')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    exit_code = 1
    if isinstance(obs, CmdOutputObservation):
        exit_code = obs.exit_code

    logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n")

    runtime.close()

    return {
        'test_output': obs.content,
        'exit_code': exit_code,
    }
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> EvalOutput:
    config = get_config(instance, metadata)

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
    else:
        logger.info(
            f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
        )

    # =============================================
    # build instruction
    # =============================================

    # Prepare instruction
    logger.info(instance)
    instruction = f"""You are given a programming problem to solve. The problem description is in the file 'problem.txt'.
Please read the problem carefully and implement a solution in Python. Save your solution in a file named 'solution.py'.
After implementing your solution, you can test it by running 'python3 test_cases.py'. This will execute your solution against a set of test cases.
IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.
"""

    # =============================================
    # create sandbox and run the agent
    # =============================================

    runtime: Runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    initialize_runtime(runtime, instance=instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
        )
    )
    if state is None:
        raise ValueError('State should not be None.')

    # =============================================
    # result evaluation
    # =============================================

    return_val = complete_runtime(runtime, instance)
    exit_code = return_val['exit_code']
    test_output = return_val['test_output']

    test_result = {
        'exit_code': exit_code,
        'test_output': test_output,
    }

    # History is now available as a stream of events, rather than a list of (Action, Observation) pairs.
    # For compatibility with the existing output format, we remake the pairs here;
    # remove this when it becomes unnecessary.
    histories = compatibility_for_eval_history_pairs(state.history)
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
    output = EvalOutput(
        instance_id=str(instance.instance_id),
        instance=instance.to_dict(),
        instruction=instruction,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
        test_result=test_result,
    )
    return output
def prepare_apps_dataset():
    """Prepare the APPS dataset for evaluation."""
    # Load the APPS dataset
    dataset = load_dataset('codeparrot/apps', split='test')

    # Convert to a pandas DataFrame
    df = dataset.to_pandas()

    # Add an instance_id column
    df['instance_id'] = df.index

    # Rename columns to the names expected by initialize_runtime above
    df = df.rename(columns={
        'question': 'problem',
        'test': 'test_cases',
    })

    return df
if __name__ == '__main__':
    args = parse_arguments()

    # Prepare the APPS dataset
    apps_dataset = prepare_apps_dataset()

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)

    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    # modify_params must be False for evaluation, for reproducibility and accuracy of results
    llm_config.modify_params = False

    # Details dictionary with the agent configuration used for this benchmark
    agent_details = {
        'agent_config': {
            'codeact_enable_jupyter': False,
            'codeact_enable_browsing': False,
            'codeact_enable_llm_editor': False,
        }
    }

    metadata = make_metadata(
        llm_config,
        'APPS',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details=agent_details,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')

    # Parse dataset IDs if provided
    eval_ids = None
    if args.eval_ids:
        eval_ids = str(args.eval_ids).split(',')
        logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')

    instances = prepare_dataset(
        apps_dataset,
        output_file,
        args.eval_n_limit,
        eval_ids=eval_ids,
        skip_num=SKIP_NUM,
    )

    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
    )