From 33b79a7f3d664ca90fa4f396c89dd8dbc2da090b Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 16 Jan 2025 12:42:48 -0500 Subject: [PATCH 1/3] feat(eval): misc SWE-Bench improvement - use different resources for different instances --- evaluation/benchmarks/swe_bench/eval_infer.py | 105 +++++----- .../benchmarks/swe_bench/resource/mapping.py | 40 ++++ ...rinceton-nlp__SWE-bench_Verified-test.json | 1 + evaluation/benchmarks/swe_bench/run_infer.py | 25 ++- .../scripts/eval/combine_final_completions.py | 69 +++++++ .../eval/convert_oh_output_to_swe_json.py | 3 +- .../scripts/eval/update_output_with_eval.py | 191 +++++++++++------- .../benchmarks/swe_bench/scripts/run_infer.sh | 11 + 8 files changed, 317 insertions(+), 128 deletions(-) create mode 100644 evaluation/benchmarks/swe_bench/resource/mapping.py create mode 100644 evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json create mode 100644 evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py index 7beacf344408..3c222edd6b9e 100644 --- a/evaluation/benchmarks/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -1,3 +1,4 @@ +import json import os import tempfile import time @@ -11,7 +12,11 @@ ) from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec from swebench.harness.utils import load_swebench_dataset +from tqdm import tqdm +from evaluation.benchmarks.swe_bench.resource.mapping import ( + get_instance_resource_factor, +) from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image from evaluation.utils.shared import ( EvalMetadata, @@ -81,10 +86,14 @@ def get_config(instance: pd.Series) -> AppConfig: base_container_image=base_container_image, use_host_network=False, # large enough timeout, since some testcases take very long to run - timeout=1800, + timeout=600, api_key=os.environ.get('ALLHANDS_API_KEY', None), remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), remote_runtime_init_timeout=3600, + remote_runtime_resource_factor=get_instance_resource_factor( + dataset_name=metadata.dataset, + instance_id=instance['instance_id'], + ), ), # do not mount workspace workspace_base=None, @@ -151,52 +160,52 @@ def process_instance( if runtime_failure_count > 0: config.sandbox.remote_runtime_resource_factor = min( config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count), - 4, # hardcode maximum resource factor to 4 + 8, ) logger.warning( - f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' + f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' ) - runtime = create_runtime(config) - call_async_from_sync(runtime.connect) - # Get patch and save it to /tmp/patch.diff - with tempfile.TemporaryDirectory() as temp_dir: - # Patch file - patch_file_path = os.path.join(temp_dir, 'patch.diff') - with open(patch_file_path, 'w') as f: - f.write(model_patch) - runtime.copy_to(patch_file_path, '/tmp') - # Eval script - eval_script_path = os.path.join(temp_dir, 'eval.sh') - with open(eval_script_path, 'w') as f: - f.write(test_spec.eval_script) - runtime.copy_to(eval_script_path, '/tmp') - - # Set +x - action = CmdRunAction(command='chmod +x /tmp/eval.sh') - action.timeout = 600 - 
logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert obs.exit_code == 0 - - # Apply patch - exec_command = ( - 'cd /testbed && ' - "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || " - "(echo 'Failed to apply patch with git apply, trying with patch command...' && " - "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || " - "echo 'APPLY_PATCH_FAIL')))" - ) - action = CmdRunAction(command=exec_command) - action.timeout = 600 - obs = runtime.run_action(action) - assert isinstance(obs, CmdOutputObservation) - apply_patch_output = obs.content - assert isinstance(apply_patch_output, str) - instance['test_result']['apply_patch_output'] = apply_patch_output - try: + runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + # Get patch and save it to /tmp/patch.diff + with tempfile.TemporaryDirectory() as temp_dir: + # Patch file + patch_file_path = os.path.join(temp_dir, 'patch.diff') + with open(patch_file_path, 'w') as f: + f.write(model_patch) + runtime.copy_to(patch_file_path, '/tmp') + # Eval script + eval_script_path = os.path.join(temp_dir, 'eval.sh') + with open(eval_script_path, 'w') as f: + f.write(test_spec.eval_script) + runtime.copy_to(eval_script_path, '/tmp') + + # Set +x + action = CmdRunAction(command='chmod +x /tmp/eval.sh') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + + # Apply patch + exec_command = ( + 'cd /testbed && ' + "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || " + "(echo 'Failed to apply patch with git apply, trying with patch command...' && " + "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || " + "echo 'APPLY_PATCH_FAIL')))" + ) + action = CmdRunAction(command=exec_command) + action.timeout = 600 + obs = runtime.run_action(action) + assert isinstance(obs, CmdOutputObservation) + apply_patch_output = obs.content + assert isinstance(apply_patch_output, str) + instance['test_result']['apply_patch_output'] = apply_patch_output + if 'APPLY_PATCH_FAIL' in apply_patch_output: logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}') instance['test_result']['report']['failed_apply_patch'] = True @@ -212,7 +221,7 @@ def process_instance( # Run eval script in background and save output to log file log_file = '/tmp/eval_output.log' action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!') - action.timeout = 60 # Short timeout just to get the process ID + action.timeout = 300 # Short timeout just to get the process ID obs = runtime.run_action(action) if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0: @@ -235,7 +244,7 @@ def process_instance( check_action = CmdRunAction( command=f'ps -p {pid} > /dev/null; echo $?' ) - check_action.timeout = 60 + check_action.timeout = 300 check_obs = runtime.run_action(check_action) if ( isinstance(check_obs, CmdOutputObservation) @@ -352,7 +361,13 @@ def process_instance( # Load predictions assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.' 
- predictions = pd.read_json(args.input_file, lines=True) + required_fields = ['instance_id', 'model_patch', 'test_result'] + predictions = pd.DataFrame.from_records( + [ + {k: v for k, v in json.loads(line).items() if k in required_fields} + for line in tqdm(open(args.input_file), desc='Loading predictions') + ] + ) assert ( 'instance_id' in predictions.columns ), 'Input file must contain instance_id column.' diff --git a/evaluation/benchmarks/swe_bench/resource/mapping.py b/evaluation/benchmarks/swe_bench/resource/mapping.py new file mode 100644 index 000000000000..755591238116 --- /dev/null +++ b/evaluation/benchmarks/swe_bench/resource/mapping.py @@ -0,0 +1,40 @@ +"""Mapping instance_id to resource_factor. + +Different instances may have different resource requirements. +e.g., some instances may require more memory/CPU to run inference. +This file tracks the resource requirements of different instances. +""" + +import json +import os +from typing import Dict + +from openhands.core.logger import openhands_logger as logger + +CUR_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_RUNTIME_RESOURCE_FACTOR = int( + os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1) +) + +# dataset to resource mapping +_global_resource_mapping: Dict[str, Dict[str, float]] = {} + + +def get_resource_mapping(dataset_name: str) -> Dict[str, float]: + if dataset_name not in _global_resource_mapping: + file_path = os.path.join(CUR_DIR, f'{dataset_name}.json') + if not os.path.exists(file_path): + logger.warning(f'Resource mapping for {dataset_name} not found.') + return None + + with open(file_path, 'r') as f: + _global_resource_mapping[dataset_name] = json.load(f) + logger.info(f'Loaded resource mapping for {dataset_name}') + return _global_resource_mapping[dataset_name] + + +def get_instance_resource_factor(dataset_name: str, instance_id: str) -> int: + resource_mapping = get_resource_mapping(dataset_name) + if resource_mapping is None: + return DEFAULT_RUNTIME_RESOURCE_FACTOR + return int(resource_mapping.get(instance_id, DEFAULT_RUNTIME_RESOURCE_FACTOR)) diff --git a/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json b/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json new file mode 100644 index 000000000000..161ab736da08 --- /dev/null +++ b/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json @@ -0,0 +1 @@ +{"pydata__xarray-6721": 8, "pytest-dev__pytest-7236": 8, "matplotlib__matplotlib-24627": 4, "django__django-15561": 4, "django__django-15098": 4, "django__django-14771": 4, "sympy__sympy-21612": 4, "sympy__sympy-15345": 4, "psf__requests-5414": 4, "astropy__astropy-14508": 2, "django__django-11451": 2, "django__django-11477": 2, "django__django-10880": 2, "django__django-11163": 2, "django__django-11815": 2, "astropy__astropy-14369": 2, "django__django-10097": 2, "django__django-10554": 2, "django__django-12304": 2, "django__django-12325": 2, "django__django-11551": 2, "django__django-11734": 2, "django__django-13109": 2, "django__django-13089": 2, "django__django-13343": 2, "django__django-13363": 2, "django__django-13809": 2, "django__django-13810": 2, "django__django-13786": 2, "django__django-13807": 2, "django__django-14493": 2, "django__django-11820": 2, "django__django-11951": 2, "django__django-11964": 2, "astropy__astropy-14309": 2, "astropy__astropy-14365": 2, "astropy__astropy-12907": 2, "astropy__astropy-14182": 2, "django__django-15161": 2, "django__django-15128": 2, 
"django__django-14999": 2, "django__django-14915": 2, "django__django-14752": 2, "django__django-14765": 2, "django__django-14089": 2, "django__django-15252": 2, "django__django-15380": 2, "django__django-15382": 2, "django__django-15499": 2, "django__django-15467": 2, "django__django-15280": 2, "django__django-15315": 2, "django__django-15277": 2, "django__django-15268": 2, "django__django-15629": 2, "django__django-15695": 2, "django__django-15732": 2, "django__django-15863": 2, "django__django-16082": 2, "django__django-16145": 2, "django__django-16256": 2, "django__django-16429": 2, "django__django-16454": 2, "django__django-16493": 2, "matplotlib__matplotlib-13989": 2, "matplotlib__matplotlib-20488": 2, "django__django-15503": 2, "django__django-15525": 2, "django__django-15375": 2, "django__django-15278": 2, "matplotlib__matplotlib-21568": 2, "matplotlib__matplotlib-20859": 2, "matplotlib__matplotlib-20826": 2, "matplotlib__matplotlib-20676": 2, "matplotlib__matplotlib-23412": 2, "matplotlib__matplotlib-22719": 2, "matplotlib__matplotlib-23299": 2, "matplotlib__matplotlib-22865": 2, "matplotlib__matplotlib-24149": 2, "matplotlib__matplotlib-24177": 2, "matplotlib__matplotlib-24570": 2, "matplotlib__matplotlib-24637": 2, "matplotlib__matplotlib-24970": 2, "matplotlib__matplotlib-23476": 2, "matplotlib__matplotlib-24026": 2, "matplotlib__matplotlib-23314": 2, "matplotlib__matplotlib-25332": 2, "matplotlib__matplotlib-25311": 2, "matplotlib__matplotlib-25122": 2, "matplotlib__matplotlib-25479": 2, "matplotlib__matplotlib-26342": 2, "psf__requests-2317": 2, "matplotlib__matplotlib-25960": 2, "matplotlib__matplotlib-25775": 2, "pydata__xarray-4356": 2, "pydata__xarray-4075": 2, "pydata__xarray-6461": 2, "pydata__xarray-4687": 2, "pydata__xarray-6599": 2, "pylint-dev__pylint-4661": 2, "django__django-15554": 2, "django__django-15563": 2, "pytest-dev__pytest-5262": 2, "pytest-dev__pytest-10081": 2, "scikit-learn__scikit-learn-12973": 2, "scikit-learn__scikit-learn-13124": 2, "scikit-learn__scikit-learn-13779": 2, "scikit-learn__scikit-learn-14141": 2, "scikit-learn__scikit-learn-13439": 2, "scikit-learn__scikit-learn-13496": 2, "scikit-learn__scikit-learn-15100": 2, "scikit-learn__scikit-learn-25102": 2, "scikit-learn__scikit-learn-25232": 2, "scikit-learn__scikit-learn-25747": 2, "scikit-learn__scikit-learn-26323": 2, "scikit-learn__scikit-learn-9288": 2, "scikit-learn__scikit-learn-14496": 2, "scikit-learn__scikit-learn-14629": 2, "sphinx-doc__sphinx-8265": 2, "sphinx-doc__sphinx-8548": 2, "sphinx-doc__sphinx-8593": 2, "sphinx-doc__sphinx-8595": 2, "sphinx-doc__sphinx-8621": 2, "sphinx-doc__sphinx-8638": 2, "sphinx-doc__sphinx-9229": 2, "sphinx-doc__sphinx-9281": 2, "sphinx-doc__sphinx-9461": 2, "sphinx-doc__sphinx-9591": 2, "sphinx-doc__sphinx-9658": 2, "sphinx-doc__sphinx-9673": 2, "sympy__sympy-12096": 2, "sympy__sympy-12481": 2, "sphinx-doc__sphinx-10323": 2, "sphinx-doc__sphinx-7590": 2, "sympy__sympy-13877": 2, "sympy__sympy-12489": 2, "sympy__sympy-15809": 2, "sympy__sympy-14711": 2, "sympy__sympy-16597": 2, "sympy__sympy-16766": 2, "sympy__sympy-16792": 2, "sympy__sympy-15875": 2, "sympy__sympy-17655": 2, "sympy__sympy-18189": 2, "sympy__sympy-18763": 2, "sympy__sympy-19040": 2, "sympy__sympy-19495": 2, "sympy__sympy-19637": 2, "sympy__sympy-19783": 2, "sympy__sympy-17630": 2, "sympy__sympy-20428": 2, "sympy__sympy-20590": 2, "sympy__sympy-20801": 2, "sympy__sympy-21379": 2, "sympy__sympy-21847": 2, "sympy__sympy-22456": 2, "sympy__sympy-22714": 2, "sympy__sympy-22914": 2, 
"sympy__sympy-23262": 2, "sympy__sympy-23413": 2, "sympy__sympy-23534": 2, "sympy__sympy-24066": 2, "sympy__sympy-24213": 2, "sympy__sympy-24443": 2, "sympy__sympy-24562": 2, "sympy__sympy-24661": 2} diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index bf065ada9734..4b07fde45128 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -9,6 +9,9 @@ from datasets import load_dataset import openhands.agenthub +from evaluation.benchmarks.swe_bench.resource.mapping import ( + get_instance_resource_factor, +) from evaluation.utils.shared import ( EvalException, EvalMetadata, @@ -41,9 +44,10 @@ from openhands.utils.shutdown_listener import sleep_if_should_continue USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true' -USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true' +USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'true').lower() == 'true' RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true' + AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, } @@ -135,6 +139,10 @@ def get_config( remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), keep_runtime_alive=False, remote_runtime_init_timeout=3600, + remote_runtime_resource_factor=get_instance_resource_factor( + dataset_name=metadata.dataset, + instance_id=instance['instance_id'], + ), ), # do not mount workspace workspace_base=None, @@ -239,7 +247,7 @@ def initialize_runtime( assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}') action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh') - action.timeout = 3600 + action.timeout = 600 logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -351,7 +359,7 @@ def complete_runtime( action = CmdRunAction( command=f'git diff --no-color --cached {instance["base_commit"]}' ) - action.timeout = 600 + 100 * n_retries + action.timeout = max(300 + 100 * n_retries, 600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -399,7 +407,7 @@ def process_instance( 8, ) logger.warning( - f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' + f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' ) runtime = create_runtime(config) call_async_from_sync(runtime.connect) @@ -479,6 +487,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: subset = dataset[dataset[filter_column].isin(selected_ids)] logger.info(f'Retained {subset.shape[0]} tasks after filtering') return subset + skip_ids = os.environ.get('SKIP_IDS', '').split(',') + if len(skip_ids) > 0: + logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...') + return dataset[~dataset[filter_column].isin(skip_ids)] return dataset @@ -501,8 +513,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing # so we don't need to manage file uploading to OpenHands's repo dataset = load_dataset(args.dataset, split=args.split) - logger.info(f'Loaded dataset {args.dataset} 
with split {args.split}') swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id') + logger.info( + f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks' + ) llm_config = None if args.llm_config: @@ -531,6 +545,7 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + print(f'### OUTPUT FILE: {output_file} ###') instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit) if len(instances) > 0 and not isinstance( diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py b/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py new file mode 100644 index 000000000000..6fa5aeda83f7 --- /dev/null +++ b/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py @@ -0,0 +1,69 @@ +import argparse +import gzip +import json +import os +from glob import glob + +from tqdm import tqdm + +tqdm.pandas() + + +# Load trajectories for resolved instances +def load_completions(output_dir: str, instance_id: str): + glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json') + files = sorted(glob(glob_path)) # this is ascending order + # pick the last file (last turn) + try: + file_path = files[-1] + except IndexError: + # print(f'No files found for instance {instance_id}: files={files}') + return None + with open(file_path, 'r') as f: + result = json.load(f) + # create messages + messages = result['messages'] + messages.append(result['response']['choices'][0]['message']) + tools = result['kwargs']['tools'] + return { + 'messages': messages, + 'tools': tools, + } + + +parser = argparse.ArgumentParser() +parser.add_argument('jsonl_path', type=str) +args = parser.parse_args() + +output_dir = os.path.dirname(args.jsonl_path) +output_path = os.path.join(output_dir, 'output.with_completions.jsonl.gz') + +# Check if output would be different from input +needs_update = False +with open(args.jsonl_path, 'r') as f_in: + for line in tqdm(f_in, desc='Checking for changes'): + data = json.loads(line) + new_completions = load_completions(output_dir, data['instance_id']) + current_completions = data.get('raw_completions') + if current_completions != new_completions: + needs_update = True + break + +if not needs_update: + print('No updates required. Skipping file update.') + exit(0) + +if os.path.exists(output_path): + print(f'Output file already exists at {output_path}, overwriting? 
(y/n)') + if input() != 'y': + print('Exiting...') + exit(0) + +# Process line by line +with open(args.jsonl_path, 'r') as f_in, gzip.open(output_path, 'wt') as f_out: + for line in tqdm(f_in): + data = json.loads(line) + data['raw_completions'] = load_completions(output_dir, data['instance_id']) + f_out.write(json.dumps(data) + '\n') + +print(f'Saved compressed output to {output_path}') diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py index f333012f489a..69000106c6c2 100644 --- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py @@ -22,7 +22,8 @@ def convert_row_to_swebench_format(row): elif 'test_result' in row and 'git_patch' in row['test_result']: model_patch = row['test_result']['git_patch'] else: - raise ValueError(f'Row {row} does not have a git_patch') + print(f'WARNING: Row {row} does not have a git_patch') + model_patch = '' return { 'instance_id': row['instance_id'], diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py index d9c5c540f24b..f8527acd7a6c 100644 --- a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py @@ -3,7 +3,7 @@ import os from collections import defaultdict -import pandas as pd +from tqdm import tqdm parser = argparse.ArgumentParser() parser.add_argument('input_file', type=str) @@ -11,8 +11,7 @@ dirname = os.path.dirname(args.input_file) -df = pd.read_json(args.input_file, lines=True) - +# Initialize counters and data structures instance_id_to_status = defaultdict( lambda: { 'empty_generation': False, @@ -23,15 +22,7 @@ } ) - -# Apply the status to the dataframe -def apply_report(row): - instance_id = row['instance_id'] - if instance_id in instance_id_to_status: - return dict(instance_id_to_status[instance_id]) - return row.get('report', {}) - - +# Process official report if it exists swebench_official_report_json = os.path.join(dirname, 'report.json') openhands_remote_report_jsonl = args.input_file.replace( '.jsonl', '.swebench_eval.jsonl' @@ -90,113 +81,159 @@ def apply_report(row): f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' ) - df['report'] = df.apply(apply_report, axis=1) - with open(output_md_filepath, 'w') as f: f.write(output_md) elif os.path.exists(openhands_remote_report_jsonl): output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md') - df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records') - - assert len(df['instance_id'].unique()) == len( - df - ), 'There are duplicate instance ids in the original output which is not allowed' - assert len(df_eval['instance_id'].unique()) == len( - df_eval - ), 'There are duplicate instance ids in the eval report which is not allowed' - - for _, row in df_eval.iterrows(): - instance_id_to_status[row['instance_id']] = row['test_result']['report'] - df['report'] = df.apply(apply_report, axis=1) - - report_is_dict = df['report'].apply(lambda x: isinstance(x, dict)) - if not report_is_dict.all(): - print(df[~report_is_dict]) - raise ValueError(f'Report is not a dict, but a {type(row["report"])}') - - _n_instances = len(df) - _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))]) - _n_unresolved 
= _n_instances - _n_resolved - _n_empty_patch = len( - df[df['report'].apply(lambda x: x.get('empty_generation', False))] - ) - _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))]) + # First pass: Read eval report and count instances + instance_ids = set() + eval_instance_ids = set() + + # Count instances in original file + n_instances = 0 + with open(args.input_file, 'r') as f: + for line in tqdm(f, desc='Counting instances in original file'): + data = json.loads(line) + instance_ids.add(data['instance_id']) + n_instances += 1 + print(f'Total instances in original file: {n_instances}') + + # Process eval report + n_eval_instances = 0 + with open(openhands_remote_report_jsonl, 'r') as f: + for line in tqdm(f, desc='Processing eval report'): + data = json.loads(line) + instance_id = data['instance_id'] + eval_instance_ids.add(instance_id) + n_eval_instances += 1 + instance_id_to_status[instance_id] = data['test_result']['report'] + print(f'Total instances in eval report: {n_eval_instances}') + + # Verify no duplicates + assert ( + len(instance_ids) == n_instances + ), 'Duplicate instance ids found in original output' + assert ( + len(eval_instance_ids) == n_eval_instances + ), 'Duplicate instance ids found in eval report' + + # Initialize counters + stats = {'total': len(instance_ids), 'resolved': 0, 'empty_patch': 0, 'error': 0} + + # Collect instance IDs by category + resolved_ids = [] + unresolved_ids = [] + error_ids = [] + empty_patch_ids = [] + timeout_ids = [] + + # Process original file and categorize instances + with open(args.input_file, 'r') as f: + for line in f: + data = json.loads(line) + instance_id = data['instance_id'] + report = instance_id_to_status[instance_id] + + if report.get('resolved', False): + stats['resolved'] += 1 + resolved_ids.append(instance_id) + else: + unresolved_ids.append(instance_id) + + if report.get('empty_generation', False): + stats['empty_patch'] += 1 + empty_patch_ids.append(instance_id) + if report.get('error_eval', False): + stats['error'] += 1 + error_ids.append(instance_id) + if report.get('test_timeout', False): + timeout_ids.append(instance_id) + + # Generate markdown report + def _instance_id_to_log_path(instance_id): + path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log" + return os.path.relpath(path, start=dirname) + + # ... rest of markdown generation code remains the same ... 
output_md = ( '# SWE-bench Report\n' 'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n' '## Summary\n' - f'- submitted instances: {_n_instances}\n' - f'- empty patch instances: {_n_empty_patch}\n' - f'- resolved instances: {_n_resolved}\n' - f'- unresolved instances: {_n_unresolved}\n' - f'- error instances: {_n_error}\n' + f'- submitted instances: {stats["total"]}\n' + f'- empty patch instances: {stats["empty_patch"]}\n' + f'- resolved instances: {stats["resolved"]}\n' + f'- unresolved instances: {len(unresolved_ids)}\n' + f'- error instances: {stats["error"]}\n' ) - def _instance_id_to_log_path(instance_id): - path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log" - # make it relative path - path = os.path.relpath(path, start=dirname) - return path - output_md += '\n## Resolved Instances\n' # instance_id to status - for instance_id in sorted( - df[df['report'].apply(lambda x: x.get('resolved', False))][ - 'instance_id' - ].unique() - ): + for instance_id in resolved_ids: instance_id_to_status[instance_id]['resolved'] = True output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' output_md += '\n## Unresolved Instances\n' - for instance_id in sorted( - df[~df['report'].apply(lambda x: x.get('resolved', False))][ - 'instance_id' - ].unique() - ): + for instance_id in unresolved_ids: output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' output_md += '\n## Error Instances\n' - for instance_id in sorted( - df[df['report'].apply(lambda x: x.get('error_eval', False))][ - 'instance_id' - ].unique() - ): + for instance_id in error_ids: instance_id_to_status[instance_id]['error_eval'] = True output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' output_md += '\n## Empty Patch Instances\n' - for instance_id in sorted( - df[df['report'].apply(lambda x: x.get('empty_generation', False))][ - 'instance_id' - ].unique() - ): + for instance_id in empty_patch_ids: instance_id_to_status[instance_id]['empty_generation'] = True output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' output_md += '\n## Incomplete Instances\n' - for instance_id in sorted( - df[df['report'].apply(lambda x: x.get('test_timeout', False))][ - 'instance_id' - ].unique() - ): + for instance_id in timeout_ids: output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' + with open(output_md_filepath, 'w') as f: f.write(output_md) + else: print( f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.' ) exit() +# Before backup and update, check if any changes would be made +needs_update = False +with open(args.input_file, 'r') as infile: + for line in tqdm(infile, desc='Checking for changes'): + data = json.loads(line) + instance_id = data['instance_id'] + if instance_id in instance_id_to_status: + current_report = data.get('report', {}) + new_report = instance_id_to_status[instance_id] + if current_report != new_report: + needs_update = True + break + +if not needs_update: + print('No updates detected. Skipping file update.') + exit() + +# Backup and update the original file row by row if os.path.exists(args.input_file + '.bak'): conf = input('Existing backup file found. Do you want to overwrite it? 
(y/n)') if conf != 'y': exit() os.remove(args.input_file + '.bak') -# backup the original file os.rename(args.input_file, args.input_file + '.bak') -df.to_json(args.input_file, orient='records', lines=True) + +# Process and write file row by row +with open(args.input_file + '.bak', 'r') as infile, open( + args.input_file, 'w' +) as outfile: + for line in tqdm(infile, desc='Updating output file'): + data = json.loads(line) + instance_id = data['instance_id'] + if instance_id in instance_id_to_status: + data['report'] = instance_id_to_status[instance_id] + outfile.write(json.dumps(data) + '\n') diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh index b1d375152dc4..d16b548ebacd 100755 --- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -100,6 +100,10 @@ function run_eval() { # Run the command eval $COMMAND + # if exit code is not 0, exit the script + if [ $? -ne 0 ]; then + exit 1 + fi } unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push @@ -108,7 +112,14 @@ if [ -z "$N_RUNS" ]; then echo "N_RUNS not specified, use default $N_RUNS" fi +# Skip runs if the run number is in the SKIP_RUNS list +# read from env variable SKIP_RUNS as a comma separated list of run numbers +SKIP_RUNS=(${SKIP_RUNS//,/ }) for i in $(seq 1 $N_RUNS); do + if [[ " ${SKIP_RUNS[@]} " =~ " $i " ]]; then + echo "Skipping run $i" + continue + fi current_eval_note="$EVAL_NOTE-run_$i" echo "EVAL_NOTE: $current_eval_note" run_eval $current_eval_note From d84c8dc4c0d60caee2098c7db682bed8cdffe6cc Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 16 Jan 2025 12:48:12 -0500 Subject: [PATCH 2/3] remove extra --- evaluation/benchmarks/swe_bench/scripts/run_infer.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh index d16b548ebacd..73e8bd3a3e55 100755 --- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -100,10 +100,6 @@ function run_eval() { # Run the command eval $COMMAND - # if exit code is not 0, exit the script - if [ $? -ne 0 ]; then - exit 1 - fi } unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push From 9c912b62a178ae678438aa6ac778eea0f4368780 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 16 Jan 2025 18:22:56 +0000 Subject: [PATCH 3/3] Fix pr #6313: feat(eval): misc SWE-Bench improvement - use different resources for different instances --- evaluation/benchmarks/swe_bench/eval_infer.py | 13 +++++++------ evaluation/benchmarks/swe_bench/resource/mapping.py | 6 ++---- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py index f8fd662c958b..52972a920e8e 100644 --- a/evaluation/benchmarks/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -362,12 +362,13 @@ def process_instance( # Load predictions assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.' 
required_fields = ['instance_id', 'model_patch', 'test_result'] - predictions = pd.DataFrame.from_records( - [ - {k: v for k, v in json.loads(line).items() if k in required_fields} - for line in tqdm(open(args.input_file), desc='Loading predictions') - ] - ) + with open(args.input_file) as f: + predictions = pd.DataFrame.from_records( + [ + {k: v for k, v in json.loads(line).items() if k in required_fields} + for line in tqdm(f, desc='Loading predictions') + ] + ) assert ( 'instance_id' in predictions.columns ), 'Input file must contain instance_id column.' diff --git a/evaluation/benchmarks/swe_bench/resource/mapping.py b/evaluation/benchmarks/swe_bench/resource/mapping.py index 755591238116..ed2f433c262b 100644 --- a/evaluation/benchmarks/swe_bench/resource/mapping.py +++ b/evaluation/benchmarks/swe_bench/resource/mapping.py @@ -7,8 +7,6 @@ import json import os -from typing import Dict - from openhands.core.logger import openhands_logger as logger CUR_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -17,10 +15,10 @@ ) # dataset to resource mapping -_global_resource_mapping: Dict[str, Dict[str, float]] = {} +_global_resource_mapping: dict[str, dict[str, float]] = {} -def get_resource_mapping(dataset_name: str) -> Dict[str, float]: +def get_resource_mapping(dataset_name: str) -> dict[str, float]: if dataset_name not in _global_resource_mapping: file_path = os.path.join(CUR_DIR, f'{dataset_name}.json') if not os.path.exists(file_path):
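For readers of this patch, the per-instance resource mechanism is spread across three files (resource/mapping.py, the dataset JSON resource/princeton-nlp__SWE-bench_Verified-test.json, and the process_instance retry logic in run_infer.py / eval_infer.py). The following is a minimal standalone sketch of that flow, for illustration only: the resource_dir parameter and the resource_factor_for_attempt helper are introduced here and are not part of the patch (the real mapping.py resolves the JSON next to its own file, and the doubling-with-cap lives inline in process_instance); how metadata.dataset is turned into the JSON file name is also not asserted here, the file-name stem is passed directly.

import json
import os

# Default used when no per-instance mapping applies (same env var as in mapping.py).
DEFAULT_RUNTIME_RESOURCE_FACTOR = int(
    os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1)
)


def get_instance_resource_factor(
    resource_dir: str, dataset_name: str, instance_id: str
) -> int:
    """Look up the per-instance factor; fall back to the default when the
    dataset has no mapping file or the instance is not listed."""
    file_path = os.path.join(resource_dir, f'{dataset_name}.json')
    if not os.path.exists(file_path):
        return DEFAULT_RUNTIME_RESOURCE_FACTOR
    with open(file_path) as f:
        mapping = json.load(f)
    return int(mapping.get(instance_id, DEFAULT_RUNTIME_RESOURCE_FACTOR))


def resource_factor_for_attempt(base_factor: int, runtime_failure_count: int) -> int:
    """Double the base factor for every previous runtime failure, capped at 8
    (the hardcoded maximum used in this patch)."""
    if runtime_failure_count <= 0:
        return base_factor
    return min(base_factor * (2**runtime_failure_count), 8)


if __name__ == '__main__':
    base = get_instance_resource_factor(
        resource_dir='evaluation/benchmarks/swe_bench/resource',
        dataset_name='princeton-nlp__SWE-bench_Verified-test',
        instance_id='django__django-11451',  # listed with factor 2 in the shipped JSON
    )
    # e.g. with a base factor of 2, the first retry yields 4 and the second hits the cap of 8.
    print(resource_factor_for_attempt(base, runtime_failure_count=1))

The same doubling-on-retry behavior is applied in both run_infer.py and eval_infer.py, so a runtime that fails for lack of resources is automatically rescheduled with a larger factor instead of failing the whole instance.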