From bdc451393769b810a9e84df5dc8dab4e7fe10554 Mon Sep 17 00:00:00 2001
From: Xingyao Wang
Date: Fri, 15 Nov 2024 09:04:56 -0600
Subject: [PATCH] fix(swebench): handle error in eval_infer and run_infer (#5017)

---
 evaluation/swe_bench/eval_infer.py | 42 +++++++++++++++++-------------
 evaluation/swe_bench/run_infer.py  |  7 ++++-
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py
index d6c4e6ba938d..d40f984fca9c 100644
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -263,23 +263,29 @@ def process_instance(
             test_output_path = os.path.join(log_dir, 'test_output.txt')
             with open(test_output_path, 'w') as f:
                 f.write(test_output)
-
-            _report = get_eval_report(
-                test_spec=test_spec,
-                prediction={
-                    'model_patch': model_patch,
-                    'instance_id': instance_id,
-                },
-                log_path=test_output_path,
-                include_tests_status=True,
-            )
-            report = _report[instance_id]
-            logger.info(
-                f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
-            )
-            instance['test_result']['report']['resolved'] = report[
-                'resolved'
-            ]
+            try:
+                _report = get_eval_report(
+                    test_spec=test_spec,
+                    prediction={
+                        'model_patch': model_patch,
+                        'instance_id': instance_id,
+                    },
+                    log_path=test_output_path,
+                    include_tests_status=True,
+                )
+                report = _report[instance_id]
+                logger.info(
+                    f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
+                )
+                instance['test_result']['report']['resolved'] = report[
+                    'resolved'
+                ]
+            except Exception as e:
+                logger.error(
+                    f'[{instance_id}] Error when getting eval report: {e}'
+                )
+                instance['test_result']['report']['resolved'] = False
+                instance['test_result']['report']['error_eval'] = True
         else:
             logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
             instance['test_result']['report']['error_eval'] = True
@@ -355,7 +361,7 @@ def process_instance(
     if 'model_patch' not in predictions.columns:
         predictions['model_patch'] = predictions['test_result'].apply(
-            lambda x: x['git_patch']
+            lambda x: x.get('git_patch', '')
        )
     assert {'instance_id', 'model_patch'}.issubset(
         set(predictions.columns)
     )

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index a5d3db08ef46..386c0dd19238 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -534,5 +534,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
         instances[col] = instances[col].apply(lambda x: str(x))

     run_evaluation(
-        instances, metadata, output_file, args.eval_num_workers, process_instance
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
     )
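-- 
Reviewer notes (not part of the commit):

All three changes follow one defensive pattern: a failure in a single
SWE-bench instance should be recorded on that instance's report rather than
aborting the whole evaluation batch. Below is a minimal, standalone sketch of
that pattern. parse_report(), evaluate_instances(), and the demo dicts are
hypothetical stand-ins, not the harness's real API; the real code calls
get_eval_report() as shown in the first hunk.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def parse_report(instance: dict) -> bool:
    # Hypothetical stand-in for get_eval_report(); raises KeyError when the
    # agent run produced no 'git_patch', just as a malformed log would.
    return instance['test_result']['git_patch'] != ''


def evaluate_instances(instances: list[dict]) -> list[dict]:
    for instance in instances:
        report = instance.setdefault('test_result', {}).setdefault('report', {})
        try:
            # Any exception here is scoped to this one instance instead of
            # crashing the whole batch -- the same shape as the first hunk.
            report['resolved'] = parse_report(instance)
        except Exception as e:
            logger.error(f"[{instance.get('instance_id')}] eval report failed: {e}")
            report['resolved'] = False
            report['error_eval'] = True  # flag the failure and keep going
    return instances


if __name__ == '__main__':
    demo = [
        {'instance_id': 'a', 'test_result': {'git_patch': 'diff ...'}},
        {'instance_id': 'b', 'test_result': {}},  # missing patch -> flagged
    ]
    print(evaluate_instances(demo))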
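The second hunk is the same idea applied while building the predictions
DataFrame: x.get('git_patch', '') lets a run that never produced a patch
degrade to an empty (hence unresolved) patch instead of raising KeyError.
A tiny self-contained illustration; the two-row frame is made up:

import pandas as pd

# Hypothetical miniature of the predictions frame: row 'b' has no patch.
predictions = pd.DataFrame({
    'instance_id': ['a', 'b'],
    'test_result': [{'git_patch': 'diff --git ...'}, {}],
})

# x['git_patch'] would raise KeyError on row 'b'; .get() falls back to ''.
predictions['model_patch'] = predictions['test_result'].apply(
    lambda x: x.get('git_patch', '')
)
print(predictions[['instance_id', 'model_patch']])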
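Finally, run_infer.py now passes timeout_seconds=120 * 60 so a single wedged
instance cannot hang the run indefinitely. The patch does not show how
run_evaluation enforces that budget internally; the sketch below is only a
generic illustration of a per-task timeout using the standard library, and
every name in it is made up.

import concurrent.futures as cf
import time

TIMEOUT_SECONDS = 120 * 60  # same 2-hour per-instance budget as the patch


def process_one(instance_id: str) -> str:
    # Illustrative worker standing in for per-instance evaluation.
    time.sleep(0.1)
    return f'{instance_id}: done'


def run_all(instance_ids: list[str], timeout: float = TIMEOUT_SECONDS) -> dict:
    results = {}
    with cf.ProcessPoolExecutor(max_workers=4) as pool:
        futures = {pool.submit(process_one, i): i for i in instance_ids}
        for fut, instance_id in futures.items():
            try:
                # Raises TimeoutError if this instance exceeds its budget;
                # the loop records the failure and moves on. (Note: this
                # stops waiting but does not kill the stuck worker process.)
                results[instance_id] = fut.result(timeout=timeout)
            except cf.TimeoutError:
                results[instance_id] = 'error: timed out'
    return results


if __name__ == '__main__':
    print(run_all(['a', 'b', 'c'], timeout=5))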