Skip to content

Commit

Permalink
fix(swebench): handle error in eval_infer and run_infer (#5017)
Browse files Browse the repository at this point in the history
  • Loading branch information
xingyaoww authored Nov 15, 2024
1 parent ffc4d32 commit bdc4513
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 19 deletions.
42 changes: 24 additions & 18 deletions evaluation/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,23 +263,29 @@ def process_instance(
test_output_path = os.path.join(log_dir, 'test_output.txt')
with open(test_output_path, 'w') as f:
f.write(test_output)

_report = get_eval_report(
test_spec=test_spec,
prediction={
'model_patch': model_patch,
'instance_id': instance_id,
},
log_path=test_output_path,
include_tests_status=True,
)
report = _report[instance_id]
logger.info(
f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
)
instance['test_result']['report']['resolved'] = report[
'resolved'
]
try:
_report = get_eval_report(
test_spec=test_spec,
prediction={
'model_patch': model_patch,
'instance_id': instance_id,
},
log_path=test_output_path,
include_tests_status=True,
)
report = _report[instance_id]
logger.info(
f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
)
instance['test_result']['report']['resolved'] = report[
'resolved'
]
except Exception as e:
logger.error(
f'[{instance_id}] Error when getting eval report: {e}'
)
instance['test_result']['report']['resolved'] = False
instance['test_result']['report']['error_eval'] = True
else:
logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
instance['test_result']['report']['error_eval'] = True
Expand Down Expand Up @@ -355,7 +361,7 @@ def process_instance(

if 'model_patch' not in predictions.columns:
predictions['model_patch'] = predictions['test_result'].apply(
lambda x: x['git_patch']
lambda x: x.get('git_patch', '')
)
assert {'instance_id', 'model_patch'}.issubset(
set(predictions.columns)
Expand Down
7 changes: 6 additions & 1 deletion evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,5 +534,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
instances[col] = instances[col].apply(lambda x: str(x))

run_evaluation(
instances, metadata, output_file, args.eval_num_workers, process_instance
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
timeout_seconds=120 * 60, # 2 hour PER instance should be more than enough
)

0 comments on commit bdc4513

Please sign in to comment.