diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py index b376a70aea28..f08ee73eee8a 100755 --- a/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py @@ -6,6 +6,8 @@ from collections import Counter import pandas as pd +import random +import numpy as np from openhands.events.serialization import event_from_dict from openhands.events.utils import get_pairs_from_events @@ -18,6 +20,18 @@ ] +def get_bootstrap_accuracy_error_bars(values: float | int | bool, num_samples: int = 1000, p_value=0.05) -> tuple[float, float]: + sorted_vals = np.sort( + [ + np.mean(random.sample(values, len(values) // 2)) + for _ in range(num_samples) + ] + ) + bottom_idx = int(num_samples * p_value / 2) + top_idx = int(num_samples * (1.0 - p_value / 2)) + return (sorted_vals[bottom_idx], sorted_vals[top_idx]) + + def process_file(file_path): with open(file_path, 'r') as file: lines = file.readlines() @@ -26,6 +40,7 @@ def process_file(file_path): num_error_lines = 0 num_agent_stuck_in_loop = 0 num_resolved = 0 + resolved_arr = [] num_empty_patch = 0 num_unfinished_runs = 0 error_counter = Counter() @@ -74,6 +89,9 @@ def process_file(file_path): resolved = report.get('resolved', False) if resolved: num_resolved += 1 + resolved_arr.append(1) + else: + resolved_arr.append(0) # Error error = _d.get('error', None) @@ -100,6 +118,7 @@ def process_file(file_path): 'resolved': { 'count': num_resolved, 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0, + 'ci': tuple(x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)), }, 'empty_patches': { 'count': num_empty_patch, @@ -174,6 +193,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: ) df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) + df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci']) df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) @@ -242,7 +262,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: # Print detailed results for single file print(f'\nResults for {args.input_path}:') print( - f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)" + f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])" ) print( f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"