diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index d6c4e6ba938d..ea650999c2be 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -263,23 +263,29 @@ def process_instance( test_output_path = os.path.join(log_dir, 'test_output.txt') with open(test_output_path, 'w') as f: f.write(test_output) - - _report = get_eval_report( - test_spec=test_spec, - prediction={ - 'model_patch': model_patch, - 'instance_id': instance_id, - }, - log_path=test_output_path, - include_tests_status=True, - ) - report = _report[instance_id] - logger.info( - f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}" - ) - instance['test_result']['report']['resolved'] = report[ - 'resolved' - ] + try: + _report = get_eval_report( + test_spec=test_spec, + prediction={ + 'model_patch': model_patch, + 'instance_id': instance_id, + }, + log_path=test_output_path, + include_tests_status=True, + ) + report = _report[instance_id] + logger.info( + f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}" + ) + instance['test_result']['report']['resolved'] = report[ + 'resolved' + ] + except Exception as e: + logger.error( + f'[{instance_id}] Error when getting eval report: {e}' + ) + instance['test_result']['report']['resolved'] = False + instance['test_result']['report']['error_eval'] = True else: logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}') instance['test_result']['report']['error_eval'] = True @@ -355,7 +361,7 @@ def process_instance( if 'model_patch' not in predictions.columns: predictions['model_patch'] = predictions['test_result'].apply( - lambda x: x['git_patch'] + lambda x: x.get('git_patch', '') ) assert {'instance_id', 'model_patch'}.issubset( set(predictions.columns) @@ -401,7 +407,11 @@ def process_instance( fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation'] def count_report_field(row, field): - return row['test_result']['report'][field] + return ( + row['test_result']['report'][field] + if 'report' in row['test_result'] + else False + ) report = {} for field in fields: diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index a5d3db08ef46..5ae044edbc3c 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -32,7 +32,7 @@ ) from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller -from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.action import CmdRunAction, IPythonRunCellAction, MessageAction from openhands.events.observation import CmdOutputObservation, ErrorObservation from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime @@ -145,7 +145,7 @@ def get_config( platform='linux/amd64', api_key=os.environ.get('ALLHANDS_API_KEY', None), remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_remote_runtime_alive=False, + keep_runtime_alive=False, remote_runtime_init_timeout=3600, ), # do not mount workspace @@ -303,6 +303,7 @@ def initialize_runtime( def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name + n_retries: int = 5, ) -> dict[str, Any]: """Complete the runtime for the agent. 
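Note on the defensive accessors introduced in eval_infer.py above: `x.get('git_patch', '')` and the `'report' in row['test_result']` guard let the summary code tolerate instances whose evaluation died before producing a patch or a report, instead of raising KeyError. A minimal, self-contained sketch of the same pattern on hypothetical rows (the instance ids and patch strings below are made up, not taken from this patch):

```python
import pandas as pd

# Two hypothetical instances: the second run crashed before producing a patch or report.
predictions = pd.DataFrame(
    {
        'instance_id': ['instance-a', 'instance-b'],
        'test_result': [
            {'git_patch': 'diff --git a/x.py b/x.py', 'report': {'resolved': True}},
            {},  # no 'git_patch', no 'report'
        ],
    }
)

# Mirrors the guarded accessors in the patched code: missing keys fall back to '' / False.
predictions['model_patch'] = predictions['test_result'].apply(
    lambda x: x.get('git_patch', '')
)
resolved = predictions['test_result'].apply(
    lambda x: x['report']['resolved'] if 'report' in x else False
)
print(predictions['model_patch'].tolist())  # ['diff --git a/x.py b/x.py', '']
print(resolved.tolist())  # [True, False]
```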
@@ -321,55 +322,84 @@ def complete_runtime( logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}', - ) - action = CmdRunAction(command='git config --global core.pager ""') - action.timeout = 600 - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to git config --global core.pager "": {str(obs)}', - ) + if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0: + action = CmdRunAction(command='git config --global core.pager ""') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, + f'Failed to git config --global core.pager "": {str(obs)}', + ) - action = CmdRunAction(command='git add -A') - action.timeout = 600 - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to git add -A: {str(obs)}', - ) + action = CmdRunAction(command='git add -A') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, + f'Failed to git add -A: {str(obs)}', + ) - n_retries = 0 - git_patch = None - while n_retries < 5: - action = CmdRunAction( - command=f'git diff --no-color --cached {instance["base_commit"]}', - keep_prompt=False, + n_retries = 0 + git_patch = None + while n_retries < 5: + action = CmdRunAction( + command=f'git diff --no-color --cached {instance["base_commit"]}', + keep_prompt=False, + ) + action.timeout = 600 + 100 * n_retries + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + n_retries += 1 + if isinstance(obs, CmdOutputObservation): + if obs.exit_code == 0: + git_patch = obs.content.strip() + break + else: + logger.info('Failed to get git diff, retrying...') + sleep_if_should_continue(10) + elif isinstance(obs, ErrorObservation): + logger.error(f'Error occurred: {obs.content}. Retrying...') + sleep_if_should_continue(10) + else: + assert_and_raise(False, f'Unexpected observation type: {str(obs)}') + else: + logger.warning( + f'Failed to cd to /workspace/{workspace_dir_name}... 
Trying to use IPython to get git diff'
        )
-        action.timeout = 600 + 100 * n_retries
+        # Git configuration and diff using IPython
+        cell_code = f"""
+import subprocess
+
+def run_git_cmd(cmd):
+    result = subprocess.run(cmd, shell=True, capture_output=True, text=True, cwd='/workspace/{workspace_dir_name}')
+    return result.stdout, result.returncode
+
+# Configure git
+run_git_cmd('git config --global core.pager ""')
+run_git_cmd('git add -A')
+
+# Get the diff
+stdout, exit_code = run_git_cmd('git diff --no-color --cached {instance["base_commit"]}')
+git_patch = stdout.strip()
+"""
+        action = IPythonRunCellAction(code=cell_code)
+        action.timeout = 600
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        n_retries += 1
-        if isinstance(obs, CmdOutputObservation):
-            if obs.exit_code == 0:
-                git_patch = obs.content.strip()
-                break
-            else:
-                logger.info('Failed to get git diff, retrying...')
-                sleep_if_should_continue(10)
-        elif isinstance(obs, ErrorObservation):
-            logger.error(f'Error occurred: {obs.content}. Retrying...')
-            sleep_if_should_continue(10)
-        else:
-            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
+
+        # Get the git_patch from IPython's namespace
+        cell_code = 'print(git_patch)'
+        action = IPythonRunCellAction(code=cell_code)
+        action.timeout = 600
+        obs = runtime.run_action(action)
+        git_patch = obs.content.strip()

     assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
@@ -534,5 +564,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
         instances[col] = instances[col].apply(lambda x: str(x))

     run_evaluation(
-        instances, metadata, output_file, args.eval_num_workers, process_instance
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        timeout_seconds=120 * 60,  # 2 hours PER instance should be more than enough
     )
diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py
index 5d5dbbf2a3bd..d76882cb2415 100755
--- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py
+++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py
@@ -1,8 +1,14 @@
 #!/usr/bin/env python3
 import argparse
+import glob
 import json
+import os
+import random
 from collections import Counter

+import numpy as np
+import pandas as pd
+
 from openhands.events.serialization import event_from_dict
 from openhands.events.utils import get_pairs_from_events

@@ -10,25 +16,31 @@
     'Agent encountered an error while processing the last action',
     'APIError',
     'Action execution failed',
+    'litellm.Timeout: APITimeoutError',
 ]

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('output_file', type=str, help='The file to summarize')
-    args = parser.parse_args()
-    with open(args.output_file, 'r') as file:
+def get_bootstrap_accuracy_std(
+    values: list[float | int | bool], num_samples: int = 1000
+) -> float:
+    # Borrowed from https://github.com/openai/evals/blob/cdb8ce9547e68b8e5e4520b6a162294c06865c0f/evals/metrics.py#L21
+    return np.std(
+        [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
+    )
+
+
+def process_file(file_path):
+    with open(file_path, 'r') as file:
         lines = file.readlines()

     num_lines = len(lines)
     num_error_lines = 0
     num_agent_stuck_in_loop = 0
     num_resolved = 0
+    resolved_arr = []
     num_empty_patch = 0
-
+    num_unfinished_runs = 0
     error_counter = Counter()
-
     main_agent_cost = []
     editor_cost = []
     num_turns = []
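The get_bootstrap_accuracy_std helper added above estimates the uncertainty of the resolve rate by repeatedly averaging a random half of the per-instance 0/1 outcomes and taking the standard deviation of those means. A self-contained sketch of the same scheme with a made-up outcome vector (30 resolved out of 100 instances is illustrative only):

```python
import random

import numpy as np


def get_bootstrap_accuracy_std(values: list[int], num_samples: int = 1000) -> float:
    # Std of the mean over many half-sized random resamples (same idea as above).
    return np.std(
        [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
    )


resolved_arr = [1] * 30 + [0] * 70  # hypothetical: 30 resolved out of 100 instances
rate = np.mean(resolved_arr) * 100
std = get_bootstrap_accuracy_std(resolved_arr) * 100
print(f'resolve rate: {rate:.2f}% ± {std:.2f}%')
```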
@@ -36,6 +48,11 @@ for line in lines: _d = json.loads(line) + if 'metrics' not in _d or _d['metrics'] is None: + # this is a failed run + num_unfinished_runs += 1 + continue + # Cost costs = _d['metrics'].get('costs', []) _cur_main_agent_cost = 0 @@ -69,6 +86,9 @@ resolved = report.get('resolved', False) if resolved: num_resolved += 1 + resolved_arr.append(1) + else: + resolved_arr.append(0) # Error error = _d.get('error', None) @@ -89,30 +109,182 @@ num_error_lines += 1 break - # print the error counter (with percentage) - print( - f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)' - ) - print( - f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)' - ) - print( - f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)' + return { + 'file_path': file_path, + 'total_instances': num_lines, + 'resolved': { + 'count': num_resolved, + 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0, + 'std': get_bootstrap_accuracy_std(resolved_arr) * 100, + }, + 'empty_patches': { + 'count': num_empty_patch, + 'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0, + }, + 'unfinished_runs': { + 'count': num_unfinished_runs, + 'percentage': (num_unfinished_runs / num_lines * 100) + if num_lines > 0 + else 0, + }, + 'errors': { + 'total': num_error_lines, + 'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0, + 'stuck_in_loop': { + 'count': num_agent_stuck_in_loop, + 'percentage': (num_agent_stuck_in_loop / num_lines * 100) + if num_lines > 0 + else 0, + }, + 'breakdown': { + str(error): { + 'count': count, + 'percentage': (count / num_lines * 100) if num_lines > 0 else 0, + } + for error, count in error_counter.items() + }, + }, + 'statistics': { + 'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0, + 'costs': { + 'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0, + 'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0, + 'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines + if num_lines > 0 + else 0, + }, + }, + } + + +def aggregate_directory(input_path) -> pd.DataFrame: + # Process all output.jsonl files in subdirectories + pattern = os.path.join(input_path, '**/output.jsonl') + files = glob.glob(pattern, recursive=True) + print(f'Processing {len(files)} files from directory {input_path}') + + # Process each file silently and collect results + results = [] + for file_path in files: + try: + result = process_file(file_path) + results.append(result) + except Exception as e: + print(f'Error processing {file_path}: {str(e)}') + import traceback + + traceback.print_exc() + continue + + # Convert results to pandas DataFrame and sort by resolve rate + df = pd.DataFrame(results) + + # Extract directory name from file path + df['directory'] = df['file_path'].apply( + lambda x: os.path.basename(os.path.dirname(x)) ) - print( - f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)' + + df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) + df['resolve_rate_std'] = df['resolved'].apply(lambda x: x['std']) + df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) + df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) + df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) + df['error_rate'] = df['errors'].apply(lambda x: 
x['percentage']) + df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total']) + + df = df.sort_values('resolve_rate', ascending=False) + + return df + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + 'input_path', type=str, help='The file or directory to summarize' ) - assert len(num_turns) == num_lines - assert len(main_agent_cost) == num_lines - assert len(editor_cost) == num_lines - print('## Statistics') - print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}') - print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD') - print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD') - print( - f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD' + parser.add_argument( + '--output', + type=str, + help='Output JSONL file for results', + default='summary_results.jsonl', ) + args = parser.parse_args() + + if os.path.isdir(args.input_path): + df = aggregate_directory(args.input_path) + # Create the summary string + columns = [ + 'directory', + 'resolve_rate', + 'empty_patch_rate', + 'unfinished_rate', + 'error_rate', + 'avg_turns', + 'avg_cost', + 'total_instances', + ] + summary_str = df[columns].to_string( + float_format=lambda x: '{:.2f}'.format(x), + formatters={ + 'directory': lambda x: x[:90] + }, # Truncate directory names to 20 chars + index=False, + ) + + # Print to console + print('\nResults summary (sorted by resolve rate):') + print(summary_str) + + # Save to text file + txt_output = args.output.rsplit('.', 1)[0] + '.txt' + with open(txt_output, 'w') as f: + f.write('Results summary (sorted by resolve rate):\n') + f.write(summary_str) + + # Save + df.to_json(args.output, lines=True, orient='records') + df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False) + else: + # Process single file with detailed output + results = [] + try: + result = process_file(args.input_path) + results.append(result) + + # Print detailed results for single file + print(f'\nResults for {args.input_path}:') + print( + f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% ± {result['resolved']['std']:.2f}%)" + ) + print( + f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)" + ) + print( + f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)" + ) + print( + f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)" + ) + print( + f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)" + ) + print('## Statistics') + print( + f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}" + ) + print( + f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD" + ) + print( + f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD" + ) + print( + f"Avg. 
total cost per instance: {result['statistics']['costs']['total']:.2f} USD" + ) + + print('## Detailed error breakdown:') + for error, data in result['errors']['breakdown'].items(): + print(f"{error}: {data['count']} ({data['percentage']:.2f}%)") - print('## Detailed error breakdown:') - for error, count in error_counter.items(): - print(f'{error}: {count} ({count / num_lines * 100:.2f}%)') + except Exception as e: + print(f'Error processing {args.input_path}: {str(e)}') diff --git a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/swe_bench/scripts/eval/update_output_with_eval.py index 662e640ca752..ae196ccf57ac 100644 --- a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py +++ b/evaluation/swe_bench/scripts/eval/update_output_with_eval.py @@ -108,7 +108,10 @@ def apply_report(row): ), 'There are duplicate instance ids in the eval report which is not allowed' for _, row in df_eval.iterrows(): - instance_id_to_status[row['instance_id']] = row['test_result']['report'] + if 'report' in row['test_result']: + instance_id_to_status[row['instance_id']] = row['test_result']['report'] + else: + instance_id_to_status[row['instance_id']] = {} df['report'] = df.apply(apply_report, axis=1) _n_instances = len(df) diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/swe_bench/scripts/run_infer.sh index 520003635a4e..e8c1df9f3c24 100755 --- a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/swe_bench/scripts/run_infer.sh @@ -108,7 +108,13 @@ if [ -z "$N_RUNS" ]; then echo "N_RUNS not specified, use default $N_RUNS" fi +# SKIP_RUNS is a comma-separated list of run numbers to skip for i in $(seq 1 $N_RUNS); do + # skip if i in SKIP_RUNS + if [[ "$SKIP_RUNS" == *"$i"* ]]; then + continue + fi + current_eval_note="$EVAL_NOTE-run_$i" echo "EVAL_NOTE: $current_eval_note" run_eval $current_eval_note diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 847eb16bb32e..517ecc523581 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -3,9 +3,11 @@ import multiprocessing as mp import os import pathlib +import signal import subprocess import time import traceback +from contextlib import contextmanager from typing import Any, Awaitable, Callable, TextIO import pandas as pd @@ -92,6 +94,27 @@ class EvalException(Exception): pass +class EvalTimeoutException(Exception): + pass + + +@contextmanager +def timeout(seconds: int): + def timeout_handler(signum, frame): + raise EvalTimeoutException(f'Function timed out after {seconds} seconds') + + # Set up the signal handler + original_handler = signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(seconds) + + try: + yield + finally: + # Restore the original handler and disable the alarm + signal.alarm(0) + signal.signal(signal.SIGALRM, original_handler) + + def codeact_user_response( state: State, encapsulate_solution: bool = False, @@ -280,15 +303,33 @@ def _process_instance_wrapper( metadata: EvalMetadata, use_mp: bool, max_retries: int = 5, + timeout_seconds: int | None = None, ) -> EvalOutput: - """Wrap the process_instance_func to handle retries and errors. - - Retry an instance up to max_retries times if it fails (e.g., due to transient network/runtime issues). 
- """ + """Wrap the process_instance_func to handle retries and errors.""" for attempt in range(max_retries + 1): try: - result = process_instance_func(instance, metadata, use_mp) + if timeout_seconds is not None: + with timeout(timeout_seconds): + result = process_instance_func(instance, metadata, use_mp) + else: + result = process_instance_func(instance, metadata, use_mp) return result + except EvalTimeoutException as e: + error = f'Timeout after {timeout_seconds} seconds' + stacktrace = traceback.format_exc() + msg = ( + '-' * 10 + + '\n' + + f'Timeout ({timeout_seconds} seconds) in instance [{instance.instance_id}], Stopped evaluation for this instance.' + + '\n' + + '-' * 10 + ) + logger.exception(e) + return EvalOutput( + instance_id=instance.instance_id, + test_result={}, + error=error, + ) except Exception as e: error = str(e) stacktrace = traceback.format_exc() @@ -337,6 +378,7 @@ def run_evaluation( [pd.Series, EvalMetadata, bool], Awaitable[EvalOutput] ], max_retries: int = 5, # number of retries for each instance + timeout_seconds: int | None = None, ): use_multiprocessing = num_workers > 1 @@ -357,7 +399,14 @@ def run_evaluation( if use_multiprocessing: with mp.Pool(num_workers) as pool: args_iter = ( - (process_instance_func, instance, metadata, True, max_retries) + ( + process_instance_func, + instance, + metadata, + True, + max_retries, + timeout_seconds, + ) for _, instance in dataset.iterrows() ) results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter) diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py index 477b47ccdbe1..dfcb3b62c5b9 100644 --- a/openhands/core/config/llm_config.py +++ b/openhands/core/config/llm_config.py @@ -77,6 +77,7 @@ class LLMConfig: log_completions: bool = False log_completions_folder: str = os.path.join(LOG_DIR, 'completions') draft_editor: Optional['LLMConfig'] = None + nonfncall_mode_add_in_context_learning_example: bool = True def defaults_to_dict(self) -> dict: """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index da595ec4f364..057401b12c2c 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -307,6 +307,7 @@ def convert_tools_to_description(tools: list[dict]) -> str: def convert_fncall_messages_to_non_fncall_messages( messages: list[dict], tools: list[ChatCompletionToolParam], + add_in_context_learning_example: bool = True, ) -> list[dict]: """Convert function calling messages to non-function calling messages.""" messages = copy.deepcopy(messages) @@ -319,7 +320,8 @@ def convert_fncall_messages_to_non_fncall_messages( converted_messages = [] first_user_message_encountered = False for message in messages: - role, content = message['role'], message['content'] + role = message['role'] + content = message.get('content', '') if content is None: content = '' @@ -341,7 +343,7 @@ def convert_fncall_messages_to_non_fncall_messages( # 2. 
USER MESSAGES (no change) elif role == 'user': # Add in-context learning example for the first user message - if not first_user_message_encountered: + if not first_user_message_encountered and add_in_context_learning_example: first_user_message_encountered = True # Check tools if not ( @@ -430,7 +432,7 @@ def convert_fncall_messages_to_non_fncall_messages( tool_content = convert_tool_call_to_string(message['tool_calls'][0]) except FunctionCallConversionError as e: raise FunctionCallConversionError( - f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}' + f'Failed to convert tool call to string.\nCurrent tool call: {message["tool_calls"][0]}.\nRaw messages: {json.dumps(messages, indent=2)}' ) from e if isinstance(content, str): content += '\n\n' + tool_content @@ -751,6 +753,7 @@ def convert_non_fncall_messages_to_fncall_messages( def convert_from_multiple_tool_calls_to_single_tool_call_messages( messages: list[dict], + ignore_final_tool_result: bool = False, ) -> list[dict]: """Break one message with multiple tool calls into multiple messages.""" converted_messages = [] @@ -787,7 +790,7 @@ def convert_from_multiple_tool_calls_to_single_tool_call_messages( ), f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}' converted_messages.append(message) - if len(pending_tool_calls) > 0: + if not ignore_final_tool_result and len(pending_tool_calls) > 0: raise FunctionCallConversionError( f'Found pending tool calls but no tool result: {pending_tool_calls=}' ) diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index 0590945995c1..42b3d34a64a6 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -121,7 +121,9 @@ def __init__( top_p=self.config.top_p, drop_params=self.config.drop_params, ) - + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + self.init_model_info() if self.vision_is_active(): logger.debug('LLM: model has vision enabled') if self.is_caching_prompt_active(): @@ -143,16 +145,6 @@ def __init__( drop_params=self.config.drop_params, ) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - self.init_model_info() - if self.vision_is_active(): - logger.debug('LLM: model has vision enabled') - if self.is_caching_prompt_active(): - logger.debug('LLM: caching prompt enabled') - if self.is_function_calling_active(): - logger.debug('LLM: model supports function calling') - self._completion_unwrapped = self._completion @self.retry_decorator( @@ -194,7 +186,9 @@ def wrapper(*args, **kwargs): 'tools' in kwargs ), "'tools' must be in kwargs when mock_function_calling is True" messages = convert_fncall_messages_to_non_fncall_messages( - messages, kwargs['tools'] + messages, + kwargs['tools'], + add_in_context_learning_example=self.config.nonfncall_mode_add_in_context_learning_example, ) kwargs['messages'] = messages kwargs['stop'] = STOP_WORDS diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index 4191a047b1c2..db685eb98962 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -327,13 +327,13 @@ def _wait_until_alive_impl(self): assert 'runtime_id' in runtime_data assert runtime_data['runtime_id'] == self.runtime_id assert 'pod_status' in runtime_data - pod_status = runtime_data['pod_status'] + pod_status = runtime_data['pod_status'].lower() self.log('debug', f'Pod status: {pod_status}') # FIXME: We should fix it at the backend of /start 
endpoint, make sure # the pod is created before returning the response. # Retry a period of time to give the cluster time to start the pod - if pod_status == 'Ready': + if pod_status == 'ready': try: with self._send_request( 'GET', @@ -349,14 +349,14 @@ def _wait_until_alive_impl(self): ) return elif ( - pod_status == 'Not Found' - or pod_status == 'Pending' - or pod_status == 'Running' + pod_status == 'not found' + or pod_status == 'pending' + or pod_status == 'running' ): # nb: Running is not yet Ready raise RuntimeNotReadyError( f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}' ) - elif pod_status in ('Failed', 'Unknown'): + elif pod_status in ('failed', 'unknown', 'crashloopbackoff'): # clean up the runtime self.close() raise RuntimeError( diff --git a/poetry.lock b/poetry.lock index 80d6b20882c8..b95c8ec8c7af 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aenum" @@ -8571,9 +8571,9 @@ inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "s [package.source] type = "git" -url = "https://github.com/All-Hands-AI/SWE-bench.git" +url = "https://github.com/SWE-Gym/SWE-Bench-Fork.git" reference = "HEAD" -resolved_reference = "c807c112edc3dcb4fdf5ddac63b34706912d5cdb" +resolved_reference = "e440817248cf29bee71ca9ab3c3a762177ab8f7c" [[package]] name = "sympy" @@ -10211,4 +10211,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "8718ffe2ed836fca6c646c37bdad2c9c8e63ebd7ec881f420148fef5095d19e4" +content-hash = "6c58d155c879a1469ba187666f010e13ec50e3a5c85c9461f3316878d0019074" diff --git a/pyproject.toml b/pyproject.toml index 68bc044a79fc..fc2aa1304c3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,7 +132,7 @@ streamlit = "*" whatthepatch = "*" retry = "*" evaluate = "*" -swebench = { git = "https://github.com/All-Hands-AI/SWE-bench.git" } +swebench = { git = "https://github.com/SWE-Gym/SWE-Bench-Fork.git" } func_timeout = "*" sympy = "*" gdown = "*" diff --git a/scripts/eval-swebench-lite.sh b/scripts/eval-swebench-lite.sh new file mode 100755 index 000000000000..3f6d88d910d7 --- /dev/null +++ b/scripts/eval-swebench-lite.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-swebench-lite100.sh b/scripts/eval-swebench-lite100.sh new file mode 100755 index 000000000000..b2842f7f7b68 --- /dev/null +++ b/scripts/eval-swebench-lite100.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/SWE-bench_lite" +SPLIT="test100" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-swebench-verified.sh b/scripts/eval-swebench-verified.sh new file mode 100755 index 000000000000..f6909f6515c5 --- /dev/null +++ b/scripts/eval-swebench-verified.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Verified" +SPLIT="test" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-swetrain-full-rollout.sh b/scripts/eval-swetrain-full-rollout.sh new file mode 100755 index 000000000000..9441fb2f53aa --- /dev/null +++ b/scripts/eval-swetrain-full-rollout.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-swetrain-lite-rollout.sh b/scripts/eval-swetrain-lite-rollout.sh new file mode 100755 index 000000000000..1f2558839062 --- /dev/null +++ b/scripts/eval-swetrain-lite-rollout.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/swe-train-dev-v1-lite" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-eval-lite.sh b/scripts/rollout-eval-lite.sh new file mode 100755 index 000000000000..2d111b991003 --- /dev/null +++ b/scripts/rollout-eval-lite.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +N_RUNS=${3:-1} +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" + +echo "MODEL: $MODEL" +echo "NUM_WORKERS: $NUM_WORKERS" +echo "N_RUNS: $N_RUNS" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=300 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-eval-verified.sh b/scripts/rollout-eval-verified.sh new file mode 100755 index 000000000000..ecffc1ac108a --- /dev/null +++ b/scripts/rollout-eval-verified.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +N_RUNS=${3:-1} +DATASET="princeton-nlp/SWE-bench_Verified" +SPLIT="test" + +echo "MODEL: $MODEL" +echo "NUM_WORKERS: $NUM_WORKERS" +echo "N_RUNS: $N_RUNS" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=500 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh new file mode 100755 index 000000000000..ac46effca15e --- /dev/null +++ b/scripts/rollout-eval.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +DATASET="swe-train/SWE-bench_lite" +SPLIT="test100" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=300 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-swe-train-full.sh b/scripts/rollout-swe-train-full.sh new file mode 100755 index 000000000000..fcd3d0272a0b --- /dev/null +++ b/scripts/rollout-swe-train-full.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +MODEL=$1 +EXP_NAME=$2 # "train-t0" +N_RUNS=${3:-20} +export EXP_NAME=$EXP_NAME +echo "MODEL: $MODEL" +echo "EXP_NAME: $EXP_NAME" +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +EVAL_LIMIT=3000 +MAX_ITER=50 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh new file mode 100755 index 000000000000..77173ade2b30 --- /dev/null +++ b/scripts/rollout-swe-train-lite.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +MODEL=$1 +EXP_NAME=$2 # "train-t0" +export EXP_NAME=$EXP_NAME +echo "MODEL: $MODEL" +echo "EXP_NAME: $EXP_NAME" +DATASET="swe-train/swe-train-dev-v1-lite" +SPLIT="train.v2.success" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +EVAL_LIMIT=2000 +MAX_ITER=50 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS
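Usage note for the new wrapper scripts under scripts/: all of them expect ALLHANDS_API_KEY to be exported and point at the remote runtime at https://runtime.eval.all-hands.dev. An illustrative sequence (the model config name and output path below are placeholders, not values from this patch): first roll out with ./scripts/rollout-eval-lite.sh llm.my-model 64 1, where llm.my-model names an LLM config section, 64 is the worker count, and 1 the number of runs; then score the resulting output file with ./scripts/eval-swebench-lite.sh <path-to>/output.jsonl 64. The rollout-* scripts forward the model, iteration limit, worker count, dataset, and split to evaluation/swe_bench/scripts/run_infer.sh, while the eval-* scripts forward the output file, worker count, dataset, and split to evaluation/swe_bench/scripts/eval_infer_remote.sh.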