From 9ee7ee496c1775e0d20b90c47c52aba18bb37d22 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 18:01:04 +0000 Subject: [PATCH 01/40] checkout script --- scripts/rollout-eval.sh | 26 ++++++++++++++++++++++++++ scripts/rollout-swe-train.sh | 25 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100755 scripts/rollout-eval.sh create mode 100755 scripts/rollout-swe-train.sh diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh new file mode 100755 index 0000000000000..99be840eed501 --- /dev/null +++ b/scripts/rollout-eval.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL="llm.deepseek-coder-eval-t0" # change this based on your config.toml +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + + +EVAL_LIMIT=300 +MAX_ITER=30 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-swe-train.sh b/scripts/rollout-swe-train.sh new file mode 100755 index 0000000000000..7ad03b2fca9a9 --- /dev/null +++ b/scripts/rollout-swe-train.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +export EXP_NAME="train-t0" # use this to differentiate between different runs +MODEL="llm.deepseek-coder-jiayi" # change this based on your config.toml +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v1.success" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +EVAL_LIMIT=2000 +MAX_ITER=30 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS From d1b94f02ae6b7bec2c95ce2a8efa21909a61eca2 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 18:04:32 +0000 Subject: [PATCH 02/40] update script --- scripts/{rollout-swe-train.sh => rollout-swe-train-lite.sh} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename scripts/{rollout-swe-train.sh => rollout-swe-train-lite.sh} (81%) diff --git a/scripts/rollout-swe-train.sh b/scripts/rollout-swe-train-lite.sh similarity index 81% rename from scripts/rollout-swe-train.sh rename to scripts/rollout-swe-train-lite.sh index 7ad03b2fca9a9..c4b27d46233e8 100755 --- a/scripts/rollout-swe-train.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -1,9 +1,9 @@ #!/bin/bash +MODEL=$1 export EXP_NAME="train-t0" # use this to differentiate between different runs -MODEL="llm.deepseek-coder-jiayi" # change this based on your config.toml -DATASET="swe-train/swe-train-dev-v1" -SPLIT="train.v1.success" +DATASET="swe-train/swe-train-dev-v1-lite" +SPLIT="train.v2.success" N_RUNS=1 if [ -z "$ALLHANDS_API_KEY" ]; then From 16be2b5bd3bede67dd96ddcb3584fa9642e823e8 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 18:55:36 +0000 Subject: [PATCH 03/40] make sure docker image is lower case --- evaluation/swe_bench/run_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 8b8b45a463e95..39163e9594315 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -101,7 +101,7 @@ def get_instance_docker_image(instance_id: str) -> str: image_name = image_name.replace( '__', '_s_' ) # to comply with docker image naming convention - return DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + image_name + return (DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + image_name).lower() def get_config( From 5160328ee142978e45390ae67ed7509a01e91d39 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 19:52:39 +0000 Subject: [PATCH 04/40] update swebench source --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6481fe5bafa5b..04f14d4f05768 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8469,13 +8469,13 @@ tqdm = "*" unidiff = "*" [package.extras] -inference = ["anthropic", "flash-attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tenacity", "tiktoken", "torch", "transformers", "triton"] +inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tenacity", "tiktoken", "torch", "transformers", "triton"] [package.source] type = "git" -url = "https://github.com/All-Hands-AI/SWE-bench.git" +url = "https://github.com/DevRL-Agent/SWE-Bench.git" reference = "HEAD" -resolved_reference = "c807c112edc3dcb4fdf5ddac63b34706912d5cdb" +resolved_reference = "c1f15036a6ee37f0915af8ab2da2d32114739570" [[package]] name = "sympy" @@ -10109,4 +10109,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "2b268ef696ace0d8170276407dbdeb414134477839ebe4b7ecf29b1a1fe2cef3" +content-hash = 
"793d4add995bc0bc48b02fff607a1d7bbb342d809be6d8f368c9adf44ae0e5f3" diff --git a/pyproject.toml b/pyproject.toml index a758e4dcc57d9..952a1345fc5ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -126,7 +126,7 @@ streamlit = "*" whatthepatch = "*" retry = "*" evaluate = "*" -swebench = { git = "https://github.com/All-Hands-AI/SWE-bench.git" } +swebench = { git = "https://github.com/DevRL-Agent/SWE-Bench.git" } func_timeout = "*" sympy = "*" gdown = "*" From 360caa478e5a8b569cd52ea68e60902ce602ff86 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 19:53:50 +0000 Subject: [PATCH 05/40] add script to eval rollout --- scripts/eval-lite-rollout.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100755 scripts/eval-lite-rollout.sh diff --git a/scripts/eval-lite-rollout.sh b/scripts/eval-lite-rollout.sh new file mode 100755 index 0000000000000..1f25588390621 --- /dev/null +++ b/scripts/eval-lite-rollout.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/swe-train-dev-v1-lite" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT From d83ddcab4fef3580c450f2a83ec367bb3b53ff50 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 20:39:01 +0000 Subject: [PATCH 06/40] .lower() to fix swebench instance --- evaluation/swe_bench/eval_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index a214f4781ed9c..c4566c1bfd3df 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -239,7 +239,7 @@ def process_instance( # Create a directory structure that matches the expected format # NOTE: this is a hack to make the eval report format consistent # with the original SWE-Bench eval script - log_dir = os.path.join(temp_dir, 'logs', instance_id) + log_dir = os.path.join(temp_dir, 'logs', instance_id.lower()) os.makedirs(log_dir, exist_ok=True) test_output_path = os.path.join(log_dir, 'test_output.txt') with open(test_output_path, 'w') as f: From d33b8ea1604d1daed5fb87249f9db33f09d0a789 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 31 Oct 2024 01:08:46 +0000 Subject: [PATCH 07/40] add exp name to arg as well --- scripts/rollout-swe-train-lite.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh index c4b27d46233e8..517fd2b8f9a47 100755 --- a/scripts/rollout-swe-train-lite.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -1,7 +1,9 @@ #!/bin/bash MODEL=$1 -export EXP_NAME="train-t0" # use this to differentiate between different runs +EXP_NAME=$2 # "train-t0" +echo "MODEL: $MODEL" +echo "EXP_NAME: $EXP_NAME" DATASET="swe-train/swe-train-dev-v1-lite" SPLIT="train.v2.success" N_RUNS=1 From 0d8b4116b57fb23e1b6101ff4697de8c42dda08d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 31 Oct 2024 01:30:29 +0000 Subject: [PATCH 08/40] fix exp name export --- scripts/rollout-swe-train-lite.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh index 
517fd2b8f9a47..dca8e95ac9783 100755 --- a/scripts/rollout-swe-train-lite.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -2,6 +2,7 @@ MODEL=$1 EXP_NAME=$2 # "train-t0" +export EXP_NAME=$EXP_NAME echo "MODEL: $MODEL" echo "EXP_NAME: $EXP_NAME" DATASET="swe-train/swe-train-dev-v1-lite" From 31109ea6b13444ff43a48b5ea4e45456d4476c6a Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 31 Oct 2024 01:31:59 +0000 Subject: [PATCH 09/40] increase processes --- scripts/rollout-swe-train-lite.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh index dca8e95ac9783..f47d1b5aad1a6 100755 --- a/scripts/rollout-swe-train-lite.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -20,7 +20,7 @@ export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/sw EVAL_LIMIT=2000 MAX_ITER=30 -NUM_WORKERS=64 +NUM_WORKERS=128 ./evaluation/swe_bench/scripts/run_infer.sh \ $MODEL HEAD CodeActAgent \ From 5377356075464535ff54f1dd8d84faa0884af0ed Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 3 Nov 2024 21:07:29 +0000 Subject: [PATCH 10/40] update eval script --- scripts/rollout-eval.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index 99be840eed501..e96f73b3ef59f 100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -1,9 +1,9 @@ #!/bin/bash export EXP_NAME="t0" # use this to differentiate between different runs -MODEL="llm.deepseek-coder-eval-t0" # change this based on your config.toml -DATASET="princeton-nlp/SWE-bench_Lite" -SPLIT="test" +MODEL=$1 +DATASET="swe-train/SWE-bench_lite" +SPLIT="test100" N_RUNS=1 if [ -z "$ALLHANDS_API_KEY" ]; then @@ -20,7 +20,4 @@ EVAL_LIMIT=300 MAX_ITER=30 NUM_WORKERS=64 -./evaluation/swe_bench/scripts/run_infer.sh \ - $MODEL HEAD CodeActAgent \ - $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ - $DATASET $SPLIT $N_RUNS +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT From 873df45dc04393ce7dbe634c70081cff4f86a43d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 3 Nov 2024 21:53:58 +0000 Subject: [PATCH 11/40] update script --- scripts/rollout-eval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index e96f73b3ef59f..cbb7017c06bda 100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -14,7 +14,7 @@ fi export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" - +export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 MAX_ITER=30 From c2962c66a72139a074f987a696f6149a38defc75 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 3 Nov 2024 22:52:17 +0000 Subject: [PATCH 12/40] add llm converter from #4711 --- openhands/core/exceptions.py | 16 + openhands/llm/fn_call_converter.py | 481 ++++++++++++++++ tests/unit/test_llm_fncall_converter.py | 729 ++++++++++++++++++++++++ 3 files changed, 1226 insertions(+) create mode 100644 openhands/llm/fn_call_converter.py create mode 100644 tests/unit/test_llm_fncall_converter.py diff --git a/openhands/core/exceptions.py b/openhands/core/exceptions.py index c33297a0d2450..af545960e6d79 100644 --- a/openhands/core/exceptions.py +++ b/openhands/core/exceptions.py @@ -94,3 +94,19 @@ class CloudFlareBlockageError(Exception): """Exception raised when a request is blocked by 
CloudFlare.""" pass + + +class FunctionCallConversionError(Exception): + """Exception raised when FunctionCallingConverter failed to convert a non-function call message to a function call message. + This typically happens when there's a malformed message (e.g., missing tags). But not due to LLM output. + """ + + pass + + +class FunctionCallValidationError(Exception): + """Exception raised when FunctionCallingConverter failed to validate a function call message. + This typically happens when the LLM outputs unrecognized function call / parameter names / values. + """ + + pass diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py new file mode 100644 index 0000000000000..17e897a253596 --- /dev/null +++ b/openhands/llm/fn_call_converter.py @@ -0,0 +1,481 @@ +"""Convert function calling messages to non-function calling messages and vice versa. + +This will inject prompts so that models that doesn't support function calling +can still be used with function calling agents. + +We follow format from: https://docs.litellm.ai/docs/completion/function_call +""" + +import copy +import json +import re +from typing import Iterable + +from litellm import ChatCompletionToolParam + +from openhands.core.exceptions import ( + FunctionCallConversionError, + FunctionCallValidationError, +) + +# Inspired by: https://docs.together.ai/docs/llama-3-function-calling#function-calling-w-llama-31-70b +SYSTEM_PROMPT_SUFFIX_TEMPLATE = """ +You have access to the following functions: + +{description} + +If you choose to call a function ONLY reply in the following format with NO suffix: + + +value_1 + +This is the value for the second parameter +that can span +multiple lines + + + + +Reminder: +- Function calls MUST follow the specified format, start with +- Required parameters MUST be specified +- Only call one function at a time +- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after. +- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls +""" + +# Regex patterns for function call parsing +FN_REGEX_PATTERN = r']+)>\n(.*?)' +FN_PARAM_REGEX_PATTERN = r']+)>(.*?)' + +# Add new regex pattern for tool execution results +TOOL_RESULT_REGEX_PATTERN = r'EXECUTION RESULT of \[(.*?)\]:\n(.*)' + + +def convert_tool_call_to_string(tool_call: dict) -> str: + """Convert tool call to content in string format.""" + if 'function' not in tool_call: + raise FunctionCallConversionError("Tool call must contain 'function' key.") + if 'id' not in tool_call: + raise FunctionCallConversionError("Tool call must contain 'id' key.") + if 'type' not in tool_call: + raise FunctionCallConversionError("Tool call must contain 'type' key.") + if tool_call['type'] != 'function': + raise FunctionCallConversionError("Tool call type must be 'function'.") + + ret = f"\n" + try: + args = json.loads(tool_call['function']['arguments']) + except json.JSONDecodeError as e: + raise FunctionCallConversionError( + f"Failed to parse arguments as JSON. 
Arguments: {tool_call['function']['arguments']}" + ) from e + for param_name, param_value in args.items(): + is_multiline = isinstance(param_value, str) and '\n' in param_value + ret += f'' + if is_multiline: + ret += '\n' + ret += f'{param_value}' + if is_multiline: + ret += '\n' + ret += '\n' + ret += '' + return ret + + +def convert_tools_to_description(tools: list[dict]) -> str: + ret = '' + for i, tool in enumerate(tools): + assert tool['type'] == 'function' + fn = tool['function'] + if i > 0: + ret += '\n' + ret += f"---- BEGIN FUNCTION #{i+1}: {fn['name']} ----\n" + ret += f"Description: {fn['description']}\n" + if 'parameters' in fn: + ret += f"Parameters: {json.dumps(fn['parameters'], indent=2)}\n" + else: + ret += 'No parameters are required for this function.\n' + ret += f'---- END FUNCTION #{i+1} ----\n' + return ret + + +def convert_fncall_messages_to_non_fncall_messages( + messages: list[dict], + tools: list[ChatCompletionToolParam], +) -> list[dict]: + """Convert function calling messages to non-function calling messages.""" + messages = copy.deepcopy(messages) + + formatted_tools = convert_tools_to_description(tools) + system_prompt_suffix = SYSTEM_PROMPT_SUFFIX_TEMPLATE.format( + description=formatted_tools + ) + + converted_messages = [] + for message in messages: + role, content = message['role'], message['content'] + if content is None: + content = '' + + # 1. SYSTEM MESSAGES + # append system prompt suffix to content + if role == 'system': + if isinstance(content, str): + content += system_prompt_suffix + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + content[-1]['text'] += system_prompt_suffix + else: + content.append({'type': 'text', 'text': system_prompt_suffix}) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + converted_messages.append({'role': 'system', 'content': content}) + # 2. USER MESSAGES (no change) + elif role == 'user': + converted_messages.append(message) + + # 3. ASSISTANT MESSAGES + # - 3.1 no change if no function call + # - 3.2 change if function call + elif role == 'assistant': + if 'tool_calls' in message and message['tool_calls'] is not None: + if len(message['tool_calls']) != 1: + raise FunctionCallConversionError( + f'Expected exactly one tool call in the message. More than one tool call is not supported. But got {len(message["tool_calls"])} tool calls. Content: {content}' + ) + try: + tool_content = convert_tool_call_to_string(message['tool_calls'][0]) + except FunctionCallConversionError as e: + raise FunctionCallConversionError( + f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}' + ) from e + if isinstance(content, str): + content += '\n\n' + tool_content + content = content.lstrip() + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + content[-1]['text'] += '\n\n' + tool_content + content[-1]['text'] = content[-1]['text'].lstrip() + else: + content.append({'type': 'text', 'text': tool_content}) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + converted_messages.append({'role': 'assistant', 'content': content}) + # 4. 
TOOL MESSAGES (tool outputs) + elif role == 'tool': + # Convert tool result as assistant message + prefix = f'EXECUTION RESULT of [{message["name"]}]:\n' + # and omit "tool_call_id" AND "name" + if isinstance(content, str): + content = prefix + content + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + content[-1]['text'] = prefix + content[-1]['text'] + else: + content = [{'type': 'text', 'text': prefix}] + content + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + converted_messages.append({'role': 'user', 'content': content}) + else: + raise FunctionCallConversionError( + f'Unexpected role {role}. Expected system, user, assistant or tool.' + ) + return converted_messages + + +def _extract_and_validate_params( + matching_tool: dict, param_matches: Iterable[re.Match], fn_name: str +) -> dict: + params = {} + # Parse and validate parameters + required_params = set() + if 'parameters' in matching_tool and 'required' in matching_tool['parameters']: + required_params = set(matching_tool['parameters'].get('required', [])) + + allowed_params = set() + if 'parameters' in matching_tool and 'properties' in matching_tool['parameters']: + allowed_params = set(matching_tool['parameters']['properties'].keys()) + + param_name_to_type = {} + if 'parameters' in matching_tool and 'properties' in matching_tool['parameters']: + param_name_to_type = { + name: val.get('type', 'string') + for name, val in matching_tool['parameters']['properties'].items() + } + + # Collect parameters + found_params = set() + for param_match in param_matches: + param_name = param_match.group(1) + param_value = param_match.group(2).strip() + + # Validate parameter is allowed + if allowed_params and param_name not in allowed_params: + raise FunctionCallValidationError( + f"Parameter '{param_name}' is not allowed for function '{fn_name}'. " + f'Allowed parameters: {allowed_params}' + ) + + # Validate and convert parameter type + # supported: string, integer, array + if param_name in param_name_to_type: + if param_name_to_type[param_name] == 'integer': + try: + param_value = int(param_value) + except ValueError: + raise FunctionCallValidationError( + f"Parameter '{param_name}' is expected to be an integer." + ) + elif param_name_to_type[param_name] == 'array': + try: + param_value = json.loads(param_value) + except json.JSONDecodeError: + raise FunctionCallValidationError( + f"Parameter '{param_name}' is expected to be an array." + ) + else: + # string + pass + + # Enum check + if 'enum' in matching_tool['parameters']['properties'][param_name]: + if ( + param_value + not in matching_tool['parameters']['properties'][param_name]['enum'] + ): + raise FunctionCallValidationError( + f"Parameter '{param_name}' is expected to be one of {matching_tool['parameters']['properties'][param_name]['enum']}." 
+ ) + + params[param_name] = param_value + found_params.add(param_name) + + # Check all required parameters are present + missing_params = required_params - found_params + if missing_params: + raise FunctionCallValidationError( + f"Missing required parameters for function '{fn_name}': {missing_params}" + ) + return params + + +def convert_non_fncall_messages_to_fncall_messages( + messages: list[dict], + tools: list[ChatCompletionToolParam], +) -> list[dict]: + """Convert non-function calling messages back to function calling messages.""" + messages = copy.deepcopy(messages) + formatted_tools = convert_tools_to_description(tools) + system_prompt_suffix = SYSTEM_PROMPT_SUFFIX_TEMPLATE.format( + description=formatted_tools + ) + + converted_messages = [] + tool_call_counter = 1 # Counter for tool calls + + for message in messages: + role, content = message['role'], message['content'] + + # For system messages, remove the added suffix + if role == 'system': + if isinstance(content, str): + # Remove the suffix if present + content = content.split(system_prompt_suffix)[0] + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + # Remove the suffix from the last text item + content[-1]['text'] = content[-1]['text'].split( + system_prompt_suffix + )[0] + converted_messages.append({'role': 'system', 'content': content}) + # Skip user messages (no conversion needed) + elif role == 'user': + # Check for tool execution result pattern + if isinstance(content, str): + tool_result_match = re.search( + TOOL_RESULT_REGEX_PATTERN, content, re.DOTALL + ) + elif isinstance(content, list): + tool_result_match = next( + ( + _match + for item in content + if item.get('type') == 'text' + and ( + _match := re.search( + TOOL_RESULT_REGEX_PATTERN, item['text'], re.DOTALL + ) + ) + ), + None, + ) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + + if tool_result_match: + if not ( + isinstance(content, str) + or ( + isinstance(content, list) + and len(content) == 1 + and content[0].get('type') == 'text' + ) + ): + raise FunctionCallConversionError( + f'Expected str or list with one text item when tool result is present in the message. Content: {content}' + ) + tool_name = tool_result_match.group(1) + tool_result = tool_result_match.group(2).strip() + + # Convert to tool message format + converted_messages.append( + { + 'role': 'tool', + 'name': tool_name, + 'content': [{'type': 'text', 'text': tool_result}] + if isinstance(content, list) + else tool_result, + 'tool_call_id': f'toolu_{tool_call_counter-1:02d}', # Use last generated ID + } + ) + else: + converted_messages.append(message) + + # Handle assistant messages + elif role == 'assistant': + if isinstance(content, str): + fn_match = re.search(FN_REGEX_PATTERN, content, re.DOTALL) + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + fn_match = re.search( + FN_REGEX_PATTERN, content[-1]['text'], re.DOTALL + ) + else: + fn_match = None + fn_match_exists = any( + item.get('type') == 'text' + and re.search(FN_REGEX_PATTERN, item['text'], re.DOTALL) + for item in content + ) + if fn_match_exists and not fn_match: + raise FunctionCallConversionError( + f'Expecting function call in the LAST index of content list. But got content={content}' + ) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. 
Content: {content}'
+                )
+
+            if fn_match:
+                fn_name = fn_match.group(1)
+                fn_body = fn_match.group(2)
+                matching_tool = next(
+                    (
+                        tool['function']
+                        for tool in tools
+                        if tool['type'] == 'function'
+                        and tool['function']['name'] == fn_name
+                    ),
+                    None,
+                )
+                # Validate function exists in tools
+                if not matching_tool:
+                    raise FunctionCallValidationError(
+                        f"Function '{fn_name}' not found in available tools: {[tool['function']['name'] for tool in tools if tool['type'] == 'function']}"
+                    )
+
+                # Parse parameters
+                param_matches = re.finditer(FN_PARAM_REGEX_PATTERN, fn_body, re.DOTALL)
+                params = _extract_and_validate_params(
+                    matching_tool, param_matches, fn_name
+                )
+
+                # Create tool call with unique ID
+                tool_call_id = f'toolu_{tool_call_counter:02d}'
+                tool_call = {
+                    'index': 1,  # always 1 because we only support **one tool call per message**
+                    'id': tool_call_id,
+                    'type': 'function',
+                    'function': {'name': fn_name, 'arguments': json.dumps(params)},
+                }
+                tool_call_counter += 1  # Increment counter
+
+                # Remove the function call part from content
+                if isinstance(content, list):
+                    assert content and content[-1]['type'] == 'text'
+                    content[-1]['text'] = (
+                        content[-1]['text'].split('<function=')[0].strip()
+                    )
+                elif isinstance(content, str):
+                    content = content.split('<function=')[0].strip()
+                else:
+                    raise FunctionCallConversionError(
+                        f'Unexpected content type {type(content)}. Expected str or list. Content: {content}'
+                    )
+                converted_messages.append(
+                    {
+                        'role': 'assistant',
+                        'content': content,
+                        'tool_calls': [tool_call],
+                    }
+                )
+            else:
+                converted_messages.append(message)
+        else:
+            raise FunctionCallConversionError(
+                f'Unexpected role {role}. Expected system, user, assistant or tool.'
+            )
+    return converted_messages
+
+
+def convert_from_multiple_tool_calls_to_single_tool_call_messages(
+    messages: list[dict],
+) -> list[dict]:
+    """Break one message with multiple tool calls into multiple messages."""
+    converted_messages = []
+
+    pending_tool_calls: dict[str, dict] = {}
+    for message in messages:
+        role, content = message['role'], message['content']
+        if role == 'assistant':
+            if message.get('tool_calls') and len(message['tool_calls']) > 1:
+                # handle multiple tool calls by breaking them into multiple messages
+                for i, tool_call in enumerate(message['tool_calls']):
+                    pending_tool_calls[tool_call['id']] = {
+                        'role': 'assistant',
+                        'content': content if i == 0 else '',
+                        'tool_calls': [tool_call],
+                    }
+            else:
+                converted_messages.append(message)
+        elif role == 'tool':
+            if message['tool_call_id'] in pending_tool_calls:
+                # remove the tool call from the pending list
+                _tool_call_message = pending_tool_calls.pop(message['tool_call_id'])
+                converted_messages.append(_tool_call_message)
+                # add the tool result
+                converted_messages.append(message)
+            else:
+                assert (
+                    len(pending_tool_calls) == 0
+                ), f'Found pending tool calls but not found in pending list: {pending_tool_calls=}'
+                converted_messages.append(message)
+        else:
+            assert (
+                len(pending_tool_calls) == 0
+            ), f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}'
+            converted_messages.append(message)
+
+    if len(pending_tool_calls) > 0:
+        raise FunctionCallConversionError(
+            f'Found pending tool calls but no tool result: {pending_tool_calls=}'
+        )
+    return converted_messages
diff --git a/tests/unit/test_llm_fncall_converter.py b/tests/unit/test_llm_fncall_converter.py
new file mode 100644
index 0000000000000..9bb4d6b44aace
--- /dev/null
+++ b/tests/unit/test_llm_fncall_converter.py
@@ -0,0 +1,729 @@
+"""Test for FunctionCallingConverter."""
+
+import copy
+import json
+
+import pytest
+from litellm import ChatCompletionToolParam
+
+from openhands.llm.fn_call_converter import (
+    FunctionCallConversionError,
+    convert_fncall_messages_to_non_fncall_messages,
+    convert_from_multiple_tool_calls_to_single_tool_call_messages,
+    convert_non_fncall_messages_to_fncall_messages,
+    convert_tool_call_to_string,
+    convert_tools_to_description,
+)
+
+FNCALL_TOOLS: list[ChatCompletionToolParam] = [
+    {
+        'type': 'function',
+        'function': {
+            'name': 'execute_bash',
+            'description': 'Execute a bash command in the terminal.\n* 
Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.\n* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process.\n* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background.\n', + 'parameters': { + 'type': 'object', + 'properties': { + 'command': { + 'type': 'string', + 'description': 'The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.', + } + }, + 'required': ['command'], + }, + }, + }, + { + 'type': 'function', + 'function': { + 'name': 'finish', + 'description': 'Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.', + }, + }, + { + 'type': 'function', + 'function': { + 'name': 'str_replace_editor', + 'description': 'Custom editing tool for viewing, creating and editing files\n* State is persistent across command calls and discussions with the user\n* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep\n* The `create` command cannot be used if the specified `path` already exists as a file\n* If a `command` generates a long output, it will be truncated and marked with ``\n* The `undo_edit` command will revert the last edit made to the file at `path`\n\nNotes for using the `str_replace` command:\n* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!\n* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique\n* The `new_str` parameter should contain the edited lines that should replace the `old_str`\n', + 'parameters': { + 'type': 'object', + 'properties': { + 'command': { + 'description': 'The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.', + 'enum': [ + 'view', + 'create', + 'str_replace', + 'insert', + 'undo_edit', + ], + 'type': 'string', + }, + 'path': { + 'description': 'Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.', + 'type': 'string', + }, + 'file_text': { + 'description': 'Required parameter of `create` command, with the content of the file to be created.', + 'type': 'string', + }, + 'old_str': { + 'description': 'Required parameter of `str_replace` command containing the string in `path` to replace.', + 'type': 'string', + }, + 'new_str': { + 'description': 'Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.', + 'type': 'string', + }, + 'insert_line': { + 'description': 'Required parameter of `insert` command. 
The `new_str` will be inserted AFTER the line `insert_line` of `path`.', + 'type': 'integer', + }, + 'view_range': { + 'description': 'Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.', + 'items': {'type': 'integer'}, + 'type': 'array', + }, + }, + 'required': ['command', 'path'], + }, + }, + }, +] + + +def test_convert_tools_to_description(): + formatted_tools = convert_tools_to_description(FNCALL_TOOLS) + assert ( + formatted_tools.strip() + == """---- BEGIN FUNCTION #1: execute_bash ---- +Description: Execute a bash command in the terminal. +* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`. +* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process. +* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background. + +Parameters: { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process." + } + }, + "required": [ + "command" + ] +} +---- END FUNCTION #1 ---- + +---- BEGIN FUNCTION #2: finish ---- +Description: Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task. +No parameters are required for this function. +---- END FUNCTION #2 ---- + +---- BEGIN FUNCTION #3: str_replace_editor ---- +Description: Custom editing tool for viewing, creating and editing files +* State is persistent across command calls and discussions with the user +* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep +* The `create` command cannot be used if the specified `path` already exists as a file +* If a `command` generates a long output, it will be truncated and marked with `` +* The `undo_edit` command will revert the last edit made to the file at `path` + +Notes for using the `str_replace` command: +* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces! +* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique +* The `new_str` parameter should contain the edited lines that should replace the `old_str` + +Parameters: { + "type": "object", + "properties": { + "command": { + "description": "The commands to run. 
Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.", + "enum": [ + "view", + "create", + "str_replace", + "insert", + "undo_edit" + ], + "type": "string" + }, + "path": { + "description": "Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.", + "type": "string" + }, + "file_text": { + "description": "Required parameter of `create` command, with the content of the file to be created.", + "type": "string" + }, + "old_str": { + "description": "Required parameter of `str_replace` command containing the string in `path` to replace.", + "type": "string" + }, + "new_str": { + "description": "Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.", + "type": "string" + }, + "insert_line": { + "description": "Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.", + "type": "integer" + }, + "view_range": { + "description": "Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.", + "items": { + "type": "integer" + }, + "type": "array" + } + }, + "required": [ + "command", + "path" + ] +} +---- END FUNCTION #3 ----""".strip() + ) + + +FNCALL_MESSAGES = [ + { + 'content': [ + { + 'type': 'text', + 'text': "You are a helpful assistant that can interact with a computer to solve tasks.\n\n* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.\n\n\n", + 'cache_control': {'type': 'ephemeral'}, + } + ], + 'role': 'system', + }, + { + 'content': [ + { + 'type': 'text', + 'text': "\n/workspace/astropy__astropy__5.1\n\nI've uploaded a python code repository in the directory astropy__astropy__5.1. LONG DESCRIPTION:\n\n", + } + ], + 'role': 'user', + }, + { + 'content': [ + { + 'type': 'text', + 'text': "I'll help you implement the necessary changes to meet the requirements. Let's follow the steps:\n\n1. First, let's explore the repository structure:", + } + ], + 'role': 'assistant', + 'tool_calls': [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "ls -la /workspace/astropy__astropy__5.1"}', + 'name': 'execute_bash', + }, + 'id': 'toolu_01', + 'type': 'function', + } + ], + }, + { + 'content': [ + { + 'type': 'text', + 'text': 'ls -la /workspace/astropy__astropy__5.1\r\nls: /workspace/astropy__astropy__5.1: Bad file descriptor\r\nlrwxrwxrwx 1 root root 8 Oct 28 21:58 /workspace/astropy__astropy__5.1 -> /testbed[Python Interpreter: /opt/miniconda3/envs/testbed/bin/python]\nroot@openhands-workspace:/workspace/astropy__astropy__5.1 # \n[Command finished with exit code 0]', + } + ], + 'role': 'tool', + 'tool_call_id': 'toolu_01', + 'name': 'execute_bash', + }, + { + 'content': [ + { + 'type': 'text', + 'text': "I see there's a symlink. 
Let's explore the actual directory:", + } + ], + 'role': 'assistant', + 'tool_calls': [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "ls -la /testbed"}', + 'name': 'execute_bash', + }, + 'id': 'toolu_02', + 'type': 'function', + } + ], + }, + { + 'content': [ + { + 'type': 'text', + 'text': 'SOME OBSERVATION', + } + ], + 'role': 'tool', + 'tool_call_id': 'toolu_02', + 'name': 'execute_bash', + }, + { + 'content': [ + { + 'type': 'text', + 'text': "Let's look at the source code file mentioned in the PR description:", + } + ], + 'role': 'assistant', + 'tool_calls': [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "view", "path": "/testbed/astropy/io/fits/card.py"}', + 'name': 'str_replace_editor', + }, + 'id': 'toolu_03', + 'type': 'function', + } + ], + }, + { + 'content': [ + { + 'type': 'text', + 'text': "Here's the result of running `cat -n` on /testbed/astropy/io/fits/card.py:\n 1\t# Licensed under a 3-clause BSD style license - see PYFITS.rst...VERY LONG TEXT", + } + ], + 'role': 'tool', + 'tool_call_id': 'toolu_03', + 'name': 'str_replace_editor', + }, +] + +NON_FNCALL_MESSAGES = [ + { + 'role': 'system', + 'content': [ + { + 'type': 'text', + 'text': 'You are a helpful assistant that can interact with a computer to solve tasks.\n\n* If user provides a path, you should NOT assume it\'s relative to the current working directory. Instead, you should explore the file system to find the file before working on it.\n\n\n\nYou have access to the following functions:\n\n---- BEGIN FUNCTION #1: execute_bash ----\nDescription: Execute a bash command in the terminal.\n* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.\n* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process.\n* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background.\n\nParameters: {\n "type": "object",\n "properties": {\n "command": {\n "type": "string",\n "description": "The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process."\n }\n },\n "required": [\n "command"\n ]\n}\n---- END FUNCTION #1 ----\n\n---- BEGIN FUNCTION #2: finish ----\nDescription: Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.\nNo parameters are required for this function.\n---- END FUNCTION #2 ----\n\n---- BEGIN FUNCTION #3: str_replace_editor ----\nDescription: Custom editing tool for viewing, creating and editing files\n* State is persistent across command calls and discussions with the user\n* If `path` is a file, `view` displays the result of applying `cat -n`. 
If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep\n* The `create` command cannot be used if the specified `path` already exists as a file\n* If a `command` generates a long output, it will be truncated and marked with ``\n* The `undo_edit` command will revert the last edit made to the file at `path`\n\nNotes for using the `str_replace` command:\n* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!\n* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique\n* The `new_str` parameter should contain the edited lines that should replace the `old_str`\n\nParameters: {\n "type": "object",\n "properties": {\n "command": {\n "description": "The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.",\n "enum": [\n "view",\n "create",\n "str_replace",\n "insert",\n "undo_edit"\n ],\n "type": "string"\n },\n "path": {\n "description": "Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.",\n "type": "string"\n },\n "file_text": {\n "description": "Required parameter of `create` command, with the content of the file to be created.",\n "type": "string"\n },\n "old_str": {\n "description": "Required parameter of `str_replace` command containing the string in `path` to replace.",\n "type": "string"\n },\n "new_str": {\n "description": "Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.",\n "type": "string"\n },\n "insert_line": {\n "description": "Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.",\n "type": "integer"\n },\n "view_range": {\n "description": "Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.",\n "items": {\n "type": "integer"\n },\n "type": "array"\n }\n },\n "required": [\n "command",\n "path"\n ]\n}\n---- END FUNCTION #3 ----\n\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\nvalue_1\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format, start with \n- Required parameters MUST be specified\n- Only call one function at a time\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after.\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n', + 'cache_control': {'type': 'ephemeral'}, + } + ], + }, + { + 'content': [ + { + 'type': 'text', + 'text': "\n/workspace/astropy__astropy__5.1\n\nI've uploaded a python code repository in the directory astropy__astropy__5.1. LONG DESCRIPTION:\n\n", + } + ], + 'role': 'user', + }, + { + 'role': 'assistant', + 'content': [ + { + 'type': 'text', + 'text': "I'll help you implement the necessary changes to meet the requirements. Let's follow the steps:\n\n1. 
First, let's explore the repository structure:\n\n\nls -la /workspace/astropy__astropy__5.1\n", + } + ], + }, + { + 'role': 'user', + 'content': [ + { + 'type': 'text', + 'text': 'EXECUTION RESULT of [execute_bash]:\nls -la /workspace/astropy__astropy__5.1\r\nls: /workspace/astropy__astropy__5.1: Bad file descriptor\r\nlrwxrwxrwx 1 root root 8 Oct 28 21:58 /workspace/astropy__astropy__5.1 -> /testbed[Python Interpreter: /opt/miniconda3/envs/testbed/bin/python]\nroot@openhands-workspace:/workspace/astropy__astropy__5.1 # \n[Command finished with exit code 0]', + } + ], + }, + { + 'role': 'assistant', + 'content': [ + { + 'type': 'text', + 'text': "I see there's a symlink. Let's explore the actual directory:\n\n\nls -la /testbed\n", + } + ], + }, + { + 'role': 'user', + 'content': [ + { + 'type': 'text', + 'text': 'EXECUTION RESULT of [execute_bash]:\nSOME OBSERVATION', + } + ], + }, + { + 'role': 'assistant', + 'content': [ + { + 'type': 'text', + 'text': "Let's look at the source code file mentioned in the PR description:\n\n\nview\n/testbed/astropy/io/fits/card.py\n", + } + ], + }, + { + 'role': 'user', + 'content': [ + { + 'type': 'text', + 'text': "EXECUTION RESULT of [str_replace_editor]:\nHere's the result of running `cat -n` on /testbed/astropy/io/fits/card.py:\n 1\t# Licensed under a 3-clause BSD style license - see PYFITS.rst...VERY LONG TEXT", + } + ], + }, +] + +FNCALL_RESPONSE_MESSAGE = { + 'content': [ + { + 'type': 'text', + 'text': 'Let me search for the `_format_float` method mentioned in the PR description:', + } + ], + 'role': 'assistant', + 'tool_calls': [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "grep -n \\"_format_float\\" /testbed/astropy/io/fits/card.py"}', + 'name': 'execute_bash', + }, + 'id': 'toolu_04', + 'type': 'function', + } + ], +} + +NON_FNCALL_RESPONSE_MESSAGE = { + 'content': [ + { + 'type': 'text', + 'text': 'Let me search for the `_format_float` method mentioned in the PR description:\n\n\ngrep -n "_format_float" /testbed/astropy/io/fits/card.py\n', + } + ], + 'role': 'assistant', +} + + +@pytest.mark.parametrize( + 'tool_calls, expected', + [ + # Original test case + ( + FNCALL_RESPONSE_MESSAGE['tool_calls'], + """ +grep -n "_format_float" /testbed/astropy/io/fits/card.py +""", + ), + # Test case with multiple parameters + ( + [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "view", "path": "/test/file.py", "view_range": [1, 10]}', + 'name': 'str_replace_editor', + }, + 'id': 'test_id', + 'type': 'function', + } + ], + """ +view +/test/file.py +[1, 10] +""", + ), + ], +) +def test_convert_tool_call_to_string(tool_calls, expected): + assert len(tool_calls) == 1 + converted = convert_tool_call_to_string(tool_calls[0]) + print(converted) + assert converted == expected + + +def test_convert_fncall_messages_to_non_fncall_messages(): + converted_non_fncall = convert_fncall_messages_to_non_fncall_messages( + FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert converted_non_fncall == NON_FNCALL_MESSAGES + + +def test_convert_non_fncall_messages_to_fncall_messages(): + converted = convert_non_fncall_messages_to_fncall_messages( + NON_FNCALL_MESSAGES, FNCALL_TOOLS + ) + print(json.dumps(converted, indent=2)) + assert converted == FNCALL_MESSAGES + + +def test_two_way_conversion_nonfn_to_fn_to_nonfn(): + non_fncall_copy = copy.deepcopy(NON_FNCALL_MESSAGES) + converted_fncall = convert_non_fncall_messages_to_fncall_messages( + NON_FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert ( + non_fncall_copy == NON_FNCALL_MESSAGES + ) # make sure 
original messages are not modified + assert converted_fncall == FNCALL_MESSAGES + + fncall_copy = copy.deepcopy(FNCALL_MESSAGES) + converted_non_fncall = convert_fncall_messages_to_non_fncall_messages( + FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert ( + fncall_copy == FNCALL_MESSAGES + ) # make sure original messages are not modified + assert converted_non_fncall == NON_FNCALL_MESSAGES + + +def test_two_way_conversion_fn_to_nonfn_to_fn(): + fncall_copy = copy.deepcopy(FNCALL_MESSAGES) + converted_non_fncall = convert_fncall_messages_to_non_fncall_messages( + FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert ( + fncall_copy == FNCALL_MESSAGES + ) # make sure original messages are not modified + assert converted_non_fncall == NON_FNCALL_MESSAGES + + non_fncall_copy = copy.deepcopy(NON_FNCALL_MESSAGES) + converted_fncall = convert_non_fncall_messages_to_fncall_messages( + NON_FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert ( + non_fncall_copy == NON_FNCALL_MESSAGES + ) # make sure original messages are not modified + assert converted_fncall == FNCALL_MESSAGES + + +def test_infer_fncall_on_noncall_model(): + messages_for_llm_inference = convert_fncall_messages_to_non_fncall_messages( + FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert messages_for_llm_inference == NON_FNCALL_MESSAGES + # Mock LLM inference + response_message_from_llm_inference = NON_FNCALL_RESPONSE_MESSAGE + + # Convert back to fncall messages to hand back to the agent + # so agent is model-agnostic + all_nonfncall_messages = NON_FNCALL_MESSAGES + [response_message_from_llm_inference] + converted_fncall_messages = convert_non_fncall_messages_to_fncall_messages( + all_nonfncall_messages, FNCALL_TOOLS + ) + assert converted_fncall_messages == FNCALL_MESSAGES + [FNCALL_RESPONSE_MESSAGE] + assert converted_fncall_messages[-1] == FNCALL_RESPONSE_MESSAGE + + +def test_convert_from_multiple_tool_calls_to_single_tool_call_messages(): + # Test case with multiple tool calls in one message + input_messages = [ + { + 'role': 'assistant', + 'content': 'Let me help you with that.', + 'tool_calls': [ + { + 'id': 'call1', + 'type': 'function', + 'function': {'name': 'func1', 'arguments': '{}'}, + }, + { + 'id': 'call2', + 'type': 'function', + 'function': {'name': 'func2', 'arguments': '{}'}, + }, + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call1', + 'content': 'Result 1', + 'name': 'func1', + }, + { + 'role': 'tool', + 'tool_call_id': 'call2', + 'content': 'Result 2', + 'name': 'func2', + }, + { + 'role': 'assistant', + 'content': 'Test again', + 'tool_calls': [ + { + 'id': 'call3', + 'type': 'function', + 'function': {'name': 'func3', 'arguments': '{}'}, + }, + { + 'id': 'call4', + 'type': 'function', + 'function': {'name': 'func4', 'arguments': '{}'}, + }, + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call3', + 'content': 'Result 3', + 'name': 'func3', + }, + { + 'role': 'tool', + 'tool_call_id': 'call4', + 'content': 'Result 4', + 'name': 'func4', + }, + ] + + expected_output = [ + { + 'role': 'assistant', + 'content': 'Let me help you with that.', + 'tool_calls': [ + { + 'id': 'call1', + 'type': 'function', + 'function': {'name': 'func1', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call1', + 'content': 'Result 1', + 'name': 'func1', + }, + { + 'role': 'assistant', + 'content': '', + 'tool_calls': [ + { + 'id': 'call2', + 'type': 'function', + 'function': {'name': 'func2', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call2', + 'content': 'Result 2', + 'name': 'func2', + }, + { + 'role': 
'assistant', + 'content': 'Test again', + 'tool_calls': [ + { + 'id': 'call3', + 'type': 'function', + 'function': {'name': 'func3', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call3', + 'content': 'Result 3', + 'name': 'func3', + }, + { + 'role': 'assistant', + 'content': '', + 'tool_calls': [ + { + 'id': 'call4', + 'type': 'function', + 'function': {'name': 'func4', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call4', + 'content': 'Result 4', + 'name': 'func4', + }, + ] + + result = convert_from_multiple_tool_calls_to_single_tool_call_messages( + input_messages + ) + assert result == expected_output + + +def test_convert_from_multiple_tool_calls_to_single_tool_call_messages_incomplete(): + # Test case with multiple tool calls in one message + input_messages = [ + { + 'role': 'assistant', + 'content': 'Let me help you with that.', + 'tool_calls': [ + { + 'id': 'call1', + 'type': 'function', + 'function': {'name': 'func1', 'arguments': '{}'}, + }, + { + 'id': 'call2', + 'type': 'function', + 'function': {'name': 'func2', 'arguments': '{}'}, + }, + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call1', + 'content': 'Result 1', + 'name': 'func1', + }, + ] + + with pytest.raises(FunctionCallConversionError): + convert_from_multiple_tool_calls_to_single_tool_call_messages(input_messages) + + +def test_convert_from_multiple_tool_calls_no_changes_needed(): + # Test case where no conversion is needed (single tool call) + input_messages = [ + { + 'role': 'assistant', + 'content': 'Let me help you with that.', + 'tool_calls': [ + { + 'id': 'call1', + 'type': 'function', + 'function': {'name': 'func1', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call1', + 'content': 'Result 1', + 'name': 'func1', + }, + ] + + result = convert_from_multiple_tool_calls_to_single_tool_call_messages( + input_messages + ) + assert result == input_messages + + +def test_convert_from_multiple_tool_calls_no_tool_calls(): + # Test case with no tool calls + input_messages = [ + {'role': 'user', 'content': 'Hello'}, + {'role': 'assistant', 'content': 'Hi there!'}, + ] + + result = convert_from_multiple_tool_calls_to_single_tool_call_messages( + input_messages + ) + assert result == input_messages From 467fae900693830128719135130deff054191d02 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 3 Nov 2024 22:54:34 +0000 Subject: [PATCH 13/40] add ignore final tool result flag --- openhands/llm/fn_call_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index 17e897a253596..ca0fcff9eedd3 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -438,6 +438,7 @@ def convert_non_fncall_messages_to_fncall_messages( def convert_from_multiple_tool_calls_to_single_tool_call_messages( messages: list[dict], + ignore_final_tool_result: bool = False, ) -> list[dict]: """Break one message with multiple tool calls into multiple messages.""" converted_messages = [] @@ -474,7 +475,7 @@ def convert_from_multiple_tool_calls_to_single_tool_call_messages( ), f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}' converted_messages.append(message) - if len(pending_tool_calls) > 0: + if not ignore_final_tool_result and len(pending_tool_calls) > 0: raise FunctionCallConversionError( f'Found pending tool calls but no tool result: {pending_tool_calls=}' ) From 
f31adac07a64cfa0f9f11199ae027d00a5bc83aa Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 00:13:48 +0000 Subject: [PATCH 14/40] support setting hard timeout per evaluation instance --- evaluation/swe_bench/run_infer.py | 7 +++- evaluation/utils/shared.py | 59 +++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 39163e9594315..405d76cf4f00b 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -527,5 +527,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: instances[col] = instances[col].apply(lambda x: str(x)) run_evaluation( - instances, metadata, output_file, args.eval_num_workers, process_instance + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + timeout_seconds=60 * 60, # 1 hour PER instance should be more than enough ) diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index b8d2ad281ad6d..fde36ce65d5c2 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -3,9 +3,11 @@ import multiprocessing as mp import os import pathlib +import signal import subprocess import time import traceback +from contextlib import contextmanager from typing import Any, Awaitable, Callable, TextIO import pandas as pd @@ -89,6 +91,27 @@ class EvalException(Exception): pass +class EvalTimeoutException(Exception): + pass + + +@contextmanager +def timeout(seconds: int): + def timeout_handler(signum, frame): + raise EvalTimeoutException(f'Function timed out after {seconds} seconds') + + # Set up the signal handler + original_handler = signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(seconds) + + try: + yield + finally: + # Restore the original handler and disable the alarm + signal.alarm(0) + signal.signal(signal.SIGALRM, original_handler) + + def codeact_user_response( state: State, encapsulate_solution: bool = False, @@ -270,15 +293,31 @@ def _process_instance_wrapper( metadata: EvalMetadata, use_mp: bool, max_retries: int = 5, + timeout_seconds: int | None = None, ) -> EvalOutput: - """Wrap the process_instance_func to handle retries and errors. - - Retry an instance up to max_retries times if it fails (e.g., due to transient network/runtime issues). - """ + """Wrap the process_instance_func to handle retries and errors.""" for attempt in range(max_retries + 1): try: - result = process_instance_func(instance, metadata, use_mp) + if timeout_seconds is not None: + with timeout(timeout_seconds): + result = process_instance_func(instance, metadata, use_mp) + else: + result = process_instance_func(instance, metadata, use_mp) return result + except EvalTimeoutException as e: + error = f'Timeout after {timeout_seconds} seconds' + stacktrace = traceback.format_exc() + msg = ( + '-' * 10 + + '\n' + + f'Timeout ({timeout_seconds} seconds) in instance [{instance.instance_id}], Stopped evaluation for this instance.' 
+ + '\n' + + '-' * 10 + ) + logger.exception(e) + raise RuntimeError( + f'Maximum timeout retries reached for instance {instance.instance_id}' + ) from e except Exception as e: error = str(e) stacktrace = traceback.format_exc() @@ -327,6 +366,7 @@ def run_evaluation( [pd.Series, EvalMetadata, bool], Awaitable[EvalOutput] ], max_retries: int = 5, # number of retries for each instance + timeout_seconds: int | None = None, ): use_multiprocessing = num_workers > 1 @@ -346,7 +386,14 @@ def run_evaluation( if use_multiprocessing: with mp.Pool(num_workers) as pool: args_iter = ( - (process_instance_func, instance, metadata, True, max_retries) + ( + process_instance_func, + instance, + metadata, + True, + max_retries, + timeout_seconds, + ) for _, instance in dataset.iterrows() ) results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter) From e6634102be26c4687a4c697e9a4b4edd76d7adbc Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 02:52:20 +0000 Subject: [PATCH 15/40] return stuff for timeout --- evaluation/utils/shared.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index fde36ce65d5c2..5c3d9864ca6c9 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -315,9 +315,11 @@ def _process_instance_wrapper( + '-' * 10 ) logger.exception(e) - raise RuntimeError( - f'Maximum timeout retries reached for instance {instance.instance_id}' - ) from e + return EvalOutput( + instance_id=instance.instance_id, + test_result={}, + error=error, + ) except Exception as e: error = str(e) stacktrace = traceback.format_exc() From 472f30ec909ee4c512abe044fddf2def7b197ad0 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 03:50:53 +0000 Subject: [PATCH 16/40] update script --- scripts/eval-swebench-lite100.sh | 17 +++++++++++++++++ ...rollout.sh => eval-swetrain-lite-rollout.sh} | 0 2 files changed, 17 insertions(+) create mode 100755 scripts/eval-swebench-lite100.sh rename scripts/{eval-lite-rollout.sh => eval-swetrain-lite-rollout.sh} (100%) diff --git a/scripts/eval-swebench-lite100.sh b/scripts/eval-swebench-lite100.sh new file mode 100755 index 0000000000000..b2842f7f7b68a --- /dev/null +++ b/scripts/eval-swebench-lite100.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/SWE-bench_lite" +SPLIT="test100" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
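As a sanity check, the SIGALRM guard introduced in this patch can be exercised on its own. A minimal, self-contained sketch follows; the sleeping call merely stands in for a slow evaluation instance, and signal.alarm is Unix-only and must run in the main thread:

import signal
import time
from contextlib import contextmanager

class EvalTimeoutException(Exception):
    pass

@contextmanager
def timeout(seconds: int):
    def handler(signum, frame):
        raise EvalTimeoutException(f'Function timed out after {seconds} seconds')
    original_handler = signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, original_handler)  # restore the previous handler

try:
    with timeout(2):
        time.sleep(5)  # stand-in for process_instance_func
except EvalTimeoutException as e:
    print(e)  # Function timed out after 2 seconds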
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-lite-rollout.sh b/scripts/eval-swetrain-lite-rollout.sh similarity index 100% rename from scripts/eval-lite-rollout.sh rename to scripts/eval-swetrain-lite-rollout.sh From c1c46b671ebfa211c1cc9f35e6c0ce0d6ab4fcd7 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 03:59:45 +0000 Subject: [PATCH 17/40] bump max turn to 50 --- scripts/rollout-eval.sh | 2 +- scripts/rollout-swe-train-full.sh | 28 ++++++++++++++++++++++++++++ scripts/rollout-swe-train-lite.sh | 4 ++-- 3 files changed, 31 insertions(+), 3 deletions(-) create mode 100755 scripts/rollout-swe-train-full.sh diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index cbb7017c06bda..f98006a347d0c 100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -17,7 +17,7 @@ export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/sw export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 -MAX_ITER=30 +MAX_ITER=50 NUM_WORKERS=64 ./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-swe-train-full.sh b/scripts/rollout-swe-train-full.sh new file mode 100755 index 0000000000000..5a0ec7038d925 --- /dev/null +++ b/scripts/rollout-swe-train-full.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +MODEL=$1 +EXP_NAME=$2 # "train-t0" +export EXP_NAME=$EXP_NAME +echo "MODEL: $MODEL" +echo "EXP_NAME: $EXP_NAME" +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v2.success" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +EVAL_LIMIT=2000 +MAX_ITER=50 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh index f47d1b5aad1a6..77173ade2b30d 100755 --- a/scripts/rollout-swe-train-lite.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -19,8 +19,8 @@ export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" EVAL_LIMIT=2000 -MAX_ITER=30 -NUM_WORKERS=128 +MAX_ITER=50 +NUM_WORKERS=64 ./evaluation/swe_bench/scripts/run_infer.sh \ $MODEL HEAD CodeActAgent \ From a50cafe9d26556bc865e39d4628446014cafa986 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 04:07:27 +0000 Subject: [PATCH 18/40] change limit --- scripts/rollout-swe-train-full.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-swe-train-full.sh b/scripts/rollout-swe-train-full.sh index 5a0ec7038d925..b07940c89269a 100755 --- a/scripts/rollout-swe-train-full.sh +++ b/scripts/rollout-swe-train-full.sh @@ -18,7 +18,7 @@ export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" -EVAL_LIMIT=2000 +EVAL_LIMIT=3000 MAX_ITER=50 NUM_WORKERS=64 From faae58004cadcf6b6146f5fbfd51ab3f6eed0b46 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 11 Nov 2024 23:50:20 +0000 Subject: [PATCH 19/40] get more-to-update fncall converter --- openhands/llm/fn_call_converter.py | 321 ++++++++++++++++++++++++++++- 1 file changed, 317 insertions(+), 4 deletions(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index ca0fcff9eedd3..b63608df45379 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -44,6 +44,188 @@ - If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls """ +STOP_WORDS = [' + +pwd && ls + + + +USER: EXECUTION RESULT of [execute_bash]: +/workspace +openhands@runtime:~/workspace$ + +ASSISTANT: There is no `app.py` file in the current directory. Let me create a Python file `app.py`: + +create +/workspace/app.py + +from flask import Flask +app = Flask(__name__) + +@app.route('/') +def index(): + numbers = list(range(1, 11)) + return str(numbers) + +if __name__ == '__main__': + app.run(port=5000) + + + +USER: EXECUTION RESULT of [str_replace_editor]: +File created successfully at: /workspace/app.py + +ASSISTANT: I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you: + + +python3 app.py > server.log 2>&1 & + + + +USER: EXECUTION RESULT of [execute_bash]: +[1] 121 +[1]+ Exit 1 python3 app.py > server.log 2>&1 + +ASSISTANT: Looks like the server is running with PID 121 then crashed. 
Let me check the server log: + + +cat server.log + + + +USER: EXECUTION RESULT of [execute_bash]: +Traceback (most recent call last): + File "/workspace/app.py", line 2, in + from flask import Flask +ModuleNotFoundError: No module named 'flask' + +ASSISTANT: Looks like the server crashed because the `flask` module is not installed. Let me install the `flask` module for you: + + +pip3 install flask + + + +USER: EXECUTION RESULT of [execute_bash]: +Defaulting to user installation because normal site-packages is not writeable +Collecting flask + Using cached flask-3.0.3-py3-none-any.whl (101 kB) +Collecting blinker>=1.6.2 + Using cached blinker-1.7.0-py3-none-any.whl (13 kB) +Collecting Werkzeug>=3.0.0 + Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB) +Collecting click>=8.1.3 + Using cached click-8.1.7-py3-none-any.whl (97 kB) +Collecting itsdangerous>=2.1.2 + Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB) +Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3) +Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5) +Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask +Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0 + +ASSISTANT: The `flask` module is installed successfully. Let me run the Python file again: + + +python3 app.py > server.log 2>&1 & + + + +USER: EXECUTION RESULT of [execute_bash]: +[1] 124 + +ASSISTANT: +Let me check the server log again: + + +cat server.log + + + +USER: EXECUTION RESULT of [execute_bash]: +* Serving Flask app 'app' + * Debug mode: off +WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead. + * Running on http://127.0.0.1:5000 +Press CTRL+C to quit + +ASSISTANT: +The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask! + +USER: Now kill the server, make it display the numbers in a table format. + +ASSISTANT: +Sure! Let me stop the server first: + + +kill 124 + + + +USER: EXECUTION RESULT of [execute_bash]: +[1]+ Terminated python3 app.py > server.log 2>&1 + +ASSISTANT: +Now let me display the numbers in a table format: + +str_replace +/workspace/app.py +return str(numbers) +return '' + ''.join([f'' for i in numbers]) + '
{i}
' + + +USER: EXECUTION RESULT of [str_replace_editor]: +The file /workspace/app.py has been edited. Here's the result of running `cat -n` on a snippet of /workspace/app.py: + 3 + 4 @app.route('/') + 5 def index(): + 6 numbers = list(range(1, 11)) + 7 return '' + ''.join([f'' for i in numbers]) + '
{i}
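(The HTML markup in the example snippet above did not survive extraction; the edited line was presumably equivalent to the following, shown standalone so it can be run directly:)

numbers = list(range(1, 11))
html = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
print(html)  # a one-column HTML table of the numbers 1 through 10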
' + 8 + 9 if __name__ == '__main__': + 10 app.run(port=5000) +Review the changes and make sure they are as expected. Edit the file again if necessary. + +ASSISTANT: +Running the updated file: + + + +python3 app.py > server.log 2>&1 & + + + +USER: EXECUTION RESULT of [execute_bash]: +[1] 126 + +ASSISTANT: +The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests! + + + +--------------------- END OF EXAMPLE --------------------- + +Do NOT assume the environment is the same as in the example above. + +--------------------- NEW TASK DESCRIPTION --------------------- +""".lstrip() + +IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX = """ +--------------------- END OF NEW TASK DESCRIPTION --------------------- + +PLEASE follow the format strictly! PLEASE EMIT ONE AND ONLY ONE FUNCTION CALL PER MESSAGE. +""".lstrip() + # Regex patterns for function call parsing FN_REGEX_PATTERN = r']+)>\n(.*?)' FN_PARAM_REGEX_PATTERN = r']+)>(.*?)' @@ -92,10 +274,32 @@ def convert_tools_to_description(tools: list[dict]) -> str: ret += '\n' ret += f"---- BEGIN FUNCTION #{i+1}: {fn['name']} ----\n" ret += f"Description: {fn['description']}\n" + if 'parameters' in fn: - ret += f"Parameters: {json.dumps(fn['parameters'], indent=2)}\n" + ret += 'Parameters:\n' + properties = fn['parameters'].get('properties', {}) + required_params = set(fn['parameters'].get('required', [])) + + for j, (param_name, param_info) in enumerate(properties.items()): + # Indicate required/optional in parentheses with type + is_required = param_name in required_params + param_status = 'required' if is_required else 'optional' + param_type = param_info.get('type', 'string') + + # Get parameter description + desc = param_info.get('description', 'No description provided') + + # Handle enum values if present + if 'enum' in param_info: + enum_values = ', '.join(f'`{v}`' for v in param_info['enum']) + desc += f'\nAllowed values: [{enum_values}]' + + ret += ( + f' ({j+1}) {param_name} ({param_type}, {param_status}): {desc}\n' + ) else: ret += 'No parameters are required for this function.\n' + ret += f'---- END FUNCTION #{i+1} ----\n' return ret @@ -113,6 +317,7 @@ def convert_fncall_messages_to_non_fncall_messages( ) converted_messages = [] + first_user_message_encountered = False for message in messages: role, content = message['role'], message['content'] if content is None: @@ -135,7 +340,82 @@ def convert_fncall_messages_to_non_fncall_messages( converted_messages.append({'role': 'system', 'content': content}) # 2. 
USER MESSAGES (no change) elif role == 'user': - converted_messages.append(message) + # Add in-context learning example for the first user message + if not first_user_message_encountered: + first_user_message_encountered = True + # Check tools + if not ( + tools + and len(tools) > 0 + and any( + ( + tool['type'] == 'function' + and tool['function']['name'] == 'execute_bash' + and 'command' + in tool['function']['parameters']['properties'] + ) + for tool in tools + ) + and any( + ( + tool['type'] == 'function' + and tool['function']['name'] == 'str_replace_editor' + and 'path' in tool['function']['parameters']['properties'] + and 'file_text' + in tool['function']['parameters']['properties'] + and 'old_str' + in tool['function']['parameters']['properties'] + and 'new_str' + in tool['function']['parameters']['properties'] + ) + for tool in tools + ) + ): + raise FunctionCallConversionError( + 'The currently provided tool set are NOT compatible with the in-context learning example for FnCall to Non-FnCall conversion. ' + 'Please update your tool set OR the in-context learning example in openhands/llm/fn_call_converter.py' + ) + + # add in-context learning example + if isinstance(content, str): + content = ( + IN_CONTEXT_LEARNING_EXAMPLE_PREFIX + + content + + IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX + ) + elif isinstance(content, list): + if content and content[0]['type'] == 'text': + content[0]['text'] = ( + IN_CONTEXT_LEARNING_EXAMPLE_PREFIX + + content[0]['text'] + + IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX + ) + else: + content = ( + [ + { + 'type': 'text', + 'text': IN_CONTEXT_LEARNING_EXAMPLE_PREFIX, + } + ] + + content + + [ + { + 'type': 'text', + 'text': IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX, + } + ] + ) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + converted_messages.append( + { + 'role': 'user', + 'content': content, + } + ) # 3. 
ASSISTANT MESSAGES # - 3.1 no change if no function call @@ -266,6 +546,16 @@ def _extract_and_validate_params( return params +def _fix_stopword(content: str) -> str: + """Fix the issue when some LLM would NOT return the stopword.""" + if '' + else: + content = content + '\n' + return content + + def convert_non_fncall_messages_to_fncall_messages( messages: list[dict], tools: list[ChatCompletionToolParam], @@ -280,9 +570,10 @@ def convert_non_fncall_messages_to_fncall_messages( converted_messages = [] tool_call_counter = 1 # Counter for tool calls + first_user_message_encountered = False for message in messages: role, content = message['role'], message['content'] - + content = content or '' # handle cases where content is None # For system messages, remove the added suffix if role == 'system': if isinstance(content, str): @@ -297,6 +588,26 @@ def convert_non_fncall_messages_to_fncall_messages( converted_messages.append({'role': 'system', 'content': content}) # Skip user messages (no conversion needed) elif role == 'user': + # Check & replace in-context learning example + if not first_user_message_encountered: + first_user_message_encountered = True + if isinstance(content, str): + content = content.replace(IN_CONTEXT_LEARNING_EXAMPLE_PREFIX, '') + content = content.replace(IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX, '') + elif isinstance(content, list): + for item in content: + if item['type'] == 'text': + item['text'] = item['text'].replace( + IN_CONTEXT_LEARNING_EXAMPLE_PREFIX, '' + ) + item['text'] = item['text'].replace( + IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX, '' + ) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + # Check for tool execution result pattern if isinstance(content, str): tool_result_match = re.search( @@ -348,14 +659,16 @@ def convert_non_fncall_messages_to_fncall_messages( } ) else: - converted_messages.append(message) + converted_messages.append({'role': 'user', 'content': content}) # Handle assistant messages elif role == 'assistant': if isinstance(content, str): + content = _fix_stopword(content) fn_match = re.search(FN_REGEX_PATTERN, content, re.DOTALL) elif isinstance(content, list): if content and content[-1]['type'] == 'text': + content[-1]['text'] = _fix_stopword(content[-1]['text']) fn_match = re.search( FN_REGEX_PATTERN, content[-1]['text'], re.DOTALL ) From 226588a5d36d18a6c48d6550df95a8e873875178 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 12 Nov 2024 00:00:22 +0000 Subject: [PATCH 20/40] optionally add icl --- openhands/llm/fn_call_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index b63608df45379..5ddbcb3050646 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -307,6 +307,7 @@ def convert_tools_to_description(tools: list[dict]) -> str: def convert_fncall_messages_to_non_fncall_messages( messages: list[dict], tools: list[ChatCompletionToolParam], + add_in_context_learning_example: bool = True, ) -> list[dict]: """Convert function calling messages to non-function calling messages.""" messages = copy.deepcopy(messages) @@ -341,7 +342,7 @@ def convert_fncall_messages_to_non_fncall_messages( # 2. 
USER MESSAGES (no change) elif role == 'user': # Add in-context learning example for the first user message - if not first_user_message_encountered: + if not first_user_message_encountered and add_in_context_learning_example: first_user_message_encountered = True # Check tools if not ( From cb6241593780e6eb83c58cc55fd30babafdab4f4 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 15 Nov 2024 05:08:58 +0000 Subject: [PATCH 21/40] fix(swebench): handle error in eval_infer annd --- evaluation/swe_bench/eval_infer.py | 42 +++++++++++++++++------------- evaluation/swe_bench/run_infer.py | 2 +- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index c4566c1bfd3df..5af302c6e6b31 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -244,23 +244,29 @@ def process_instance( test_output_path = os.path.join(log_dir, 'test_output.txt') with open(test_output_path, 'w') as f: f.write(test_output) - - _report = get_eval_report( - test_spec=test_spec, - prediction={ - 'model_patch': model_patch, - 'instance_id': instance_id, - }, - log_path=test_output_path, - include_tests_status=True, - ) - report = _report[instance_id] - logger.info( - f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}" - ) - instance['test_result']['report']['resolved'] = report[ - 'resolved' - ] + try: + _report = get_eval_report( + test_spec=test_spec, + prediction={ + 'model_patch': model_patch, + 'instance_id': instance_id, + }, + log_path=test_output_path, + include_tests_status=True, + ) + report = _report[instance_id] + logger.info( + f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}" + ) + instance['test_result']['report']['resolved'] = report[ + 'resolved' + ] + except Exception as e: + logger.error( + f'[{instance_id}] Error when getting eval report: {e}' + ) + instance['test_result']['report']['resolved'] = False + instance['test_result']['report']['error_eval'] = True else: logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}') instance['test_result']['report']['error_eval'] = True @@ -335,7 +341,7 @@ def process_instance( if 'model_patch' not in predictions.columns: predictions['model_patch'] = predictions['test_result'].apply( - lambda x: x['git_patch'] + lambda x: x.get('git_patch', '') ) assert {'instance_id', 'model_patch'}.issubset( set(predictions.columns) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 405d76cf4f00b..86334bed56bb0 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -532,5 +532,5 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: output_file, args.eval_num_workers, process_instance, - timeout_seconds=60 * 60, # 1 hour PER instance should be more than enough + timeout_seconds=120 * 60, # 2 hour PER instance should be more than enough ) From e0a50e6b5e9b15eb3f368564314302fac70c5e7c Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 15 Nov 2024 05:17:49 +0000 Subject: [PATCH 22/40] fix arg --- evaluation/swe_bench/run_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 386c0dd192383..9cb9dd77f4983 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -145,7 +145,7 @@ def get_config( platform='linux/amd64', 
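A note on the converter changes above: the angle-bracket markup being parsed, along with the stop word in STOP_WORDS, was stripped during extraction. A hedged reconstruction of the intended patterns, inferred from the surviving regex fragments, with a small round trip (the exact tag names may differ in detail from the source):

import re

# Presumed originals of the garbled constants above; STOP_WORDS would be ['</function'].
FN_REGEX_PATTERN = r'<function=([^>]+)>\n(.*?)</function>'
FN_PARAM_REGEX_PATTERN = r'<parameter=([^>]+)>(.*?)</parameter>'

example = (
    '<function=execute_bash>\n'
    '<parameter=command>\npwd && ls\n</parameter>\n'
    '</function>'
)
fn = re.search(FN_REGEX_PATTERN, example, re.DOTALL)
print(fn.group(1))  # execute_bash
for name, value in re.findall(FN_PARAM_REGEX_PATTERN, fn.group(2), re.DOTALL):
    print(name, value.strip())  # command pwd && ls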
api_key=os.environ.get('ALLHANDS_API_KEY', None), remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_remote_runtime_alive=False, + keep_runtime_alive=False, remote_runtime_init_timeout=3600, ), # do not mount workspace From d2d8bf5023256754c172f0f11d24dc73bb2dda03 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 15 Nov 2024 15:15:44 +0000 Subject: [PATCH 23/40] fix for empty content --- openhands/llm/fn_call_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index 5ddbcb3050646..2f8d9cbe5cc80 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -320,7 +320,8 @@ def convert_fncall_messages_to_non_fncall_messages( converted_messages = [] first_user_message_encountered = False for message in messages: - role, content = message['role'], message['content'] + role = message['role'] + content = message.get('content', '') if content is None: content = '' From c3d9ec2b248ee79fbca7e18f487f871722876cb5 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 15 Nov 2024 15:17:22 +0000 Subject: [PATCH 24/40] remove extra --- openhands/llm/llm.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index 0590945995c1e..7712457a73ad0 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -121,7 +121,9 @@ def __init__( top_p=self.config.top_p, drop_params=self.config.drop_params, ) - + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + self.init_model_info() if self.vision_is_active(): logger.debug('LLM: model has vision enabled') if self.is_caching_prompt_active(): @@ -143,16 +145,6 @@ def __init__( drop_params=self.config.drop_params, ) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - self.init_model_info() - if self.vision_is_active(): - logger.debug('LLM: model has vision enabled') - if self.is_caching_prompt_active(): - logger.debug('LLM: caching prompt enabled') - if self.is_function_calling_active(): - logger.debug('LLM: model supports function calling') - self._completion_unwrapped = self._completion @self.retry_decorator( From a25d05de9c09173e3c72f9eb47f3ad6559f9b5ef Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 21 Nov 2024 20:03:31 +0000 Subject: [PATCH 25/40] update summarizer script to accept folder --- .../scripts/eval/summarize_outputs.py | 211 +++++++++++++++--- 1 file changed, 179 insertions(+), 32 deletions(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 5d5dbbf2a3bd5..739f26d10eb3a 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -1,8 +1,12 @@ #!/usr/bin/env python3 import argparse +import glob import json +import os from collections import Counter +import pandas as pd + from openhands.events.serialization import event_from_dict from openhands.events.utils import get_pairs_from_events @@ -12,23 +16,18 @@ 'Action execution failed', ] -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('output_file', type=str, help='The file to summarize') - args = parser.parse_args() - with open(args.output_file, 'r') as file: +def process_file(file_path): + with open(file_path, 'r') as file: lines = file.readlines() num_lines = len(lines) num_error_lines = 0 num_agent_stuck_in_loop = 0 - num_resolved = 0 
num_empty_patch = 0 - + num_unfinished_runs = 0 error_counter = Counter() - main_agent_cost = [] editor_cost = [] num_turns = [] @@ -36,6 +35,11 @@ for line in lines: _d = json.loads(line) + if 'metrics' not in _d or _d['metrics'] is None: + # this is a failed run + num_unfinished_runs += 1 + continue + # Cost costs = _d['metrics'].get('costs', []) _cur_main_agent_cost = 0 @@ -89,30 +93,173 @@ num_error_lines += 1 break - # print the error counter (with percentage) - print( - f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)' - ) - print( - f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)' - ) - print( - f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)' - ) - print( - f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)' + return { + 'file_path': file_path, + 'total_instances': num_lines, + 'resolved': { + 'count': num_resolved, + 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0, + }, + 'empty_patches': { + 'count': num_empty_patch, + 'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0, + }, + 'unfinished_runs': { + 'count': num_unfinished_runs, + 'percentage': (num_unfinished_runs / num_lines * 100) + if num_lines > 0 + else 0, + }, + 'errors': { + 'total': num_error_lines, + 'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0, + 'stuck_in_loop': { + 'count': num_agent_stuck_in_loop, + 'percentage': (num_agent_stuck_in_loop / num_lines * 100) + if num_lines > 0 + else 0, + }, + 'breakdown': { + str(error): { + 'count': count, + 'percentage': (count / num_lines * 100) if num_lines > 0 else 0, + } + for error, count in error_counter.items() + }, + }, + 'statistics': { + 'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0, + 'costs': { + 'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0, + 'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0, + 'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines + if num_lines > 0 + else 0, + }, + }, + } + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + 'input_path', type=str, help='The file or directory to summarize' ) - assert len(num_turns) == num_lines - assert len(main_agent_cost) == num_lines - assert len(editor_cost) == num_lines - print('## Statistics') - print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}') - print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD') - print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD') - print( - f'Avg. 
total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD' + parser.add_argument( + '--output', + type=str, + help='Output JSONL file for results', + default='summary_results.jsonl', ) + args = parser.parse_args() + + results = [] + + if os.path.isdir(args.input_path): + # Process all output.jsonl files in subdirectories + pattern = os.path.join(args.input_path, '**/output.jsonl') + files = glob.glob(pattern, recursive=True) + print(f'Processing {len(files)} files from directory {args.input_path}') + + # Process each file silently and collect results + for file_path in files: + try: + result = process_file(file_path) + results.append(result) + except Exception as e: + print(f'Error processing {file_path}: {str(e)}') + import traceback + + traceback.print_exc() + continue + + print(f'Results written to {args.output}') + + # Convert results to pandas DataFrame and sort by resolve rate + df = pd.DataFrame(results) + + # Extract directory name from file path + df['directory'] = df['file_path'].apply( + lambda x: os.path.basename(os.path.dirname(x)) + ) + + df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) + df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) + df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) + df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) + df['error_rate'] = df['errors'].apply(lambda x: x['percentage']) + df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total']) + + df = df.sort_values('resolve_rate', ascending=False) + + print('\nResults summary (sorted by resolve rate):') + columns = [ + 'directory', + 'resolve_rate', + 'empty_patch_rate', + 'unfinished_rate', + 'error_rate', + 'avg_turns', + 'avg_cost', + 'total_instances', + ] + + # Set display options for pandas + pd.set_option('display.width', 0) # Auto-detect terminal width + pd.set_option('display.max_rows', None) # Show all rows + + # Use formatters and max_colwidth in to_string + print( + df[columns].to_string( + float_format=lambda x: '{:.2f}'.format(x), + formatters={ + 'directory': lambda x: x[:90] + }, # Truncate directory names to 20 chars + index=False, + ) + ) + + df.to_json(args.output, lines=True, orient='records') + else: + # Process single file with detailed output + try: + result = process_file(args.input_path) + results.append(result) + + # Print detailed results for single file + print(f'\nResults for {args.input_path}:') + print( + f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)" + ) + print( + f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)" + ) + print( + f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)" + ) + print( + f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)" + ) + print( + f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)" + ) + print('## Statistics') + print( + f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}" + ) + print( + f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD" + ) + print( + f"Avg. 
editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD" + ) + print( + f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD" + ) + + print('## Detailed error breakdown:') + for error, data in result['errors']['breakdown'].items(): + print(f"{error}: {data['count']} ({data['percentage']:.2f}%)") - print('## Detailed error breakdown:') - for error, count in error_counter.items(): - print(f'{error}: {count} ({count / num_lines * 100:.2f}%)') + except Exception as e: + print(f'Error processing {args.input_path}: {str(e)}') From 640c7acf6d3186dab1c8451891287ae1f7da3a7f Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 21 Nov 2024 20:04:57 +0000 Subject: [PATCH 26/40] print summary string to text --- .../scripts/eval/summarize_outputs.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 739f26d10eb3a..21ff18ebdbb7e 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -192,7 +192,7 @@ def process_file(file_path): df = df.sort_values('resolve_rate', ascending=False) - print('\nResults summary (sorted by resolve rate):') + # Create the summary string columns = [ 'directory', 'resolve_rate', @@ -203,22 +203,25 @@ def process_file(file_path): 'avg_cost', 'total_instances', ] - - # Set display options for pandas - pd.set_option('display.width', 0) # Auto-detect terminal width - pd.set_option('display.max_rows', None) # Show all rows - - # Use formatters and max_colwidth in to_string - print( - df[columns].to_string( - float_format=lambda x: '{:.2f}'.format(x), - formatters={ - 'directory': lambda x: x[:90] - }, # Truncate directory names to 20 chars - index=False, - ) + summary_str = df[columns].to_string( + float_format=lambda x: '{:.2f}'.format(x), + formatters={ + 'directory': lambda x: x[:90] + }, # Truncate directory names to 20 chars + index=False, ) + # Print to console + print('\nResults summary (sorted by resolve rate):') + print(summary_str) + + # Save to text file + txt_output = args.output.rsplit('.', 1)[0] + '.txt' + with open(txt_output, 'w') as f: + f.write('Results summary (sorted by resolve rate):\n') + f.write(summary_str) + + # Save df.to_json(args.output, lines=True, orient='records') else: # Process single file with detailed output From e4f3859d1d09e72c3cfdf06601912e1d4cff9cec Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 21 Nov 2024 20:24:26 +0000 Subject: [PATCH 27/40] update rollout eval script --- scripts/rollout-eval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index f98006a347d0c..23957ab0bd51e 100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -17,7 +17,7 @@ export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/sw export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 -MAX_ITER=50 +MAX_ITER=100 NUM_WORKERS=64 ./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT From aa3b39eef746afe10a7f54c1ad8693fb644a80af Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 22 Nov 2024 21:22:19 +0000 Subject: [PATCH 28/40] allow specifying n processes --- scripts/rollout-eval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index 23957ab0bd51e..ac46effca15ee 
100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -2,6 +2,7 @@ export EXP_NAME="t0" # use this to differentiate between different runs MODEL=$1 +NUM_WORKERS=${2:-64} DATASET="swe-train/SWE-bench_lite" SPLIT="test100" N_RUNS=1 @@ -18,6 +19,5 @@ export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 MAX_ITER=100 -NUM_WORKERS=64 ./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT From 7c4ce5663dcbfa44c38b779f2dba61e245497b83 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 22 Nov 2024 21:22:29 +0000 Subject: [PATCH 29/40] include litellm timeout as error --- evaluation/swe_bench/scripts/eval/summarize_outputs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 21ff18ebdbb7e..2252ffa345eb3 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -14,6 +14,7 @@ 'Agent encountered an error while processing the last action', 'APIError', 'Action execution failed', + 'litellm.Timeout: APITimeoutError', ] @@ -223,6 +224,7 @@ def process_file(file_path): # Save df.to_json(args.output, lines=True, orient='records') + df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False) else: # Process single file with detailed output try: From 14b15d12dd22faeb0d4afe69f6335c931c1401a1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 23 Nov 2024 18:19:52 +0000 Subject: [PATCH 30/40] add script to rollout and eval lite --- scripts/eval-swebench-lite.sh | 17 +++++++++++++++++ scripts/rollout-eval-lite.sh | 23 +++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100755 scripts/eval-swebench-lite.sh create mode 100755 scripts/rollout-eval-lite.sh diff --git a/scripts/eval-swebench-lite.sh b/scripts/eval-swebench-lite.sh new file mode 100755 index 0000000000000..5a0172dd91174 --- /dev/null +++ b/scripts/eval-swebench-lite.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-eval-lite.sh b/scripts/rollout-eval-lite.sh new file mode 100755 index 0000000000000..21113f3f870c1 --- /dev/null +++ b/scripts/rollout-eval-lite.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=300 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT From edfeb2f84584203d6bdddc9d0b2d8f51ac336134 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 23 Nov 2024 18:25:48 +0000 Subject: [PATCH 31/40] put aggregate dir to a separate script --- .../scripts/eval/summarize_outputs.py | 77 ++++++++++--------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 2252ffa345eb3..8ca3ab4c1c820 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -141,6 +141,46 @@ def process_file(file_path): } +def aggregate_directory(input_path) -> pd.DataFrame: + # Process all output.jsonl files in subdirectories + pattern = os.path.join(input_path, '**/output.jsonl') + files = glob.glob(pattern, recursive=True) + print(f'Processing {len(files)} files from directory {input_path}') + + # Process each file silently and collect results + for file_path in files: + try: + result = process_file(file_path) + results.append(result) + except Exception as e: + print(f'Error processing {file_path}: {str(e)}') + import traceback + + traceback.print_exc() + continue + + print(f'Results written to {args.output}') + + # Convert results to pandas DataFrame and sort by resolve rate + df = pd.DataFrame(results) + + # Extract directory name from file path + df['directory'] = df['file_path'].apply( + lambda x: os.path.basename(os.path.dirname(x)) + ) + + df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) + df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) + df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) + df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) + df['error_rate'] = df['errors'].apply(lambda x: x['percentage']) + df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total']) + + df = df.sort_values('resolve_rate', ascending=False) + + return df + + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( @@ -157,42 +197,7 @@ def process_file(file_path): results = [] if os.path.isdir(args.input_path): - # Process all output.jsonl files in subdirectories - pattern = os.path.join(args.input_path, '**/output.jsonl') - files = glob.glob(pattern, recursive=True) - print(f'Processing {len(files)} files from directory {args.input_path}') - - # Process each file silently and collect results - for file_path in files: - try: - result = process_file(file_path) - results.append(result) - except Exception as e: - print(f'Error processing {file_path}: {str(e)}') - import traceback - - traceback.print_exc() - continue - - print(f'Results written to {args.output}') - - # Convert results to pandas DataFrame and sort by resolve rate - df = pd.DataFrame(results) - - # Extract directory name from file path - df['directory'] = df['file_path'].apply( - lambda x: os.path.basename(os.path.dirname(x)) - ) - - df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) - df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) - df['unfinished_rate'] = 
df['unfinished_runs'].apply(lambda x: x['percentage']) - df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) - df['error_rate'] = df['errors'].apply(lambda x: x['percentage']) - df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total']) - - df = df.sort_values('resolve_rate', ascending=False) - + df = aggregate_directory(args.input_path) # Create the summary string columns = [ 'directory', From 95f5fc0c97abd6bb4d580e965bc0aa8885674f78 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 23 Nov 2024 18:28:07 +0000 Subject: [PATCH 32/40] fix bugs --- evaluation/swe_bench/scripts/eval/summarize_outputs.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 8ca3ab4c1c820..0da6eb25b54c6 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -148,6 +148,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: print(f'Processing {len(files)} files from directory {input_path}') # Process each file silently and collect results + results = [] for file_path in files: try: result = process_file(file_path) @@ -159,8 +160,6 @@ def aggregate_directory(input_path) -> pd.DataFrame: traceback.print_exc() continue - print(f'Results written to {args.output}') - # Convert results to pandas DataFrame and sort by resolve rate df = pd.DataFrame(results) @@ -194,8 +193,6 @@ def aggregate_directory(input_path) -> pd.DataFrame: ) args = parser.parse_args() - results = [] - if os.path.isdir(args.input_path): df = aggregate_directory(args.input_path) # Create the summary string @@ -232,6 +229,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False) else: # Process single file with detailed output + results = [] try: result = process_file(args.input_path) results.append(result) From b0281d6135e590a85806b3d52e54e21b52341980 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 24 Nov 2024 22:07:51 +0000 Subject: [PATCH 33/40] add script to eval full rollout --- scripts/eval-swetrain-full-rollout.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100755 scripts/eval-swetrain-full-rollout.sh diff --git a/scripts/eval-swetrain-full-rollout.sh b/scripts/eval-swetrain-full-rollout.sh new file mode 100755 index 0000000000000..9441fb2f53aaa --- /dev/null +++ b/scripts/eval-swetrain-full-rollout.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
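For completeness, a hedged example of driving the summarizer above over a directory of runs; the run directory is illustrative, while the script path and flags come from the diff:

import subprocess

# Aggregates every **/output.jsonl under the directory and writes
# summary_results.jsonl plus .txt and .csv companions next to it.
subprocess.run(
    [
        'python',
        'evaluation/swe_bench/scripts/eval/summarize_outputs.py',
        'evaluation/evaluation_outputs',  # illustrative path to rollout outputs
        '--output', 'summary_results.jsonl',
    ],
    check=True,
)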
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT From 46425b65b8c7e6bc8311c23829b1232eb6ab51a7 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 25 Nov 2024 19:37:01 +0000 Subject: [PATCH 34/40] fix remote runtime --- openhands/llm/fn_call_converter.py | 2 +- openhands/runtime/impl/remote/remote_runtime.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index 2f8d9cbe5cc80..057401b12c2ca 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -432,7 +432,7 @@ def convert_fncall_messages_to_non_fncall_messages( tool_content = convert_tool_call_to_string(message['tool_calls'][0]) except FunctionCallConversionError as e: raise FunctionCallConversionError( - f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}' + f'Failed to convert tool call to string.\nCurrent tool call: {message["tool_calls"][0]}.\nRaw messages: {json.dumps(messages, indent=2)}' ) from e if isinstance(content, str): content += '\n\n' + tool_content diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index 4191a047b1c22..db685eb989620 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -327,13 +327,13 @@ def _wait_until_alive_impl(self): assert 'runtime_id' in runtime_data assert runtime_data['runtime_id'] == self.runtime_id assert 'pod_status' in runtime_data - pod_status = runtime_data['pod_status'] + pod_status = runtime_data['pod_status'].lower() self.log('debug', f'Pod status: {pod_status}') # FIXME: We should fix it at the backend of /start endpoint, make sure # the pod is created before returning the response. # Retry a period of time to give the cluster time to start the pod - if pod_status == 'Ready': + if pod_status == 'ready': try: with self._send_request( 'GET', @@ -349,14 +349,14 @@ def _wait_until_alive_impl(self): ) return elif ( - pod_status == 'Not Found' - or pod_status == 'Pending' - or pod_status == 'Running' + pod_status == 'not found' + or pod_status == 'pending' + or pod_status == 'running' ): # nb: Running is not yet Ready raise RuntimeNotReadyError( f'Runtime (ID={self.runtime_id}) is not yet ready. 
Status: {pod_status}' ) - elif pod_status in ('Failed', 'Unknown'): + elif pod_status in ('failed', 'unknown', 'crashloopbackoff'): # clean up the runtime self.close() raise RuntimeError( From 5bd315106b4c5660b9ec435d511de32e781ea2c8 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 25 Nov 2024 20:06:36 +0000 Subject: [PATCH 35/40] add a bunch of scripts --- scripts/eval-swebench-lite.sh | 2 +- scripts/eval-swebench-verified.sh | 17 +++++++++++++++++ scripts/rollout-eval-verified.sh | 23 +++++++++++++++++++++++ scripts/rollout-swe-train-full.sh | 2 +- 4 files changed, 42 insertions(+), 2 deletions(-) create mode 100755 scripts/eval-swebench-verified.sh create mode 100755 scripts/rollout-eval-verified.sh diff --git a/scripts/eval-swebench-lite.sh b/scripts/eval-swebench-lite.sh index 5a0172dd91174..3f6d88d910d7a 100755 --- a/scripts/eval-swebench-lite.sh +++ b/scripts/eval-swebench-lite.sh @@ -1,7 +1,7 @@ #!/bin/bash OUTPUT_FILE=$1 -NUM_WORKERS=$2 +NUM_WORKERS=${2:-64} DATASET="princeton-nlp/SWE-bench_Lite" SPLIT="test" diff --git a/scripts/eval-swebench-verified.sh b/scripts/eval-swebench-verified.sh new file mode 100755 index 0000000000000..f6909f6515c5d --- /dev/null +++ b/scripts/eval-swebench-verified.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Verified" +SPLIT="test" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-eval-verified.sh b/scripts/rollout-eval-verified.sh new file mode 100755 index 0000000000000..b46acc93d74cd --- /dev/null +++ b/scripts/rollout-eval-verified.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Verified" +SPLIT="test" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=500 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-swe-train-full.sh b/scripts/rollout-swe-train-full.sh index b07940c89269a..fcd3d0272a0b9 100755 --- a/scripts/rollout-swe-train-full.sh +++ b/scripts/rollout-swe-train-full.sh @@ -2,12 +2,12 @@ MODEL=$1 EXP_NAME=$2 # "train-t0" +N_RUNS=${3:-20} export EXP_NAME=$EXP_NAME echo "MODEL: $MODEL" echo "EXP_NAME: $EXP_NAME" DATASET="swe-train/swe-train-dev-v1" SPLIT="train.v2.success" -N_RUNS=1 if [ -z "$ALLHANDS_API_KEY" ]; then echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
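Since pod states are now compared in lower case, a condensed sketch of the resulting decision table may help; the helper name is ours, and the real method raises or retries rather than returning labels:

def classify_pod_status(raw: str) -> str:
    status = raw.lower()  # the runtime API's casing is not guaranteed, hence the fix above
    if status == 'ready':
        return 'ready'
    if status in ('not found', 'pending', 'running'):
        return 'retry'  # RuntimeNotReadyError, re-polled by the caller
    if status in ('failed', 'unknown', 'crashloopbackoff'):
        return 'fail'  # runtime is closed and a RuntimeError is raised
    return 'unexpected'  # fallback for states not shown in this diff

assert classify_pod_status('Running') == 'retry'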
From 160e99594e51440d354ed76ed7f30d35670b857e Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 26 Nov 2024 18:35:10 +0000 Subject: [PATCH 36/40] support multiple rounds for lite & verifed --- scripts/rollout-eval-lite.sh | 8 ++++++-- scripts/rollout-eval-verified.sh | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/rollout-eval-lite.sh b/scripts/rollout-eval-lite.sh index 21113f3f870c1..2d111b9910036 100755 --- a/scripts/rollout-eval-lite.sh +++ b/scripts/rollout-eval-lite.sh @@ -3,9 +3,13 @@ export EXP_NAME="t0" # use this to differentiate between different runs MODEL=$1 NUM_WORKERS=${2:-64} +N_RUNS=${3:-1} DATASET="princeton-nlp/SWE-bench_Lite" SPLIT="test" -N_RUNS=1 + +echo "MODEL: $MODEL" +echo "NUM_WORKERS: $NUM_WORKERS" +echo "N_RUNS: $N_RUNS" if [ -z "$ALLHANDS_API_KEY" ]; then echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." @@ -20,4 +24,4 @@ export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 MAX_ITER=100 -./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-eval-verified.sh b/scripts/rollout-eval-verified.sh index b46acc93d74cd..ecffc1ac108af 100755 --- a/scripts/rollout-eval-verified.sh +++ b/scripts/rollout-eval-verified.sh @@ -3,9 +3,13 @@ export EXP_NAME="t0" # use this to differentiate between different runs MODEL=$1 NUM_WORKERS=${2:-64} +N_RUNS=${3:-1} DATASET="princeton-nlp/SWE-bench_Verified" SPLIT="test" -N_RUNS=1 + +echo "MODEL: $MODEL" +echo "NUM_WORKERS: $NUM_WORKERS" +echo "N_RUNS: $N_RUNS" if [ -z "$ALLHANDS_API_KEY" ]; then echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." @@ -20,4 +24,4 @@ export EXP_NAME=$EXP_NAME EVAL_LIMIT=500 MAX_ITER=100 -./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT $N_RUNS From 57071a0c501ea8105c8b59eadd7a1f1e424a8b8d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 18 Dec 2024 04:23:22 +0000 Subject: [PATCH 37/40] fix run_infer for occasional cd error --- evaluation/swe_bench/run_infer.py | 116 +++++++++++++++++++----------- 1 file changed, 73 insertions(+), 43 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 9cb9dd77f4983..5ae044edbc3c4 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -32,7 +32,7 @@ ) from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller -from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.action import CmdRunAction, IPythonRunCellAction, MessageAction from openhands.events.observation import CmdOutputObservation, ErrorObservation from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime @@ -303,6 +303,7 @@ def initialize_runtime( def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name + n_retries: int = 5, ) -> dict[str, Any]: """Complete the runtime for the agent. 
@@ -321,55 +322,84 @@ def complete_runtime( logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}', - ) - action = CmdRunAction(command='git config --global core.pager ""') - action.timeout = 600 - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to git config --global core.pager "": {str(obs)}', - ) + if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0: + action = CmdRunAction(command='git config --global core.pager ""') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, + f'Failed to git config --global core.pager "": {str(obs)}', + ) - action = CmdRunAction(command='git add -A') - action.timeout = 600 - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to git add -A: {str(obs)}', - ) + action = CmdRunAction(command='git add -A') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, + f'Failed to git add -A: {str(obs)}', + ) - n_retries = 0 - git_patch = None - while n_retries < 5: - action = CmdRunAction( - command=f'git diff --no-color --cached {instance["base_commit"]}', - keep_prompt=False, + n_retries = 0 + git_patch = None + while n_retries < 5: + action = CmdRunAction( + command=f'git diff --no-color --cached {instance["base_commit"]}', + keep_prompt=False, + ) + action.timeout = 600 + 100 * n_retries + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + n_retries += 1 + if isinstance(obs, CmdOutputObservation): + if obs.exit_code == 0: + git_patch = obs.content.strip() + break + else: + logger.info('Failed to get git diff, retrying...') + sleep_if_should_continue(10) + elif isinstance(obs, ErrorObservation): + logger.error(f'Error occurred: {obs.content}. Retrying...') + sleep_if_should_continue(10) + else: + assert_and_raise(False, f'Unexpected observation type: {str(obs)}') + else: + logger.warning( + f'Failed to cd to /workspace/{workspace_dir_name}... 
Trying to use IPython to get git diff' ) - action.timeout = 600 + 100 * n_retries + # Git configuration and diff using IPython + cell_code = f""" + import subprocess + + def run_git_cmd(cmd): + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, cwd='/workspace/{workspace_dir_name}') + return result.stdout, result.returncode + + # Configure git + run_git_cmd('git config --global core.pager ""') + run_git_cmd('git add -A') + + # Get the diff + stdout, exit_code = run_git_cmd('git diff --no-color --cached {instance["base_commit"]}') + git_patch = stdout.strip() + """ + action = IPythonRunCellAction(code=cell_code) + action.timeout = 600 logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - n_retries += 1 - if isinstance(obs, CmdOutputObservation): - if obs.exit_code == 0: - git_patch = obs.content.strip() - break - else: - logger.info('Failed to get git diff, retrying...') - sleep_if_should_continue(10) - elif isinstance(obs, ErrorObservation): - logger.error(f'Error occurred: {obs.content}. Retrying...') - sleep_if_should_continue(10) - else: - assert_and_raise(False, f'Unexpected observation type: {str(obs)}') + + # Get the git_patch from IPython's namespace + cell_code = 'print(git_patch)' + action = IPythonRunCellAction(code=cell_code) + action.timeout = 600 + obs = runtime.run_action(action) + git_patch = obs.content.strip() assert_and_raise(git_patch is not None, 'Failed to get git diff (None)') From 1a25763a048f42c525a70a0db03ed73ba11f6e32 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Dec 2024 18:40:21 +0000 Subject: [PATCH 38/40] add standard error to swebench summarize outputs --- .../scripts/eval/summarize_outputs.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 0da6eb25b54c6..5546ce2252966 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -6,6 +6,8 @@ from collections import Counter import pandas as pd +import random +import numpy as np from openhands.events.serialization import event_from_dict from openhands.events.utils import get_pairs_from_events @@ -18,6 +20,16 @@ ] +def get_bootstrap_accuracy_std(values: float | int | bool, num_samples: int = 1000) -> float: + # Borrow from https://github.com/openai/evals/blob/cdb8ce9547e68b8e5e4520b6a162294c06865c0f/evals/metrics.py#L21 + return np.std( + [ + np.mean(random.sample(values, len(values) // 2)) + for _ in range(num_samples) + ] + ) + + def process_file(file_path): with open(file_path, 'r') as file: lines = file.readlines() @@ -26,6 +38,7 @@ def process_file(file_path): num_error_lines = 0 num_agent_stuck_in_loop = 0 num_resolved = 0 + resolved_arr = [] num_empty_patch = 0 num_unfinished_runs = 0 error_counter = Counter() @@ -74,6 +87,9 @@ def process_file(file_path): resolved = report.get('resolved', False) if resolved: num_resolved += 1 + resolved_arr.append(1) + else: + resolved_arr.append(0) # Error error = _d.get('error', None) @@ -100,6 +116,7 @@ def process_file(file_path): 'resolved': { 'count': num_resolved, 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0, + 'std': get_bootstrap_accuracy_std(resolved_arr) * 100, }, 'empty_patches': { 'count': num_empty_patch, @@ -169,6 +186,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: ) 
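The bootstrap estimate added in this patch is easy to sanity-check in isolation; a small sketch with a toy resolved/unresolved vector (values are illustrative):

import random
import numpy as np

resolved = [1, 0, 1, 1, 0, 1, 0, 1]  # toy per-instance outcomes
std = np.std(
    [np.mean(random.sample(resolved, len(resolved) // 2)) for _ in range(1000)]
)
print(f'{100 * np.mean(resolved):.1f}% ± {100 * std:.1f}%')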
df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) + df['resolve_rate_std'] = df['resolved'].apply(lambda x: x['std']) df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) @@ -237,7 +255,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: # Print detailed results for single file print(f'\nResults for {args.input_path}:') print( - f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)" + f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% ± {result['resolved']['std']:.2f}%)" ) print( f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)" From 74b32f5c1bec6f30bbb3aa3b78982b7ad25258f3 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 23 Dec 2024 04:00:36 +0000 Subject: [PATCH 39/40] add misc --- evaluation/swe_bench/eval_infer.py | 6 +++++- .../swe_bench/scripts/eval/summarize_outputs.py | 13 ++++++------- .../scripts/eval/update_output_with_eval.py | 5 ++++- evaluation/swe_bench/scripts/run_infer.sh | 6 ++++++ openhands/core/config/llm_config.py | 1 + openhands/llm/llm.py | 4 +++- 6 files changed, 25 insertions(+), 10 deletions(-) diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index d40f984fca9c8..ea650999c2be6 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -407,7 +407,11 @@ def process_instance( fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation'] def count_report_field(row, field): - return row['test_result']['report'][field] + return ( + row['test_result']['report'][field] + if 'report' in row['test_result'] + else False + ) report = {} for field in fields: diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 5546ce2252966..d76882cb2415e 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -3,11 +3,11 @@ import glob import json import os +import random from collections import Counter -import pandas as pd -import random import numpy as np +import pandas as pd from openhands.events.serialization import event_from_dict from openhands.events.utils import get_pairs_from_events @@ -20,13 +20,12 @@ ] -def get_bootstrap_accuracy_std(values: float | int | bool, num_samples: int = 1000) -> float: +def get_bootstrap_accuracy_std( + values: float | int | bool, num_samples: int = 1000 +) -> float: # Borrow from https://github.com/openai/evals/blob/cdb8ce9547e68b8e5e4520b6a162294c06865c0f/evals/metrics.py#L21 return np.std( - [ - np.mean(random.sample(values, len(values) // 2)) - for _ in range(num_samples) - ] + [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)] ) diff --git a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/swe_bench/scripts/eval/update_output_with_eval.py index 662e640ca752f..ae196ccf57ac9 100644 --- a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py +++ b/evaluation/swe_bench/scripts/eval/update_output_with_eval.py @@ -108,7 +108,10 @@ def apply_report(row): ), 'There are duplicate instance ids in the eval report which is not allowed' for _, 
row in df_eval.iterrows(): - instance_id_to_status[row['instance_id']] = row['test_result']['report'] + if 'report' in row['test_result']: + instance_id_to_status[row['instance_id']] = row['test_result']['report'] + else: + instance_id_to_status[row['instance_id']] = {} df['report'] = df.apply(apply_report, axis=1) _n_instances = len(df) diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/swe_bench/scripts/run_infer.sh index 520003635a4e9..e8c1df9f3c24c 100755 --- a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/swe_bench/scripts/run_infer.sh @@ -108,7 +108,13 @@ if [ -z "$N_RUNS" ]; then echo "N_RUNS not specified, use default $N_RUNS" fi +# SKIP_RUNS is a comma-separated list of run numbers to skip for i in $(seq 1 $N_RUNS); do + # skip if i in SKIP_RUNS + if [[ "$SKIP_RUNS" == *"$i"* ]]; then + continue + fi + current_eval_note="$EVAL_NOTE-run_$i" echo "EVAL_NOTE: $current_eval_note" run_eval $current_eval_note diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py index 477b47ccdbe12..dfcb3b62c5b9f 100644 --- a/openhands/core/config/llm_config.py +++ b/openhands/core/config/llm_config.py @@ -77,6 +77,7 @@ class LLMConfig: log_completions: bool = False log_completions_folder: str = os.path.join(LOG_DIR, 'completions') draft_editor: Optional['LLMConfig'] = None + nonfncall_mode_add_in_context_learning_example: bool = True def defaults_to_dict(self) -> dict: """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index 7712457a73ad0..42b3d34a64a69 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -186,7 +186,9 @@ def wrapper(*args, **kwargs): 'tools' in kwargs ), "'tools' must be in kwargs when mock_function_calling is True" messages = convert_fncall_messages_to_non_fncall_messages( - messages, kwargs['tools'] + messages, + kwargs['tools'], + add_in_context_learning_example=self.config.nonfncall_mode_add_in_context_learning_example, ) kwargs['messages'] = messages kwargs['stop'] = STOP_WORDS From 6d0d061f92f2410f872d62cea1e47d8f365537ec Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 23 Dec 2024 04:11:16 +0000 Subject: [PATCH 40/40] update pyproject for new fork --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7a13d28ec2196..b95c8ec8c7afb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8571,9 +8571,9 @@ inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "s [package.source] type = "git" -url = "https://github.com/DevRL-Agent/SWE-Bench.git" +url = "https://github.com/SWE-Gym/SWE-Bench-Fork.git" reference = "HEAD" -resolved_reference = "c1f15036a6ee37f0915af8ab2da2d32114739570" +resolved_reference = "e440817248cf29bee71ca9ab3c3a762177ab8f7c" [[package]] name = "sympy" @@ -10211,4 +10211,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "42d1414d8f6db6a4040c07b3ba8da850ed4fdbfd8e5cae0d55598216b309fbe2" +content-hash = "6c58d155c879a1469ba187666f010e13ec50e3a5c85c9461f3316878d0019074" diff --git a/pyproject.toml b/pyproject.toml index 0ec86829ea945..fc2aa1304c3ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,7 +132,7 @@ streamlit = "*" whatthepatch = "*" retry = "*" evaluate = "*" -swebench = { git = "https://github.com/DevRL-Agent/SWE-Bench.git" } +swebench = { git = 
"https://github.com/SWE-Gym/SWE-Bench-Fork.git" } func_timeout = "*" sympy = "*" gdown = "*"