From 9ee7ee496c1775e0d20b90c47c52aba18bb37d22 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 18:01:04 +0000 Subject: [PATCH 01/40] checkout script --- scripts/rollout-eval.sh | 26 ++++++++++++++++++++++++++ scripts/rollout-swe-train.sh | 25 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100755 scripts/rollout-eval.sh create mode 100755 scripts/rollout-swe-train.sh diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh new file mode 100755 index 0000000000000..99be840eed501 --- /dev/null +++ b/scripts/rollout-eval.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL="llm.deepseek-coder-eval-t0" # change this based on your config.toml +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + + +EVAL_LIMIT=300 +MAX_ITER=30 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-swe-train.sh b/scripts/rollout-swe-train.sh new file mode 100755 index 0000000000000..7ad03b2fca9a9 --- /dev/null +++ b/scripts/rollout-swe-train.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +export EXP_NAME="train-t0" # use this to differentiate between different runs +MODEL="llm.deepseek-coder-jiayi" # change this based on your config.toml +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v1.success" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +EVAL_LIMIT=2000 +MAX_ITER=30 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS From d1b94f02ae6b7bec2c95ce2a8efa21909a61eca2 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 18:04:32 +0000 Subject: [PATCH 02/40] update script --- scripts/{rollout-swe-train.sh => rollout-swe-train-lite.sh} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename scripts/{rollout-swe-train.sh => rollout-swe-train-lite.sh} (81%) diff --git a/scripts/rollout-swe-train.sh b/scripts/rollout-swe-train-lite.sh similarity index 81% rename from scripts/rollout-swe-train.sh rename to scripts/rollout-swe-train-lite.sh index 7ad03b2fca9a9..c4b27d46233e8 100755 --- a/scripts/rollout-swe-train.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -1,9 +1,9 @@ #!/bin/bash +MODEL=$1 export EXP_NAME="train-t0" # use this to differentiate between different runs -MODEL="llm.deepseek-coder-jiayi" # change this based on your config.toml -DATASET="swe-train/swe-train-dev-v1" -SPLIT="train.v1.success" +DATASET="swe-train/swe-train-dev-v1-lite" +SPLIT="train.v2.success" N_RUNS=1 if [ -z "$ALLHANDS_API_KEY" ]; then From 16be2b5bd3bede67dd96ddcb3584fa9642e823e8 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 18:55:36 +0000 Subject: [PATCH 03/40] make sure docker image is lower case --- evaluation/swe_bench/run_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 8b8b45a463e95..39163e9594315 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -101,7 +101,7 @@ def get_instance_docker_image(instance_id: str) -> str: image_name = image_name.replace( '__', '_s_' ) # to comply with docker image naming convention - return DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + image_name + return (DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + image_name).lower() def get_config( From 5160328ee142978e45390ae67ed7509a01e91d39 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 19:52:39 +0000 Subject: [PATCH 04/40] update swebench source --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6481fe5bafa5b..04f14d4f05768 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8469,13 +8469,13 @@ tqdm = "*" unidiff = "*" [package.extras] -inference = ["anthropic", "flash-attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tenacity", "tiktoken", "torch", "transformers", "triton"] +inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tenacity", "tiktoken", "torch", "transformers", "triton"] [package.source] type = "git" -url = "https://github.com/All-Hands-AI/SWE-bench.git" +url = "https://github.com/DevRL-Agent/SWE-Bench.git" reference = "HEAD" -resolved_reference = "c807c112edc3dcb4fdf5ddac63b34706912d5cdb" +resolved_reference = "c1f15036a6ee37f0915af8ab2da2d32114739570" [[package]] name = "sympy" @@ -10109,4 +10109,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "2b268ef696ace0d8170276407dbdeb414134477839ebe4b7ecf29b1a1fe2cef3" +content-hash = 
"793d4add995bc0bc48b02fff607a1d7bbb342d809be6d8f368c9adf44ae0e5f3" diff --git a/pyproject.toml b/pyproject.toml index a758e4dcc57d9..952a1345fc5ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -126,7 +126,7 @@ streamlit = "*" whatthepatch = "*" retry = "*" evaluate = "*" -swebench = { git = "https://github.com/All-Hands-AI/SWE-bench.git" } +swebench = { git = "https://github.com/DevRL-Agent/SWE-Bench.git" } func_timeout = "*" sympy = "*" gdown = "*" From 360caa478e5a8b569cd52ea68e60902ce602ff86 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 19:53:50 +0000 Subject: [PATCH 05/40] add script to eval rollout --- scripts/eval-lite-rollout.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100755 scripts/eval-lite-rollout.sh diff --git a/scripts/eval-lite-rollout.sh b/scripts/eval-lite-rollout.sh new file mode 100755 index 0000000000000..1f25588390621 --- /dev/null +++ b/scripts/eval-lite-rollout.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/swe-train-dev-v1-lite" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT From d83ddcab4fef3580c450f2a83ec367bb3b53ff50 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 30 Oct 2024 20:39:01 +0000 Subject: [PATCH 06/40] .lower() to fix swebench instance --- evaluation/swe_bench/eval_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index a214f4781ed9c..c4566c1bfd3df 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -239,7 +239,7 @@ def process_instance( # Create a directory structure that matches the expected format # NOTE: this is a hack to make the eval report format consistent # with the original SWE-Bench eval script - log_dir = os.path.join(temp_dir, 'logs', instance_id) + log_dir = os.path.join(temp_dir, 'logs', instance_id.lower()) os.makedirs(log_dir, exist_ok=True) test_output_path = os.path.join(log_dir, 'test_output.txt') with open(test_output_path, 'w') as f: From d33b8ea1604d1daed5fb87249f9db33f09d0a789 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 31 Oct 2024 01:08:46 +0000 Subject: [PATCH 07/40] add exp name to arg as well --- scripts/rollout-swe-train-lite.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh index c4b27d46233e8..517fd2b8f9a47 100755 --- a/scripts/rollout-swe-train-lite.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -1,7 +1,9 @@ #!/bin/bash MODEL=$1 -export EXP_NAME="train-t0" # use this to differentiate between different runs +EXP_NAME=$2 # "train-t0" +echo "MODEL: $MODEL" +echo "EXP_NAME: $EXP_NAME" DATASET="swe-train/swe-train-dev-v1-lite" SPLIT="train.v2.success" N_RUNS=1 From 0d8b4116b57fb23e1b6101ff4697de8c42dda08d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 31 Oct 2024 01:30:29 +0000 Subject: [PATCH 08/40] fix exp name export --- scripts/rollout-swe-train-lite.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh index 
517fd2b8f9a47..dca8e95ac9783 100755 --- a/scripts/rollout-swe-train-lite.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -2,6 +2,7 @@ MODEL=$1 EXP_NAME=$2 # "train-t0" +export EXP_NAME=$EXP_NAME echo "MODEL: $MODEL" echo "EXP_NAME: $EXP_NAME" DATASET="swe-train/swe-train-dev-v1-lite" From 31109ea6b13444ff43a48b5ea4e45456d4476c6a Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 31 Oct 2024 01:31:59 +0000 Subject: [PATCH 09/40] increase processes --- scripts/rollout-swe-train-lite.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh index dca8e95ac9783..f47d1b5aad1a6 100755 --- a/scripts/rollout-swe-train-lite.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -20,7 +20,7 @@ export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/sw EVAL_LIMIT=2000 MAX_ITER=30 -NUM_WORKERS=64 +NUM_WORKERS=128 ./evaluation/swe_bench/scripts/run_infer.sh \ $MODEL HEAD CodeActAgent \ From 5377356075464535ff54f1dd8d84faa0884af0ed Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 3 Nov 2024 21:07:29 +0000 Subject: [PATCH 10/40] update eval script --- scripts/rollout-eval.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index 99be840eed501..e96f73b3ef59f 100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -1,9 +1,9 @@ #!/bin/bash export EXP_NAME="t0" # use this to differentiate between different runs -MODEL="llm.deepseek-coder-eval-t0" # change this based on your config.toml -DATASET="princeton-nlp/SWE-bench_Lite" -SPLIT="test" +MODEL=$1 +DATASET="swe-train/SWE-bench_lite" +SPLIT="test100" N_RUNS=1 if [ -z "$ALLHANDS_API_KEY" ]; then @@ -20,7 +20,4 @@ EVAL_LIMIT=300 MAX_ITER=30 NUM_WORKERS=64 -./evaluation/swe_bench/scripts/run_infer.sh \ - $MODEL HEAD CodeActAgent \ - $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ - $DATASET $SPLIT $N_RUNS +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT From 873df45dc04393ce7dbe634c70081cff4f86a43d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 3 Nov 2024 21:53:58 +0000 Subject: [PATCH 11/40] update script --- scripts/rollout-eval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index e96f73b3ef59f..cbb7017c06bda 100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -14,7 +14,7 @@ fi export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" - +export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 MAX_ITER=30 From c2962c66a72139a074f987a696f6149a38defc75 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 3 Nov 2024 22:52:17 +0000 Subject: [PATCH 12/40] add llm converter from #4711 --- openhands/core/exceptions.py | 16 + openhands/llm/fn_call_converter.py | 481 ++++++++++++++++ tests/unit/test_llm_fncall_converter.py | 729 ++++++++++++++++++++++++ 3 files changed, 1226 insertions(+) create mode 100644 openhands/llm/fn_call_converter.py create mode 100644 tests/unit/test_llm_fncall_converter.py diff --git a/openhands/core/exceptions.py b/openhands/core/exceptions.py index c33297a0d2450..af545960e6d79 100644 --- a/openhands/core/exceptions.py +++ b/openhands/core/exceptions.py @@ -94,3 +94,19 @@ class CloudFlareBlockageError(Exception): """Exception raised when a request is blocked by 
CloudFlare.""" pass + + +class FunctionCallConversionError(Exception): + """Exception raised when FunctionCallingConverter failed to convert a non-function call message to a function call message. + This typically happens when there's a malformed message (e.g., missing tags). But not due to LLM output. + """ + + pass + + +class FunctionCallValidationError(Exception): + """Exception raised when FunctionCallingConverter failed to validate a function call message. + This typically happens when the LLM outputs unrecognized function call / parameter names / values. + """ + + pass diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py new file mode 100644 index 0000000000000..17e897a253596 --- /dev/null +++ b/openhands/llm/fn_call_converter.py @@ -0,0 +1,481 @@ +"""Convert function calling messages to non-function calling messages and vice versa. + +This will inject prompts so that models that doesn't support function calling +can still be used with function calling agents. + +We follow format from: https://docs.litellm.ai/docs/completion/function_call +""" + +import copy +import json +import re +from typing import Iterable + +from litellm import ChatCompletionToolParam + +from openhands.core.exceptions import ( + FunctionCallConversionError, + FunctionCallValidationError, +) + +# Inspired by: https://docs.together.ai/docs/llama-3-function-calling#function-calling-w-llama-31-70b +SYSTEM_PROMPT_SUFFIX_TEMPLATE = """ +You have access to the following functions: + +{description} + +If you choose to call a function ONLY reply in the following format with NO suffix: + + +value_1 + +This is the value for the second parameter +that can span +multiple lines + + + + +Reminder: +- Function calls MUST follow the specified format, start with +- Required parameters MUST be specified +- Only call one function at a time +- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after. +- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls +""" + +# Regex patterns for function call parsing +FN_REGEX_PATTERN = r']+)>\n(.*?)' +FN_PARAM_REGEX_PATTERN = r']+)>(.*?)' + +# Add new regex pattern for tool execution results +TOOL_RESULT_REGEX_PATTERN = r'EXECUTION RESULT of \[(.*?)\]:\n(.*)' + + +def convert_tool_call_to_string(tool_call: dict) -> str: + """Convert tool call to content in string format.""" + if 'function' not in tool_call: + raise FunctionCallConversionError("Tool call must contain 'function' key.") + if 'id' not in tool_call: + raise FunctionCallConversionError("Tool call must contain 'id' key.") + if 'type' not in tool_call: + raise FunctionCallConversionError("Tool call must contain 'type' key.") + if tool_call['type'] != 'function': + raise FunctionCallConversionError("Tool call type must be 'function'.") + + ret = f"\n" + try: + args = json.loads(tool_call['function']['arguments']) + except json.JSONDecodeError as e: + raise FunctionCallConversionError( + f"Failed to parse arguments as JSON. 
Arguments: {tool_call['function']['arguments']}" + ) from e + for param_name, param_value in args.items(): + is_multiline = isinstance(param_value, str) and '\n' in param_value + ret += f'' + if is_multiline: + ret += '\n' + ret += f'{param_value}' + if is_multiline: + ret += '\n' + ret += '\n' + ret += '' + return ret + + +def convert_tools_to_description(tools: list[dict]) -> str: + ret = '' + for i, tool in enumerate(tools): + assert tool['type'] == 'function' + fn = tool['function'] + if i > 0: + ret += '\n' + ret += f"---- BEGIN FUNCTION #{i+1}: {fn['name']} ----\n" + ret += f"Description: {fn['description']}\n" + if 'parameters' in fn: + ret += f"Parameters: {json.dumps(fn['parameters'], indent=2)}\n" + else: + ret += 'No parameters are required for this function.\n' + ret += f'---- END FUNCTION #{i+1} ----\n' + return ret + + +def convert_fncall_messages_to_non_fncall_messages( + messages: list[dict], + tools: list[ChatCompletionToolParam], +) -> list[dict]: + """Convert function calling messages to non-function calling messages.""" + messages = copy.deepcopy(messages) + + formatted_tools = convert_tools_to_description(tools) + system_prompt_suffix = SYSTEM_PROMPT_SUFFIX_TEMPLATE.format( + description=formatted_tools + ) + + converted_messages = [] + for message in messages: + role, content = message['role'], message['content'] + if content is None: + content = '' + + # 1. SYSTEM MESSAGES + # append system prompt suffix to content + if role == 'system': + if isinstance(content, str): + content += system_prompt_suffix + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + content[-1]['text'] += system_prompt_suffix + else: + content.append({'type': 'text', 'text': system_prompt_suffix}) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + converted_messages.append({'role': 'system', 'content': content}) + # 2. USER MESSAGES (no change) + elif role == 'user': + converted_messages.append(message) + + # 3. ASSISTANT MESSAGES + # - 3.1 no change if no function call + # - 3.2 change if function call + elif role == 'assistant': + if 'tool_calls' in message and message['tool_calls'] is not None: + if len(message['tool_calls']) != 1: + raise FunctionCallConversionError( + f'Expected exactly one tool call in the message. More than one tool call is not supported. But got {len(message["tool_calls"])} tool calls. Content: {content}' + ) + try: + tool_content = convert_tool_call_to_string(message['tool_calls'][0]) + except FunctionCallConversionError as e: + raise FunctionCallConversionError( + f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}' + ) from e + if isinstance(content, str): + content += '\n\n' + tool_content + content = content.lstrip() + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + content[-1]['text'] += '\n\n' + tool_content + content[-1]['text'] = content[-1]['text'].lstrip() + else: + content.append({'type': 'text', 'text': tool_content}) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + converted_messages.append({'role': 'assistant', 'content': content}) + # 4. 
TOOL MESSAGES (tool outputs) + elif role == 'tool': + # Convert tool result as assistant message + prefix = f'EXECUTION RESULT of [{message["name"]}]:\n' + # and omit "tool_call_id" AND "name" + if isinstance(content, str): + content = prefix + content + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + content[-1]['text'] = prefix + content[-1]['text'] + else: + content = [{'type': 'text', 'text': prefix}] + content + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + converted_messages.append({'role': 'user', 'content': content}) + else: + raise FunctionCallConversionError( + f'Unexpected role {role}. Expected system, user, assistant or tool.' + ) + return converted_messages + + +def _extract_and_validate_params( + matching_tool: dict, param_matches: Iterable[re.Match], fn_name: str +) -> dict: + params = {} + # Parse and validate parameters + required_params = set() + if 'parameters' in matching_tool and 'required' in matching_tool['parameters']: + required_params = set(matching_tool['parameters'].get('required', [])) + + allowed_params = set() + if 'parameters' in matching_tool and 'properties' in matching_tool['parameters']: + allowed_params = set(matching_tool['parameters']['properties'].keys()) + + param_name_to_type = {} + if 'parameters' in matching_tool and 'properties' in matching_tool['parameters']: + param_name_to_type = { + name: val.get('type', 'string') + for name, val in matching_tool['parameters']['properties'].items() + } + + # Collect parameters + found_params = set() + for param_match in param_matches: + param_name = param_match.group(1) + param_value = param_match.group(2).strip() + + # Validate parameter is allowed + if allowed_params and param_name not in allowed_params: + raise FunctionCallValidationError( + f"Parameter '{param_name}' is not allowed for function '{fn_name}'. " + f'Allowed parameters: {allowed_params}' + ) + + # Validate and convert parameter type + # supported: string, integer, array + if param_name in param_name_to_type: + if param_name_to_type[param_name] == 'integer': + try: + param_value = int(param_value) + except ValueError: + raise FunctionCallValidationError( + f"Parameter '{param_name}' is expected to be an integer." + ) + elif param_name_to_type[param_name] == 'array': + try: + param_value = json.loads(param_value) + except json.JSONDecodeError: + raise FunctionCallValidationError( + f"Parameter '{param_name}' is expected to be an array." + ) + else: + # string + pass + + # Enum check + if 'enum' in matching_tool['parameters']['properties'][param_name]: + if ( + param_value + not in matching_tool['parameters']['properties'][param_name]['enum'] + ): + raise FunctionCallValidationError( + f"Parameter '{param_name}' is expected to be one of {matching_tool['parameters']['properties'][param_name]['enum']}." 
+ ) + + params[param_name] = param_value + found_params.add(param_name) + + # Check all required parameters are present + missing_params = required_params - found_params + if missing_params: + raise FunctionCallValidationError( + f"Missing required parameters for function '{fn_name}': {missing_params}" + ) + return params + + +def convert_non_fncall_messages_to_fncall_messages( + messages: list[dict], + tools: list[ChatCompletionToolParam], +) -> list[dict]: + """Convert non-function calling messages back to function calling messages.""" + messages = copy.deepcopy(messages) + formatted_tools = convert_tools_to_description(tools) + system_prompt_suffix = SYSTEM_PROMPT_SUFFIX_TEMPLATE.format( + description=formatted_tools + ) + + converted_messages = [] + tool_call_counter = 1 # Counter for tool calls + + for message in messages: + role, content = message['role'], message['content'] + + # For system messages, remove the added suffix + if role == 'system': + if isinstance(content, str): + # Remove the suffix if present + content = content.split(system_prompt_suffix)[0] + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + # Remove the suffix from the last text item + content[-1]['text'] = content[-1]['text'].split( + system_prompt_suffix + )[0] + converted_messages.append({'role': 'system', 'content': content}) + # Skip user messages (no conversion needed) + elif role == 'user': + # Check for tool execution result pattern + if isinstance(content, str): + tool_result_match = re.search( + TOOL_RESULT_REGEX_PATTERN, content, re.DOTALL + ) + elif isinstance(content, list): + tool_result_match = next( + ( + _match + for item in content + if item.get('type') == 'text' + and ( + _match := re.search( + TOOL_RESULT_REGEX_PATTERN, item['text'], re.DOTALL + ) + ) + ), + None, + ) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + + if tool_result_match: + if not ( + isinstance(content, str) + or ( + isinstance(content, list) + and len(content) == 1 + and content[0].get('type') == 'text' + ) + ): + raise FunctionCallConversionError( + f'Expected str or list with one text item when tool result is present in the message. Content: {content}' + ) + tool_name = tool_result_match.group(1) + tool_result = tool_result_match.group(2).strip() + + # Convert to tool message format + converted_messages.append( + { + 'role': 'tool', + 'name': tool_name, + 'content': [{'type': 'text', 'text': tool_result}] + if isinstance(content, list) + else tool_result, + 'tool_call_id': f'toolu_{tool_call_counter-1:02d}', # Use last generated ID + } + ) + else: + converted_messages.append(message) + + # Handle assistant messages + elif role == 'assistant': + if isinstance(content, str): + fn_match = re.search(FN_REGEX_PATTERN, content, re.DOTALL) + elif isinstance(content, list): + if content and content[-1]['type'] == 'text': + fn_match = re.search( + FN_REGEX_PATTERN, content[-1]['text'], re.DOTALL + ) + else: + fn_match = None + fn_match_exists = any( + item.get('type') == 'text' + and re.search(FN_REGEX_PATTERN, item['text'], re.DOTALL) + for item in content + ) + if fn_match_exists and not fn_match: + raise FunctionCallConversionError( + f'Expecting function call in the LAST index of content list. But got content={content}' + ) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. 
Content: {content}'
+                )
+
+            if fn_match:
+                fn_name = fn_match.group(1)
+                fn_body = fn_match.group(2)
+                matching_tool = next(
+                    (
+                        tool['function']
+                        for tool in tools
+                        if tool['type'] == 'function'
+                        and tool['function']['name'] == fn_name
+                    ),
+                    None,
+                )
+                # Validate function exists in tools
+                if not matching_tool:
+                    raise FunctionCallValidationError(
+                        f"Function '{fn_name}' not found in available tools: {[tool['function']['name'] for tool in tools if tool['type'] == 'function']}"
+                    )
+
+                # Parse parameters
+                param_matches = re.finditer(FN_PARAM_REGEX_PATTERN, fn_body, re.DOTALL)
+                params = _extract_and_validate_params(
+                    matching_tool, param_matches, fn_name
+                )
+
+                # Create tool call with unique ID
+                tool_call_id = f'toolu_{tool_call_counter:02d}'
+                tool_call = {
+                    'index': 1,  # always 1 because we only support **one tool call per message**
+                    'id': tool_call_id,
+                    'type': 'function',
+                    'function': {'name': fn_name, 'arguments': json.dumps(params)},
+                }
+                tool_call_counter += 1  # Increment counter
+
+                # Remove the function call part from content
+                if isinstance(content, list):
+                    assert content and content[-1]['type'] == 'text'
+                    content[-1]['text'] = (
+                        content[-1]['text'].split('<function=')[0].strip()
+                    )
+                elif isinstance(content, str):
+                    content = content.split('<function=')[0].strip()
+                else:
+                    raise FunctionCallConversionError(
+                        f'Unexpected content type {type(content)}. Expected str or list. Content: {content}'
+                    )
+                converted_messages.append(
+                    {
+                        'role': 'assistant',
+                        'content': content,
+                        'tool_calls': [tool_call],
+                    }
+                )
+            else:
+                converted_messages.append(message)
+        else:
+            raise FunctionCallConversionError(
+                f'Unexpected role {role}. Expected system, user, assistant or tool.'
+            )
+    return converted_messages
+
+
+def convert_from_multiple_tool_calls_to_single_tool_call_messages(
+    messages: list[dict],
+) -> list[dict]:
+    """Break one message with multiple tool calls into multiple messages."""
+    converted_messages = []
+
+    pending_tool_calls: dict[str, dict] = {}
+    for message in messages:
+        role, content = message['role'], message['content']
+        if role == 'assistant':
+            if message.get('tool_calls') and len(message['tool_calls']) > 1:
+                # handle multiple tool calls by breaking them into multiple messages
+                for i, tool_call in enumerate(message['tool_calls']):
+                    pending_tool_calls[tool_call['id']] = {
+                        'role': 'assistant',
+                        'content': content if i == 0 else '',
+                        'tool_calls': [tool_call],
+                    }
+            else:
+                converted_messages.append(message)
+        elif role == 'tool':
+            if message['tool_call_id'] in pending_tool_calls:
+                # remove the tool call from the pending list
+                _tool_call_message = pending_tool_calls.pop(message['tool_call_id'])
+                converted_messages.append(_tool_call_message)
+                # add the tool result
+                converted_messages.append(message)
+            else:
+                assert (
+                    len(pending_tool_calls) == 0
+                ), f'Found pending tool calls but not found in pending list: {pending_tool_calls=}'
+                converted_messages.append(message)
+        else:
+            assert (
+                len(pending_tool_calls) == 0
+            ), f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}'
+            converted_messages.append(message)
+
+    if len(pending_tool_calls) > 0:
+        raise FunctionCallConversionError(
+            f'Found pending tool calls but no tool result: {pending_tool_calls=}'
+        )
+    return converted_messages
diff --git a/tests/unit/test_llm_fncall_converter.py b/tests/unit/test_llm_fncall_converter.py
new file mode 100644
index 0000000000000..9bb4d6b44aace
--- /dev/null
+++ b/tests/unit/test_llm_fncall_converter.py
@@ -0,0 +1,729 @@
+"""Test for FunctionCallingConverter."""
+
+import copy
+import json
+
+import pytest
+from litellm import ChatCompletionToolParam
+
+from openhands.llm.fn_call_converter import (
+    FunctionCallConversionError,
+    convert_fncall_messages_to_non_fncall_messages,
+    convert_from_multiple_tool_calls_to_single_tool_call_messages,
+    convert_non_fncall_messages_to_fncall_messages,
+    convert_tool_call_to_string,
+    convert_tools_to_description,
+)
+
+FNCALL_TOOLS: list[ChatCompletionToolParam] = [
+    {
+        'type': 'function',
+        'function': {
+            'name': 'execute_bash',
+            'description': 'Execute a bash command in the terminal.\n* 
Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.\n* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process.\n* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background.\n', + 'parameters': { + 'type': 'object', + 'properties': { + 'command': { + 'type': 'string', + 'description': 'The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.', + } + }, + 'required': ['command'], + }, + }, + }, + { + 'type': 'function', + 'function': { + 'name': 'finish', + 'description': 'Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.', + }, + }, + { + 'type': 'function', + 'function': { + 'name': 'str_replace_editor', + 'description': 'Custom editing tool for viewing, creating and editing files\n* State is persistent across command calls and discussions with the user\n* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep\n* The `create` command cannot be used if the specified `path` already exists as a file\n* If a `command` generates a long output, it will be truncated and marked with ``\n* The `undo_edit` command will revert the last edit made to the file at `path`\n\nNotes for using the `str_replace` command:\n* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!\n* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique\n* The `new_str` parameter should contain the edited lines that should replace the `old_str`\n', + 'parameters': { + 'type': 'object', + 'properties': { + 'command': { + 'description': 'The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.', + 'enum': [ + 'view', + 'create', + 'str_replace', + 'insert', + 'undo_edit', + ], + 'type': 'string', + }, + 'path': { + 'description': 'Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.', + 'type': 'string', + }, + 'file_text': { + 'description': 'Required parameter of `create` command, with the content of the file to be created.', + 'type': 'string', + }, + 'old_str': { + 'description': 'Required parameter of `str_replace` command containing the string in `path` to replace.', + 'type': 'string', + }, + 'new_str': { + 'description': 'Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.', + 'type': 'string', + }, + 'insert_line': { + 'description': 'Required parameter of `insert` command. 
The `new_str` will be inserted AFTER the line `insert_line` of `path`.', + 'type': 'integer', + }, + 'view_range': { + 'description': 'Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.', + 'items': {'type': 'integer'}, + 'type': 'array', + }, + }, + 'required': ['command', 'path'], + }, + }, + }, +] + + +def test_convert_tools_to_description(): + formatted_tools = convert_tools_to_description(FNCALL_TOOLS) + assert ( + formatted_tools.strip() + == """---- BEGIN FUNCTION #1: execute_bash ---- +Description: Execute a bash command in the terminal. +* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`. +* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process. +* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background. + +Parameters: { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process." + } + }, + "required": [ + "command" + ] +} +---- END FUNCTION #1 ---- + +---- BEGIN FUNCTION #2: finish ---- +Description: Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task. +No parameters are required for this function. +---- END FUNCTION #2 ---- + +---- BEGIN FUNCTION #3: str_replace_editor ---- +Description: Custom editing tool for viewing, creating and editing files +* State is persistent across command calls and discussions with the user +* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep +* The `create` command cannot be used if the specified `path` already exists as a file +* If a `command` generates a long output, it will be truncated and marked with `` +* The `undo_edit` command will revert the last edit made to the file at `path` + +Notes for using the `str_replace` command: +* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces! +* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique +* The `new_str` parameter should contain the edited lines that should replace the `old_str` + +Parameters: { + "type": "object", + "properties": { + "command": { + "description": "The commands to run. 
Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.", + "enum": [ + "view", + "create", + "str_replace", + "insert", + "undo_edit" + ], + "type": "string" + }, + "path": { + "description": "Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.", + "type": "string" + }, + "file_text": { + "description": "Required parameter of `create` command, with the content of the file to be created.", + "type": "string" + }, + "old_str": { + "description": "Required parameter of `str_replace` command containing the string in `path` to replace.", + "type": "string" + }, + "new_str": { + "description": "Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.", + "type": "string" + }, + "insert_line": { + "description": "Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.", + "type": "integer" + }, + "view_range": { + "description": "Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.", + "items": { + "type": "integer" + }, + "type": "array" + } + }, + "required": [ + "command", + "path" + ] +} +---- END FUNCTION #3 ----""".strip() + ) + + +FNCALL_MESSAGES = [ + { + 'content': [ + { + 'type': 'text', + 'text': "You are a helpful assistant that can interact with a computer to solve tasks.\n\n* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.\n\n\n", + 'cache_control': {'type': 'ephemeral'}, + } + ], + 'role': 'system', + }, + { + 'content': [ + { + 'type': 'text', + 'text': "\n/workspace/astropy__astropy__5.1\n\nI've uploaded a python code repository in the directory astropy__astropy__5.1. LONG DESCRIPTION:\n\n", + } + ], + 'role': 'user', + }, + { + 'content': [ + { + 'type': 'text', + 'text': "I'll help you implement the necessary changes to meet the requirements. Let's follow the steps:\n\n1. First, let's explore the repository structure:", + } + ], + 'role': 'assistant', + 'tool_calls': [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "ls -la /workspace/astropy__astropy__5.1"}', + 'name': 'execute_bash', + }, + 'id': 'toolu_01', + 'type': 'function', + } + ], + }, + { + 'content': [ + { + 'type': 'text', + 'text': 'ls -la /workspace/astropy__astropy__5.1\r\nls: /workspace/astropy__astropy__5.1: Bad file descriptor\r\nlrwxrwxrwx 1 root root 8 Oct 28 21:58 /workspace/astropy__astropy__5.1 -> /testbed[Python Interpreter: /opt/miniconda3/envs/testbed/bin/python]\nroot@openhands-workspace:/workspace/astropy__astropy__5.1 # \n[Command finished with exit code 0]', + } + ], + 'role': 'tool', + 'tool_call_id': 'toolu_01', + 'name': 'execute_bash', + }, + { + 'content': [ + { + 'type': 'text', + 'text': "I see there's a symlink. 
Let's explore the actual directory:", + } + ], + 'role': 'assistant', + 'tool_calls': [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "ls -la /testbed"}', + 'name': 'execute_bash', + }, + 'id': 'toolu_02', + 'type': 'function', + } + ], + }, + { + 'content': [ + { + 'type': 'text', + 'text': 'SOME OBSERVATION', + } + ], + 'role': 'tool', + 'tool_call_id': 'toolu_02', + 'name': 'execute_bash', + }, + { + 'content': [ + { + 'type': 'text', + 'text': "Let's look at the source code file mentioned in the PR description:", + } + ], + 'role': 'assistant', + 'tool_calls': [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "view", "path": "/testbed/astropy/io/fits/card.py"}', + 'name': 'str_replace_editor', + }, + 'id': 'toolu_03', + 'type': 'function', + } + ], + }, + { + 'content': [ + { + 'type': 'text', + 'text': "Here's the result of running `cat -n` on /testbed/astropy/io/fits/card.py:\n 1\t# Licensed under a 3-clause BSD style license - see PYFITS.rst...VERY LONG TEXT", + } + ], + 'role': 'tool', + 'tool_call_id': 'toolu_03', + 'name': 'str_replace_editor', + }, +] + +NON_FNCALL_MESSAGES = [ + { + 'role': 'system', + 'content': [ + { + 'type': 'text', + 'text': 'You are a helpful assistant that can interact with a computer to solve tasks.\n\n* If user provides a path, you should NOT assume it\'s relative to the current working directory. Instead, you should explore the file system to find the file before working on it.\n\n\n\nYou have access to the following functions:\n\n---- BEGIN FUNCTION #1: execute_bash ----\nDescription: Execute a bash command in the terminal.\n* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.\n* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process.\n* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background.\n\nParameters: {\n "type": "object",\n "properties": {\n "command": {\n "type": "string",\n "description": "The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process."\n }\n },\n "required": [\n "command"\n ]\n}\n---- END FUNCTION #1 ----\n\n---- BEGIN FUNCTION #2: finish ----\nDescription: Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.\nNo parameters are required for this function.\n---- END FUNCTION #2 ----\n\n---- BEGIN FUNCTION #3: str_replace_editor ----\nDescription: Custom editing tool for viewing, creating and editing files\n* State is persistent across command calls and discussions with the user\n* If `path` is a file, `view` displays the result of applying `cat -n`. 
If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep\n* The `create` command cannot be used if the specified `path` already exists as a file\n* If a `command` generates a long output, it will be truncated and marked with ``\n* The `undo_edit` command will revert the last edit made to the file at `path`\n\nNotes for using the `str_replace` command:\n* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!\n* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique\n* The `new_str` parameter should contain the edited lines that should replace the `old_str`\n\nParameters: {\n "type": "object",\n "properties": {\n "command": {\n "description": "The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.",\n "enum": [\n "view",\n "create",\n "str_replace",\n "insert",\n "undo_edit"\n ],\n "type": "string"\n },\n "path": {\n "description": "Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.",\n "type": "string"\n },\n "file_text": {\n "description": "Required parameter of `create` command, with the content of the file to be created.",\n "type": "string"\n },\n "old_str": {\n "description": "Required parameter of `str_replace` command containing the string in `path` to replace.",\n "type": "string"\n },\n "new_str": {\n "description": "Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.",\n "type": "string"\n },\n "insert_line": {\n "description": "Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.",\n "type": "integer"\n },\n "view_range": {\n "description": "Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.",\n "items": {\n "type": "integer"\n },\n "type": "array"\n }\n },\n "required": [\n "command",\n "path"\n ]\n}\n---- END FUNCTION #3 ----\n\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\nvalue_1\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format, start with \n- Required parameters MUST be specified\n- Only call one function at a time\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after.\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n', + 'cache_control': {'type': 'ephemeral'}, + } + ], + }, + { + 'content': [ + { + 'type': 'text', + 'text': "\n/workspace/astropy__astropy__5.1\n\nI've uploaded a python code repository in the directory astropy__astropy__5.1. LONG DESCRIPTION:\n\n", + } + ], + 'role': 'user', + }, + { + 'role': 'assistant', + 'content': [ + { + 'type': 'text', + 'text': "I'll help you implement the necessary changes to meet the requirements. Let's follow the steps:\n\n1. 
First, let's explore the repository structure:\n\n\nls -la /workspace/astropy__astropy__5.1\n", + } + ], + }, + { + 'role': 'user', + 'content': [ + { + 'type': 'text', + 'text': 'EXECUTION RESULT of [execute_bash]:\nls -la /workspace/astropy__astropy__5.1\r\nls: /workspace/astropy__astropy__5.1: Bad file descriptor\r\nlrwxrwxrwx 1 root root 8 Oct 28 21:58 /workspace/astropy__astropy__5.1 -> /testbed[Python Interpreter: /opt/miniconda3/envs/testbed/bin/python]\nroot@openhands-workspace:/workspace/astropy__astropy__5.1 # \n[Command finished with exit code 0]', + } + ], + }, + { + 'role': 'assistant', + 'content': [ + { + 'type': 'text', + 'text': "I see there's a symlink. Let's explore the actual directory:\n\n\nls -la /testbed\n", + } + ], + }, + { + 'role': 'user', + 'content': [ + { + 'type': 'text', + 'text': 'EXECUTION RESULT of [execute_bash]:\nSOME OBSERVATION', + } + ], + }, + { + 'role': 'assistant', + 'content': [ + { + 'type': 'text', + 'text': "Let's look at the source code file mentioned in the PR description:\n\n\nview\n/testbed/astropy/io/fits/card.py\n", + } + ], + }, + { + 'role': 'user', + 'content': [ + { + 'type': 'text', + 'text': "EXECUTION RESULT of [str_replace_editor]:\nHere's the result of running `cat -n` on /testbed/astropy/io/fits/card.py:\n 1\t# Licensed under a 3-clause BSD style license - see PYFITS.rst...VERY LONG TEXT", + } + ], + }, +] + +FNCALL_RESPONSE_MESSAGE = { + 'content': [ + { + 'type': 'text', + 'text': 'Let me search for the `_format_float` method mentioned in the PR description:', + } + ], + 'role': 'assistant', + 'tool_calls': [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "grep -n \\"_format_float\\" /testbed/astropy/io/fits/card.py"}', + 'name': 'execute_bash', + }, + 'id': 'toolu_04', + 'type': 'function', + } + ], +} + +NON_FNCALL_RESPONSE_MESSAGE = { + 'content': [ + { + 'type': 'text', + 'text': 'Let me search for the `_format_float` method mentioned in the PR description:\n\n\ngrep -n "_format_float" /testbed/astropy/io/fits/card.py\n', + } + ], + 'role': 'assistant', +} + + +@pytest.mark.parametrize( + 'tool_calls, expected', + [ + # Original test case + ( + FNCALL_RESPONSE_MESSAGE['tool_calls'], + """ +grep -n "_format_float" /testbed/astropy/io/fits/card.py +""", + ), + # Test case with multiple parameters + ( + [ + { + 'index': 1, + 'function': { + 'arguments': '{"command": "view", "path": "/test/file.py", "view_range": [1, 10]}', + 'name': 'str_replace_editor', + }, + 'id': 'test_id', + 'type': 'function', + } + ], + """ +view +/test/file.py +[1, 10] +""", + ), + ], +) +def test_convert_tool_call_to_string(tool_calls, expected): + assert len(tool_calls) == 1 + converted = convert_tool_call_to_string(tool_calls[0]) + print(converted) + assert converted == expected + + +def test_convert_fncall_messages_to_non_fncall_messages(): + converted_non_fncall = convert_fncall_messages_to_non_fncall_messages( + FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert converted_non_fncall == NON_FNCALL_MESSAGES + + +def test_convert_non_fncall_messages_to_fncall_messages(): + converted = convert_non_fncall_messages_to_fncall_messages( + NON_FNCALL_MESSAGES, FNCALL_TOOLS + ) + print(json.dumps(converted, indent=2)) + assert converted == FNCALL_MESSAGES + + +def test_two_way_conversion_nonfn_to_fn_to_nonfn(): + non_fncall_copy = copy.deepcopy(NON_FNCALL_MESSAGES) + converted_fncall = convert_non_fncall_messages_to_fncall_messages( + NON_FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert ( + non_fncall_copy == NON_FNCALL_MESSAGES + ) # make sure 
original messages are not modified + assert converted_fncall == FNCALL_MESSAGES + + fncall_copy = copy.deepcopy(FNCALL_MESSAGES) + converted_non_fncall = convert_fncall_messages_to_non_fncall_messages( + FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert ( + fncall_copy == FNCALL_MESSAGES + ) # make sure original messages are not modified + assert converted_non_fncall == NON_FNCALL_MESSAGES + + +def test_two_way_conversion_fn_to_nonfn_to_fn(): + fncall_copy = copy.deepcopy(FNCALL_MESSAGES) + converted_non_fncall = convert_fncall_messages_to_non_fncall_messages( + FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert ( + fncall_copy == FNCALL_MESSAGES + ) # make sure original messages are not modified + assert converted_non_fncall == NON_FNCALL_MESSAGES + + non_fncall_copy = copy.deepcopy(NON_FNCALL_MESSAGES) + converted_fncall = convert_non_fncall_messages_to_fncall_messages( + NON_FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert ( + non_fncall_copy == NON_FNCALL_MESSAGES + ) # make sure original messages are not modified + assert converted_fncall == FNCALL_MESSAGES + + +def test_infer_fncall_on_noncall_model(): + messages_for_llm_inference = convert_fncall_messages_to_non_fncall_messages( + FNCALL_MESSAGES, FNCALL_TOOLS + ) + assert messages_for_llm_inference == NON_FNCALL_MESSAGES + # Mock LLM inference + response_message_from_llm_inference = NON_FNCALL_RESPONSE_MESSAGE + + # Convert back to fncall messages to hand back to the agent + # so agent is model-agnostic + all_nonfncall_messages = NON_FNCALL_MESSAGES + [response_message_from_llm_inference] + converted_fncall_messages = convert_non_fncall_messages_to_fncall_messages( + all_nonfncall_messages, FNCALL_TOOLS + ) + assert converted_fncall_messages == FNCALL_MESSAGES + [FNCALL_RESPONSE_MESSAGE] + assert converted_fncall_messages[-1] == FNCALL_RESPONSE_MESSAGE + + +def test_convert_from_multiple_tool_calls_to_single_tool_call_messages(): + # Test case with multiple tool calls in one message + input_messages = [ + { + 'role': 'assistant', + 'content': 'Let me help you with that.', + 'tool_calls': [ + { + 'id': 'call1', + 'type': 'function', + 'function': {'name': 'func1', 'arguments': '{}'}, + }, + { + 'id': 'call2', + 'type': 'function', + 'function': {'name': 'func2', 'arguments': '{}'}, + }, + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call1', + 'content': 'Result 1', + 'name': 'func1', + }, + { + 'role': 'tool', + 'tool_call_id': 'call2', + 'content': 'Result 2', + 'name': 'func2', + }, + { + 'role': 'assistant', + 'content': 'Test again', + 'tool_calls': [ + { + 'id': 'call3', + 'type': 'function', + 'function': {'name': 'func3', 'arguments': '{}'}, + }, + { + 'id': 'call4', + 'type': 'function', + 'function': {'name': 'func4', 'arguments': '{}'}, + }, + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call3', + 'content': 'Result 3', + 'name': 'func3', + }, + { + 'role': 'tool', + 'tool_call_id': 'call4', + 'content': 'Result 4', + 'name': 'func4', + }, + ] + + expected_output = [ + { + 'role': 'assistant', + 'content': 'Let me help you with that.', + 'tool_calls': [ + { + 'id': 'call1', + 'type': 'function', + 'function': {'name': 'func1', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call1', + 'content': 'Result 1', + 'name': 'func1', + }, + { + 'role': 'assistant', + 'content': '', + 'tool_calls': [ + { + 'id': 'call2', + 'type': 'function', + 'function': {'name': 'func2', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call2', + 'content': 'Result 2', + 'name': 'func2', + }, + { + 'role': 
'assistant', + 'content': 'Test again', + 'tool_calls': [ + { + 'id': 'call3', + 'type': 'function', + 'function': {'name': 'func3', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call3', + 'content': 'Result 3', + 'name': 'func3', + }, + { + 'role': 'assistant', + 'content': '', + 'tool_calls': [ + { + 'id': 'call4', + 'type': 'function', + 'function': {'name': 'func4', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call4', + 'content': 'Result 4', + 'name': 'func4', + }, + ] + + result = convert_from_multiple_tool_calls_to_single_tool_call_messages( + input_messages + ) + assert result == expected_output + + +def test_convert_from_multiple_tool_calls_to_single_tool_call_messages_incomplete(): + # Test case with multiple tool calls in one message + input_messages = [ + { + 'role': 'assistant', + 'content': 'Let me help you with that.', + 'tool_calls': [ + { + 'id': 'call1', + 'type': 'function', + 'function': {'name': 'func1', 'arguments': '{}'}, + }, + { + 'id': 'call2', + 'type': 'function', + 'function': {'name': 'func2', 'arguments': '{}'}, + }, + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call1', + 'content': 'Result 1', + 'name': 'func1', + }, + ] + + with pytest.raises(FunctionCallConversionError): + convert_from_multiple_tool_calls_to_single_tool_call_messages(input_messages) + + +def test_convert_from_multiple_tool_calls_no_changes_needed(): + # Test case where no conversion is needed (single tool call) + input_messages = [ + { + 'role': 'assistant', + 'content': 'Let me help you with that.', + 'tool_calls': [ + { + 'id': 'call1', + 'type': 'function', + 'function': {'name': 'func1', 'arguments': '{}'}, + } + ], + }, + { + 'role': 'tool', + 'tool_call_id': 'call1', + 'content': 'Result 1', + 'name': 'func1', + }, + ] + + result = convert_from_multiple_tool_calls_to_single_tool_call_messages( + input_messages + ) + assert result == input_messages + + +def test_convert_from_multiple_tool_calls_no_tool_calls(): + # Test case with no tool calls + input_messages = [ + {'role': 'user', 'content': 'Hello'}, + {'role': 'assistant', 'content': 'Hi there!'}, + ] + + result = convert_from_multiple_tool_calls_to_single_tool_call_messages( + input_messages + ) + assert result == input_messages From 467fae900693830128719135130deff054191d02 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 3 Nov 2024 22:54:34 +0000 Subject: [PATCH 13/40] add ignore final tool result flag --- openhands/llm/fn_call_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index 17e897a253596..ca0fcff9eedd3 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -438,6 +438,7 @@ def convert_non_fncall_messages_to_fncall_messages( def convert_from_multiple_tool_calls_to_single_tool_call_messages( messages: list[dict], + ignore_final_tool_result: bool = False, ) -> list[dict]: """Break one message with multiple tool calls into multiple messages.""" converted_messages = [] @@ -474,7 +475,7 @@ def convert_from_multiple_tool_calls_to_single_tool_call_messages( ), f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}' converted_messages.append(message) - if len(pending_tool_calls) > 0: + if not ignore_final_tool_result and len(pending_tool_calls) > 0: raise FunctionCallConversionError( f'Found pending tool calls but no tool result: {pending_tool_calls=}' ) From 
f31adac07a64cfa0f9f11199ae027d00a5bc83aa Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 00:13:48 +0000 Subject: [PATCH 14/40] support setting hard timeout per evaluation instance --- evaluation/swe_bench/run_infer.py | 7 +++- evaluation/utils/shared.py | 59 +++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 39163e9594315..405d76cf4f00b 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -527,5 +527,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: instances[col] = instances[col].apply(lambda x: str(x)) run_evaluation( - instances, metadata, output_file, args.eval_num_workers, process_instance + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + timeout_seconds=60 * 60, # 1 hour PER instance should be more than enough ) diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index b8d2ad281ad6d..fde36ce65d5c2 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -3,9 +3,11 @@ import multiprocessing as mp import os import pathlib +import signal import subprocess import time import traceback +from contextlib import contextmanager from typing import Any, Awaitable, Callable, TextIO import pandas as pd @@ -89,6 +91,27 @@ class EvalException(Exception): pass +class EvalTimeoutException(Exception): + pass + + +@contextmanager +def timeout(seconds: int): + def timeout_handler(signum, frame): + raise EvalTimeoutException(f'Function timed out after {seconds} seconds') + + # Set up the signal handler + original_handler = signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(seconds) + + try: + yield + finally: + # Restore the original handler and disable the alarm + signal.alarm(0) + signal.signal(signal.SIGALRM, original_handler) + + def codeact_user_response( state: State, encapsulate_solution: bool = False, @@ -270,15 +293,31 @@ def _process_instance_wrapper( metadata: EvalMetadata, use_mp: bool, max_retries: int = 5, + timeout_seconds: int | None = None, ) -> EvalOutput: - """Wrap the process_instance_func to handle retries and errors. - - Retry an instance up to max_retries times if it fails (e.g., due to transient network/runtime issues). - """ + """Wrap the process_instance_func to handle retries and errors.""" for attempt in range(max_retries + 1): try: - result = process_instance_func(instance, metadata, use_mp) + if timeout_seconds is not None: + with timeout(timeout_seconds): + result = process_instance_func(instance, metadata, use_mp) + else: + result = process_instance_func(instance, metadata, use_mp) return result + except EvalTimeoutException as e: + error = f'Timeout after {timeout_seconds} seconds' + stacktrace = traceback.format_exc() + msg = ( + '-' * 10 + + '\n' + + f'Timeout ({timeout_seconds} seconds) in instance [{instance.instance_id}], Stopped evaluation for this instance.' 
+ + '\n' + + '-' * 10 + ) + logger.exception(e) + raise RuntimeError( + f'Maximum timeout retries reached for instance {instance.instance_id}' + ) from e except Exception as e: error = str(e) stacktrace = traceback.format_exc() @@ -327,6 +366,7 @@ def run_evaluation( [pd.Series, EvalMetadata, bool], Awaitable[EvalOutput] ], max_retries: int = 5, # number of retries for each instance + timeout_seconds: int | None = None, ): use_multiprocessing = num_workers > 1 @@ -346,7 +386,14 @@ def run_evaluation( if use_multiprocessing: with mp.Pool(num_workers) as pool: args_iter = ( - (process_instance_func, instance, metadata, True, max_retries) + ( + process_instance_func, + instance, + metadata, + True, + max_retries, + timeout_seconds, + ) for _, instance in dataset.iterrows() ) results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter) From e6634102be26c4687a4c697e9a4b4edd76d7adbc Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 02:52:20 +0000 Subject: [PATCH 15/40] return stuff for timeout --- evaluation/utils/shared.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index fde36ce65d5c2..5c3d9864ca6c9 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -315,9 +315,11 @@ def _process_instance_wrapper( + '-' * 10 ) logger.exception(e) - raise RuntimeError( - f'Maximum timeout retries reached for instance {instance.instance_id}' - ) from e + return EvalOutput( + instance_id=instance.instance_id, + test_result={}, + error=error, + ) except Exception as e: error = str(e) stacktrace = traceback.format_exc() From 472f30ec909ee4c512abe044fddf2def7b197ad0 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 03:50:53 +0000 Subject: [PATCH 16/40] update script --- scripts/eval-swebench-lite100.sh | 17 +++++++++++++++++ ...rollout.sh => eval-swetrain-lite-rollout.sh} | 0 2 files changed, 17 insertions(+) create mode 100755 scripts/eval-swebench-lite100.sh rename scripts/{eval-lite-rollout.sh => eval-swetrain-lite-rollout.sh} (100%) diff --git a/scripts/eval-swebench-lite100.sh b/scripts/eval-swebench-lite100.sh new file mode 100755 index 0000000000000..b2842f7f7b68a --- /dev/null +++ b/scripts/eval-swebench-lite100.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/SWE-bench_lite" +SPLIT="test100" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
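As a sanity check, the SIGALRM guard introduced in this patch can be exercised on its own. A minimal, self-contained sketch follows; the sleeping call merely stands in for a slow evaluation instance, and signal.alarm is Unix-only and must run in the main thread:

import signal
import time
from contextlib import contextmanager

class EvalTimeoutException(Exception):
    pass

@contextmanager
def timeout(seconds: int):
    def handler(signum, frame):
        raise EvalTimeoutException(f'Function timed out after {seconds} seconds')
    original_handler = signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, original_handler)  # restore the previous handler

try:
    with timeout(2):
        time.sleep(5)  # stand-in for process_instance_func
except EvalTimeoutException as e:
    print(e)  # Function timed out after 2 seconds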
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-lite-rollout.sh b/scripts/eval-swetrain-lite-rollout.sh similarity index 100% rename from scripts/eval-lite-rollout.sh rename to scripts/eval-swetrain-lite-rollout.sh From c1c46b671ebfa211c1cc9f35e6c0ce0d6ab4fcd7 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 03:59:45 +0000 Subject: [PATCH 17/40] bump max turn to 50 --- scripts/rollout-eval.sh | 2 +- scripts/rollout-swe-train-full.sh | 28 ++++++++++++++++++++++++++++ scripts/rollout-swe-train-lite.sh | 4 ++-- 3 files changed, 31 insertions(+), 3 deletions(-) create mode 100755 scripts/rollout-swe-train-full.sh diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index cbb7017c06bda..f98006a347d0c 100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -17,7 +17,7 @@ export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/sw export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 -MAX_ITER=30 +MAX_ITER=50 NUM_WORKERS=64 ./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-swe-train-full.sh b/scripts/rollout-swe-train-full.sh new file mode 100755 index 0000000000000..5a0ec7038d925 --- /dev/null +++ b/scripts/rollout-swe-train-full.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +MODEL=$1 +EXP_NAME=$2 # "train-t0" +export EXP_NAME=$EXP_NAME +echo "MODEL: $MODEL" +echo "EXP_NAME: $EXP_NAME" +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v2.success" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +EVAL_LIMIT=2000 +MAX_ITER=50 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh index f47d1b5aad1a6..77173ade2b30d 100755 --- a/scripts/rollout-swe-train-lite.sh +++ b/scripts/rollout-swe-train-lite.sh @@ -19,8 +19,8 @@ export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" EVAL_LIMIT=2000 -MAX_ITER=30 -NUM_WORKERS=128 +MAX_ITER=50 +NUM_WORKERS=64 ./evaluation/swe_bench/scripts/run_infer.sh \ $MODEL HEAD CodeActAgent \ From a50cafe9d26556bc865e39d4628446014cafa986 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 4 Nov 2024 04:07:27 +0000 Subject: [PATCH 18/40] change limit --- scripts/rollout-swe-train-full.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-swe-train-full.sh b/scripts/rollout-swe-train-full.sh index 5a0ec7038d925..b07940c89269a 100755 --- a/scripts/rollout-swe-train-full.sh +++ b/scripts/rollout-swe-train-full.sh @@ -18,7 +18,7 @@ export RUNTIME=remote export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" -EVAL_LIMIT=2000 +EVAL_LIMIT=3000 MAX_ITER=50 NUM_WORKERS=64 From faae58004cadcf6b6146f5fbfd51ab3f6eed0b46 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 11 Nov 2024 23:50:20 +0000 Subject: [PATCH 19/40] get more-to-update fncall converter --- openhands/llm/fn_call_converter.py | 321 ++++++++++++++++++++++++++++- 1 file changed, 317 insertions(+), 4 deletions(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index ca0fcff9eedd3..b63608df45379 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -44,6 +44,188 @@ - If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls """ +STOP_WORDS = [' + +pwd && ls + + + +USER: EXECUTION RESULT of [execute_bash]: +/workspace +openhands@runtime:~/workspace$ + +ASSISTANT: There is no `app.py` file in the current directory. Let me create a Python file `app.py`: + +create +/workspace/app.py + +from flask import Flask +app = Flask(__name__) + +@app.route('/') +def index(): + numbers = list(range(1, 11)) + return str(numbers) + +if __name__ == '__main__': + app.run(port=5000) + + + +USER: EXECUTION RESULT of [str_replace_editor]: +File created successfully at: /workspace/app.py + +ASSISTANT: I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you: + + +python3 app.py > server.log 2>&1 & + + + +USER: EXECUTION RESULT of [execute_bash]: +[1] 121 +[1]+ Exit 1 python3 app.py > server.log 2>&1 + +ASSISTANT: Looks like the server is running with PID 121 then crashed. 
Let me check the server log: + + +cat server.log + + + +USER: EXECUTION RESULT of [execute_bash]: +Traceback (most recent call last): + File "/workspace/app.py", line 2, in + from flask import Flask +ModuleNotFoundError: No module named 'flask' + +ASSISTANT: Looks like the server crashed because the `flask` module is not installed. Let me install the `flask` module for you: + + +pip3 install flask + + + +USER: EXECUTION RESULT of [execute_bash]: +Defaulting to user installation because normal site-packages is not writeable +Collecting flask + Using cached flask-3.0.3-py3-none-any.whl (101 kB) +Collecting blinker>=1.6.2 + Using cached blinker-1.7.0-py3-none-any.whl (13 kB) +Collecting Werkzeug>=3.0.0 + Using cached werkzeug-3.0.2-py3-none-any.whl (226 kB) +Collecting click>=8.1.3 + Using cached click-8.1.7-py3-none-any.whl (97 kB) +Collecting itsdangerous>=2.1.2 + Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB) +Requirement already satisfied: Jinja2>=3.1.2 in /home/openhands/.local/lib/python3.10/site-packages (from flask) (3.1.3) +Requirement already satisfied: MarkupSafe>=2.0 in /home/openhands/.local/lib/python3.10/site-packages (from Jinja2>=3.1.2->flask) (2.1.5) +Installing collected packages: Werkzeug, itsdangerous, click, blinker, flask +Successfully installed Werkzeug-3.0.2 blinker-1.7.0 click-8.1.7 flask-3.0.3 itsdangerous-2.2.0 + +ASSISTANT: The `flask` module is installed successfully. Let me run the Python file again: + + +python3 app.py > server.log 2>&1 & + + + +USER: EXECUTION RESULT of [execute_bash]: +[1] 124 + +ASSISTANT: +Let me check the server log again: + + +cat server.log + + + +USER: EXECUTION RESULT of [execute_bash]: +* Serving Flask app 'app' + * Debug mode: off +WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead. + * Running on http://127.0.0.1:5000 +Press CTRL+C to quit + +ASSISTANT: +The server is running on port 5000 with PID 124. You can access the list of numbers by visiting http://127.0.0.1:5000. If you have any further questions, feel free to ask! + +USER: Now kill the server, make it display the numbers in a table format. + +ASSISTANT: +Sure! Let me stop the server first: + + +kill 124 + + + +USER: EXECUTION RESULT of [execute_bash]: +[1]+ Terminated python3 app.py > server.log 2>&1 + +ASSISTANT: +Now let me display the numbers in a table format: + +str_replace +/workspace/app.py +return str(numbers) +return '' + ''.join([f'' for i in numbers]) + '
{i}
' + + +USER: EXECUTION RESULT of [str_replace_editor]: +The file /workspace/app.py has been edited. Here's the result of running `cat -n` on a snippet of /workspace/app.py: + 3 + 4 @app.route('/') + 5 def index(): + 6 numbers = list(range(1, 11)) + 7 return '' + ''.join([f'' for i in numbers]) + '
{i}
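(The HTML markup in the example snippet above did not survive extraction; the edited line was presumably equivalent to the following, shown standalone so it can be run directly:)

numbers = list(range(1, 11))
html = '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
print(html)  # a one-column HTML table of the numbers 1 through 10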
' + 8 + 9 if __name__ == '__main__': + 10 app.run(port=5000) +Review the changes and make sure they are as expected. Edit the file again if necessary. + +ASSISTANT: +Running the updated file: + + + +python3 app.py > server.log 2>&1 & + + + +USER: EXECUTION RESULT of [execute_bash]: +[1] 126 + +ASSISTANT: +The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. Let me know if you have any further requests! + + + +--------------------- END OF EXAMPLE --------------------- + +Do NOT assume the environment is the same as in the example above. + +--------------------- NEW TASK DESCRIPTION --------------------- +""".lstrip() + +IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX = """ +--------------------- END OF NEW TASK DESCRIPTION --------------------- + +PLEASE follow the format strictly! PLEASE EMIT ONE AND ONLY ONE FUNCTION CALL PER MESSAGE. +""".lstrip() + # Regex patterns for function call parsing FN_REGEX_PATTERN = r']+)>\n(.*?)' FN_PARAM_REGEX_PATTERN = r']+)>(.*?)' @@ -92,10 +274,32 @@ def convert_tools_to_description(tools: list[dict]) -> str: ret += '\n' ret += f"---- BEGIN FUNCTION #{i+1}: {fn['name']} ----\n" ret += f"Description: {fn['description']}\n" + if 'parameters' in fn: - ret += f"Parameters: {json.dumps(fn['parameters'], indent=2)}\n" + ret += 'Parameters:\n' + properties = fn['parameters'].get('properties', {}) + required_params = set(fn['parameters'].get('required', [])) + + for j, (param_name, param_info) in enumerate(properties.items()): + # Indicate required/optional in parentheses with type + is_required = param_name in required_params + param_status = 'required' if is_required else 'optional' + param_type = param_info.get('type', 'string') + + # Get parameter description + desc = param_info.get('description', 'No description provided') + + # Handle enum values if present + if 'enum' in param_info: + enum_values = ', '.join(f'`{v}`' for v in param_info['enum']) + desc += f'\nAllowed values: [{enum_values}]' + + ret += ( + f' ({j+1}) {param_name} ({param_type}, {param_status}): {desc}\n' + ) else: ret += 'No parameters are required for this function.\n' + ret += f'---- END FUNCTION #{i+1} ----\n' return ret @@ -113,6 +317,7 @@ def convert_fncall_messages_to_non_fncall_messages( ) converted_messages = [] + first_user_message_encountered = False for message in messages: role, content = message['role'], message['content'] if content is None: @@ -135,7 +340,82 @@ def convert_fncall_messages_to_non_fncall_messages( converted_messages.append({'role': 'system', 'content': content}) # 2. 
USER MESSAGES (no change) elif role == 'user': - converted_messages.append(message) + # Add in-context learning example for the first user message + if not first_user_message_encountered: + first_user_message_encountered = True + # Check tools + if not ( + tools + and len(tools) > 0 + and any( + ( + tool['type'] == 'function' + and tool['function']['name'] == 'execute_bash' + and 'command' + in tool['function']['parameters']['properties'] + ) + for tool in tools + ) + and any( + ( + tool['type'] == 'function' + and tool['function']['name'] == 'str_replace_editor' + and 'path' in tool['function']['parameters']['properties'] + and 'file_text' + in tool['function']['parameters']['properties'] + and 'old_str' + in tool['function']['parameters']['properties'] + and 'new_str' + in tool['function']['parameters']['properties'] + ) + for tool in tools + ) + ): + raise FunctionCallConversionError( + 'The currently provided tool set are NOT compatible with the in-context learning example for FnCall to Non-FnCall conversion. ' + 'Please update your tool set OR the in-context learning example in openhands/llm/fn_call_converter.py' + ) + + # add in-context learning example + if isinstance(content, str): + content = ( + IN_CONTEXT_LEARNING_EXAMPLE_PREFIX + + content + + IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX + ) + elif isinstance(content, list): + if content and content[0]['type'] == 'text': + content[0]['text'] = ( + IN_CONTEXT_LEARNING_EXAMPLE_PREFIX + + content[0]['text'] + + IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX + ) + else: + content = ( + [ + { + 'type': 'text', + 'text': IN_CONTEXT_LEARNING_EXAMPLE_PREFIX, + } + ] + + content + + [ + { + 'type': 'text', + 'text': IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX, + } + ] + ) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + converted_messages.append( + { + 'role': 'user', + 'content': content, + } + ) # 3. 
ASSISTANT MESSAGES # - 3.1 no change if no function call @@ -266,6 +546,16 @@ def _extract_and_validate_params( return params +def _fix_stopword(content: str) -> str: + """Fix the issue when some LLM would NOT return the stopword.""" + if '' + else: + content = content + '\n' + return content + + def convert_non_fncall_messages_to_fncall_messages( messages: list[dict], tools: list[ChatCompletionToolParam], @@ -280,9 +570,10 @@ def convert_non_fncall_messages_to_fncall_messages( converted_messages = [] tool_call_counter = 1 # Counter for tool calls + first_user_message_encountered = False for message in messages: role, content = message['role'], message['content'] - + content = content or '' # handle cases where content is None # For system messages, remove the added suffix if role == 'system': if isinstance(content, str): @@ -297,6 +588,26 @@ def convert_non_fncall_messages_to_fncall_messages( converted_messages.append({'role': 'system', 'content': content}) # Skip user messages (no conversion needed) elif role == 'user': + # Check & replace in-context learning example + if not first_user_message_encountered: + first_user_message_encountered = True + if isinstance(content, str): + content = content.replace(IN_CONTEXT_LEARNING_EXAMPLE_PREFIX, '') + content = content.replace(IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX, '') + elif isinstance(content, list): + for item in content: + if item['type'] == 'text': + item['text'] = item['text'].replace( + IN_CONTEXT_LEARNING_EXAMPLE_PREFIX, '' + ) + item['text'] = item['text'].replace( + IN_CONTEXT_LEARNING_EXAMPLE_SUFFIX, '' + ) + else: + raise FunctionCallConversionError( + f'Unexpected content type {type(content)}. Expected str or list. Content: {content}' + ) + # Check for tool execution result pattern if isinstance(content, str): tool_result_match = re.search( @@ -348,14 +659,16 @@ def convert_non_fncall_messages_to_fncall_messages( } ) else: - converted_messages.append(message) + converted_messages.append({'role': 'user', 'content': content}) # Handle assistant messages elif role == 'assistant': if isinstance(content, str): + content = _fix_stopword(content) fn_match = re.search(FN_REGEX_PATTERN, content, re.DOTALL) elif isinstance(content, list): if content and content[-1]['type'] == 'text': + content[-1]['text'] = _fix_stopword(content[-1]['text']) fn_match = re.search( FN_REGEX_PATTERN, content[-1]['text'], re.DOTALL ) From 226588a5d36d18a6c48d6550df95a8e873875178 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 12 Nov 2024 00:00:22 +0000 Subject: [PATCH 20/40] optionally add icl --- openhands/llm/fn_call_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index b63608df45379..5ddbcb3050646 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -307,6 +307,7 @@ def convert_tools_to_description(tools: list[dict]) -> str: def convert_fncall_messages_to_non_fncall_messages( messages: list[dict], tools: list[ChatCompletionToolParam], + add_in_context_learning_example: bool = True, ) -> list[dict]: """Convert function calling messages to non-function calling messages.""" messages = copy.deepcopy(messages) @@ -341,7 +342,7 @@ def convert_fncall_messages_to_non_fncall_messages( # 2. 
USER MESSAGES (no change) elif role == 'user': # Add in-context learning example for the first user message - if not first_user_message_encountered: + if not first_user_message_encountered and add_in_context_learning_example: first_user_message_encountered = True # Check tools if not ( From cb6241593780e6eb83c58cc55fd30babafdab4f4 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 15 Nov 2024 05:08:58 +0000 Subject: [PATCH 21/40] fix(swebench): handle error in eval_infer annd --- evaluation/swe_bench/eval_infer.py | 42 +++++++++++++++++------------- evaluation/swe_bench/run_infer.py | 2 +- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index c4566c1bfd3df..5af302c6e6b31 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -244,23 +244,29 @@ def process_instance( test_output_path = os.path.join(log_dir, 'test_output.txt') with open(test_output_path, 'w') as f: f.write(test_output) - - _report = get_eval_report( - test_spec=test_spec, - prediction={ - 'model_patch': model_patch, - 'instance_id': instance_id, - }, - log_path=test_output_path, - include_tests_status=True, - ) - report = _report[instance_id] - logger.info( - f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}" - ) - instance['test_result']['report']['resolved'] = report[ - 'resolved' - ] + try: + _report = get_eval_report( + test_spec=test_spec, + prediction={ + 'model_patch': model_patch, + 'instance_id': instance_id, + }, + log_path=test_output_path, + include_tests_status=True, + ) + report = _report[instance_id] + logger.info( + f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}" + ) + instance['test_result']['report']['resolved'] = report[ + 'resolved' + ] + except Exception as e: + logger.error( + f'[{instance_id}] Error when getting eval report: {e}' + ) + instance['test_result']['report']['resolved'] = False + instance['test_result']['report']['error_eval'] = True else: logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}') instance['test_result']['report']['error_eval'] = True @@ -335,7 +341,7 @@ def process_instance( if 'model_patch' not in predictions.columns: predictions['model_patch'] = predictions['test_result'].apply( - lambda x: x['git_patch'] + lambda x: x.get('git_patch', '') ) assert {'instance_id', 'model_patch'}.issubset( set(predictions.columns) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 405d76cf4f00b..86334bed56bb0 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -532,5 +532,5 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: output_file, args.eval_num_workers, process_instance, - timeout_seconds=60 * 60, # 1 hour PER instance should be more than enough + timeout_seconds=120 * 60, # 2 hour PER instance should be more than enough ) From e0a50e6b5e9b15eb3f368564314302fac70c5e7c Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 15 Nov 2024 05:17:49 +0000 Subject: [PATCH 22/40] fix arg --- evaluation/swe_bench/run_infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 386c0dd192383..9cb9dd77f4983 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -145,7 +145,7 @@ def get_config( platform='linux/amd64', 
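A note on the converter changes above: the angle-bracket markup being parsed, along with the stop word in STOP_WORDS, was stripped during extraction. A hedged reconstruction of the intended patterns, inferred from the surviving regex fragments, with a small round trip (the exact tag names may differ in detail from the source):

import re

# Presumed originals of the garbled constants above; STOP_WORDS would be ['</function'].
FN_REGEX_PATTERN = r'<function=([^>]+)>\n(.*?)</function>'
FN_PARAM_REGEX_PATTERN = r'<parameter=([^>]+)>(.*?)</parameter>'

example = (
    '<function=execute_bash>\n'
    '<parameter=command>\npwd && ls\n</parameter>\n'
    '</function>'
)
fn = re.search(FN_REGEX_PATTERN, example, re.DOTALL)
print(fn.group(1))  # execute_bash
for name, value in re.findall(FN_PARAM_REGEX_PATTERN, fn.group(2), re.DOTALL):
    print(name, value.strip())  # command pwd && ls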
api_key=os.environ.get('ALLHANDS_API_KEY', None), remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_remote_runtime_alive=False, + keep_runtime_alive=False, remote_runtime_init_timeout=3600, ), # do not mount workspace From d2d8bf5023256754c172f0f11d24dc73bb2dda03 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 15 Nov 2024 15:15:44 +0000 Subject: [PATCH 23/40] fix for empty content --- openhands/llm/fn_call_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index 5ddbcb3050646..2f8d9cbe5cc80 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -320,7 +320,8 @@ def convert_fncall_messages_to_non_fncall_messages( converted_messages = [] first_user_message_encountered = False for message in messages: - role, content = message['role'], message['content'] + role = message['role'] + content = message.get('content', '') if content is None: content = '' From c3d9ec2b248ee79fbca7e18f487f871722876cb5 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 15 Nov 2024 15:17:22 +0000 Subject: [PATCH 24/40] remove extra --- openhands/llm/llm.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index 0590945995c1e..7712457a73ad0 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -121,7 +121,9 @@ def __init__( top_p=self.config.top_p, drop_params=self.config.drop_params, ) - + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + self.init_model_info() if self.vision_is_active(): logger.debug('LLM: model has vision enabled') if self.is_caching_prompt_active(): @@ -143,16 +145,6 @@ def __init__( drop_params=self.config.drop_params, ) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - self.init_model_info() - if self.vision_is_active(): - logger.debug('LLM: model has vision enabled') - if self.is_caching_prompt_active(): - logger.debug('LLM: caching prompt enabled') - if self.is_function_calling_active(): - logger.debug('LLM: model supports function calling') - self._completion_unwrapped = self._completion @self.retry_decorator( From a25d05de9c09173e3c72f9eb47f3ad6559f9b5ef Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 21 Nov 2024 20:03:31 +0000 Subject: [PATCH 25/40] update summarizer script to accept folder --- .../scripts/eval/summarize_outputs.py | 211 +++++++++++++++--- 1 file changed, 179 insertions(+), 32 deletions(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 5d5dbbf2a3bd5..739f26d10eb3a 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -1,8 +1,12 @@ #!/usr/bin/env python3 import argparse +import glob import json +import os from collections import Counter +import pandas as pd + from openhands.events.serialization import event_from_dict from openhands.events.utils import get_pairs_from_events @@ -12,23 +16,18 @@ 'Action execution failed', ] -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('output_file', type=str, help='The file to summarize') - args = parser.parse_args() - with open(args.output_file, 'r') as file: +def process_file(file_path): + with open(file_path, 'r') as file: lines = file.readlines() num_lines = len(lines) num_error_lines = 0 num_agent_stuck_in_loop = 0 - num_resolved = 0 
num_empty_patch = 0 - + num_unfinished_runs = 0 error_counter = Counter() - main_agent_cost = [] editor_cost = [] num_turns = [] @@ -36,6 +35,11 @@ for line in lines: _d = json.loads(line) + if 'metrics' not in _d or _d['metrics'] is None: + # this is a failed run + num_unfinished_runs += 1 + continue + # Cost costs = _d['metrics'].get('costs', []) _cur_main_agent_cost = 0 @@ -89,30 +93,173 @@ num_error_lines += 1 break - # print the error counter (with percentage) - print( - f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)' - ) - print( - f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)' - ) - print( - f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)' - ) - print( - f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)' + return { + 'file_path': file_path, + 'total_instances': num_lines, + 'resolved': { + 'count': num_resolved, + 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0, + }, + 'empty_patches': { + 'count': num_empty_patch, + 'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0, + }, + 'unfinished_runs': { + 'count': num_unfinished_runs, + 'percentage': (num_unfinished_runs / num_lines * 100) + if num_lines > 0 + else 0, + }, + 'errors': { + 'total': num_error_lines, + 'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0, + 'stuck_in_loop': { + 'count': num_agent_stuck_in_loop, + 'percentage': (num_agent_stuck_in_loop / num_lines * 100) + if num_lines > 0 + else 0, + }, + 'breakdown': { + str(error): { + 'count': count, + 'percentage': (count / num_lines * 100) if num_lines > 0 else 0, + } + for error, count in error_counter.items() + }, + }, + 'statistics': { + 'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0, + 'costs': { + 'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0, + 'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0, + 'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines + if num_lines > 0 + else 0, + }, + }, + } + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + 'input_path', type=str, help='The file or directory to summarize' ) - assert len(num_turns) == num_lines - assert len(main_agent_cost) == num_lines - assert len(editor_cost) == num_lines - print('## Statistics') - print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}') - print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD') - print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD') - print( - f'Avg. 
total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD' + parser.add_argument( + '--output', + type=str, + help='Output JSONL file for results', + default='summary_results.jsonl', ) + args = parser.parse_args() + + results = [] + + if os.path.isdir(args.input_path): + # Process all output.jsonl files in subdirectories + pattern = os.path.join(args.input_path, '**/output.jsonl') + files = glob.glob(pattern, recursive=True) + print(f'Processing {len(files)} files from directory {args.input_path}') + + # Process each file silently and collect results + for file_path in files: + try: + result = process_file(file_path) + results.append(result) + except Exception as e: + print(f'Error processing {file_path}: {str(e)}') + import traceback + + traceback.print_exc() + continue + + print(f'Results written to {args.output}') + + # Convert results to pandas DataFrame and sort by resolve rate + df = pd.DataFrame(results) + + # Extract directory name from file path + df['directory'] = df['file_path'].apply( + lambda x: os.path.basename(os.path.dirname(x)) + ) + + df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) + df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) + df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) + df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) + df['error_rate'] = df['errors'].apply(lambda x: x['percentage']) + df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total']) + + df = df.sort_values('resolve_rate', ascending=False) + + print('\nResults summary (sorted by resolve rate):') + columns = [ + 'directory', + 'resolve_rate', + 'empty_patch_rate', + 'unfinished_rate', + 'error_rate', + 'avg_turns', + 'avg_cost', + 'total_instances', + ] + + # Set display options for pandas + pd.set_option('display.width', 0) # Auto-detect terminal width + pd.set_option('display.max_rows', None) # Show all rows + + # Use formatters and max_colwidth in to_string + print( + df[columns].to_string( + float_format=lambda x: '{:.2f}'.format(x), + formatters={ + 'directory': lambda x: x[:90] + }, # Truncate directory names to 20 chars + index=False, + ) + ) + + df.to_json(args.output, lines=True, orient='records') + else: + # Process single file with detailed output + try: + result = process_file(args.input_path) + results.append(result) + + # Print detailed results for single file + print(f'\nResults for {args.input_path}:') + print( + f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)" + ) + print( + f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)" + ) + print( + f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)" + ) + print( + f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)" + ) + print( + f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)" + ) + print('## Statistics') + print( + f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}" + ) + print( + f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD" + ) + print( + f"Avg. 
editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD" + ) + print( + f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD" + ) + + print('## Detailed error breakdown:') + for error, data in result['errors']['breakdown'].items(): + print(f"{error}: {data['count']} ({data['percentage']:.2f}%)") - print('## Detailed error breakdown:') - for error, count in error_counter.items(): - print(f'{error}: {count} ({count / num_lines * 100:.2f}%)') + except Exception as e: + print(f'Error processing {args.input_path}: {str(e)}') From 640c7acf6d3186dab1c8451891287ae1f7da3a7f Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 21 Nov 2024 20:04:57 +0000 Subject: [PATCH 26/40] print summary string to text --- .../scripts/eval/summarize_outputs.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 739f26d10eb3a..21ff18ebdbb7e 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -192,7 +192,7 @@ def process_file(file_path): df = df.sort_values('resolve_rate', ascending=False) - print('\nResults summary (sorted by resolve rate):') + # Create the summary string columns = [ 'directory', 'resolve_rate', @@ -203,22 +203,25 @@ def process_file(file_path): 'avg_cost', 'total_instances', ] - - # Set display options for pandas - pd.set_option('display.width', 0) # Auto-detect terminal width - pd.set_option('display.max_rows', None) # Show all rows - - # Use formatters and max_colwidth in to_string - print( - df[columns].to_string( - float_format=lambda x: '{:.2f}'.format(x), - formatters={ - 'directory': lambda x: x[:90] - }, # Truncate directory names to 20 chars - index=False, - ) + summary_str = df[columns].to_string( + float_format=lambda x: '{:.2f}'.format(x), + formatters={ + 'directory': lambda x: x[:90] + }, # Truncate directory names to 20 chars + index=False, ) + # Print to console + print('\nResults summary (sorted by resolve rate):') + print(summary_str) + + # Save to text file + txt_output = args.output.rsplit('.', 1)[0] + '.txt' + with open(txt_output, 'w') as f: + f.write('Results summary (sorted by resolve rate):\n') + f.write(summary_str) + + # Save df.to_json(args.output, lines=True, orient='records') else: # Process single file with detailed output From e4f3859d1d09e72c3cfdf06601912e1d4cff9cec Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 21 Nov 2024 20:24:26 +0000 Subject: [PATCH 27/40] update rollout eval script --- scripts/rollout-eval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index f98006a347d0c..23957ab0bd51e 100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -17,7 +17,7 @@ export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/sw export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 -MAX_ITER=50 +MAX_ITER=100 NUM_WORKERS=64 ./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT From aa3b39eef746afe10a7f54c1ad8693fb644a80af Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 22 Nov 2024 21:22:19 +0000 Subject: [PATCH 28/40] allow specifying n processes --- scripts/rollout-eval.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh index 23957ab0bd51e..ac46effca15ee 
100755 --- a/scripts/rollout-eval.sh +++ b/scripts/rollout-eval.sh @@ -2,6 +2,7 @@ export EXP_NAME="t0" # use this to differentiate between different runs MODEL=$1 +NUM_WORKERS=${2:-64} DATASET="swe-train/SWE-bench_lite" SPLIT="test100" N_RUNS=1 @@ -18,6 +19,5 @@ export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 MAX_ITER=100 -NUM_WORKERS=64 ./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT From 7c4ce5663dcbfa44c38b779f2dba61e245497b83 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Fri, 22 Nov 2024 21:22:29 +0000 Subject: [PATCH 29/40] include litellm timeout as error --- evaluation/swe_bench/scripts/eval/summarize_outputs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 21ff18ebdbb7e..2252ffa345eb3 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -14,6 +14,7 @@ 'Agent encountered an error while processing the last action', 'APIError', 'Action execution failed', + 'litellm.Timeout: APITimeoutError', ] @@ -223,6 +224,7 @@ def process_file(file_path): # Save df.to_json(args.output, lines=True, orient='records') + df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False) else: # Process single file with detailed output try: From 14b15d12dd22faeb0d4afe69f6335c931c1401a1 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 23 Nov 2024 18:19:52 +0000 Subject: [PATCH 30/40] add script to rollout and eval lite --- scripts/eval-swebench-lite.sh | 17 +++++++++++++++++ scripts/rollout-eval-lite.sh | 23 +++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100755 scripts/eval-swebench-lite.sh create mode 100755 scripts/rollout-eval-lite.sh diff --git a/scripts/eval-swebench-lite.sh b/scripts/eval-swebench-lite.sh new file mode 100755 index 0000000000000..5a0172dd91174 --- /dev/null +++ b/scripts/eval-swebench-lite.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-eval-lite.sh b/scripts/rollout-eval-lite.sh new file mode 100755 index 0000000000000..21113f3f870c1 --- /dev/null +++ b/scripts/rollout-eval-lite.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=300 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT From edfeb2f84584203d6bdddc9d0b2d8f51ac336134 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 23 Nov 2024 18:25:48 +0000 Subject: [PATCH 31/40] put aggregate dir to a separate script --- .../scripts/eval/summarize_outputs.py | 77 ++++++++++--------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 2252ffa345eb3..8ca3ab4c1c820 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -141,6 +141,46 @@ def process_file(file_path): } +def aggregate_directory(input_path) -> pd.DataFrame: + # Process all output.jsonl files in subdirectories + pattern = os.path.join(input_path, '**/output.jsonl') + files = glob.glob(pattern, recursive=True) + print(f'Processing {len(files)} files from directory {input_path}') + + # Process each file silently and collect results + for file_path in files: + try: + result = process_file(file_path) + results.append(result) + except Exception as e: + print(f'Error processing {file_path}: {str(e)}') + import traceback + + traceback.print_exc() + continue + + print(f'Results written to {args.output}') + + # Convert results to pandas DataFrame and sort by resolve rate + df = pd.DataFrame(results) + + # Extract directory name from file path + df['directory'] = df['file_path'].apply( + lambda x: os.path.basename(os.path.dirname(x)) + ) + + df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) + df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) + df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) + df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) + df['error_rate'] = df['errors'].apply(lambda x: x['percentage']) + df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total']) + + df = df.sort_values('resolve_rate', ascending=False) + + return df + + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( @@ -157,42 +197,7 @@ def process_file(file_path): results = [] if os.path.isdir(args.input_path): - # Process all output.jsonl files in subdirectories - pattern = os.path.join(args.input_path, '**/output.jsonl') - files = glob.glob(pattern, recursive=True) - print(f'Processing {len(files)} files from directory {args.input_path}') - - # Process each file silently and collect results - for file_path in files: - try: - result = process_file(file_path) - results.append(result) - except Exception as e: - print(f'Error processing {file_path}: {str(e)}') - import traceback - - traceback.print_exc() - continue - - print(f'Results written to {args.output}') - - # Convert results to pandas DataFrame and sort by resolve rate - df = pd.DataFrame(results) - - # Extract directory name from file path - df['directory'] = df['file_path'].apply( - lambda x: os.path.basename(os.path.dirname(x)) - ) - - df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) - df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) - df['unfinished_rate'] = 
df['unfinished_runs'].apply(lambda x: x['percentage']) - df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) - df['error_rate'] = df['errors'].apply(lambda x: x['percentage']) - df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total']) - - df = df.sort_values('resolve_rate', ascending=False) - + df = aggregate_directory(args.input_path) # Create the summary string columns = [ 'directory', From 95f5fc0c97abd6bb4d580e965bc0aa8885674f78 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sat, 23 Nov 2024 18:28:07 +0000 Subject: [PATCH 32/40] fix bugs --- evaluation/swe_bench/scripts/eval/summarize_outputs.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 8ca3ab4c1c820..0da6eb25b54c6 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -148,6 +148,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: print(f'Processing {len(files)} files from directory {input_path}') # Process each file silently and collect results + results = [] for file_path in files: try: result = process_file(file_path) @@ -159,8 +160,6 @@ def aggregate_directory(input_path) -> pd.DataFrame: traceback.print_exc() continue - print(f'Results written to {args.output}') - # Convert results to pandas DataFrame and sort by resolve rate df = pd.DataFrame(results) @@ -194,8 +193,6 @@ def aggregate_directory(input_path) -> pd.DataFrame: ) args = parser.parse_args() - results = [] - if os.path.isdir(args.input_path): df = aggregate_directory(args.input_path) # Create the summary string @@ -232,6 +229,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False) else: # Process single file with detailed output + results = [] try: result = process_file(args.input_path) results.append(result) From b0281d6135e590a85806b3d52e54e21b52341980 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Sun, 24 Nov 2024 22:07:51 +0000 Subject: [PATCH 33/40] add script to eval full rollout --- scripts/eval-swetrain-full-rollout.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100755 scripts/eval-swetrain-full-rollout.sh diff --git a/scripts/eval-swetrain-full-rollout.sh b/scripts/eval-swetrain-full-rollout.sh new file mode 100755 index 0000000000000..9441fb2f53aaa --- /dev/null +++ b/scripts/eval-swetrain-full-rollout.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
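For completeness, a hedged example of driving the summarizer above over a directory of runs; the run directory is illustrative, while the script path and flags come from the diff:

import subprocess

# Aggregates every **/output.jsonl under the directory and writes
# summary_results.jsonl plus .txt and .csv companions next to it.
subprocess.run(
    [
        'python',
        'evaluation/swe_bench/scripts/eval/summarize_outputs.py',
        'evaluation/evaluation_outputs',  # illustrative path to rollout outputs
        '--output', 'summary_results.jsonl',
    ],
    check=True,
)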
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT From 46425b65b8c7e6bc8311c23829b1232eb6ab51a7 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 25 Nov 2024 19:37:01 +0000 Subject: [PATCH 34/40] fix remote runtime --- openhands/llm/fn_call_converter.py | 2 +- openhands/runtime/impl/remote/remote_runtime.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index 2f8d9cbe5cc80..057401b12c2ca 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -432,7 +432,7 @@ def convert_fncall_messages_to_non_fncall_messages( tool_content = convert_tool_call_to_string(message['tool_calls'][0]) except FunctionCallConversionError as e: raise FunctionCallConversionError( - f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}' + f'Failed to convert tool call to string.\nCurrent tool call: {message["tool_calls"][0]}.\nRaw messages: {json.dumps(messages, indent=2)}' ) from e if isinstance(content, str): content += '\n\n' + tool_content diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index 4191a047b1c22..db685eb989620 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -327,13 +327,13 @@ def _wait_until_alive_impl(self): assert 'runtime_id' in runtime_data assert runtime_data['runtime_id'] == self.runtime_id assert 'pod_status' in runtime_data - pod_status = runtime_data['pod_status'] + pod_status = runtime_data['pod_status'].lower() self.log('debug', f'Pod status: {pod_status}') # FIXME: We should fix it at the backend of /start endpoint, make sure # the pod is created before returning the response. # Retry a period of time to give the cluster time to start the pod - if pod_status == 'Ready': + if pod_status == 'ready': try: with self._send_request( 'GET', @@ -349,14 +349,14 @@ def _wait_until_alive_impl(self): ) return elif ( - pod_status == 'Not Found' - or pod_status == 'Pending' - or pod_status == 'Running' + pod_status == 'not found' + or pod_status == 'pending' + or pod_status == 'running' ): # nb: Running is not yet Ready raise RuntimeNotReadyError( f'Runtime (ID={self.runtime_id}) is not yet ready. 
Status: {pod_status}' ) - elif pod_status in ('Failed', 'Unknown'): + elif pod_status in ('failed', 'unknown', 'crashloopbackoff'): # clean up the runtime self.close() raise RuntimeError( From 5bd315106b4c5660b9ec435d511de32e781ea2c8 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 25 Nov 2024 20:06:36 +0000 Subject: [PATCH 35/40] add a bunch of scripts --- scripts/eval-swebench-lite.sh | 2 +- scripts/eval-swebench-verified.sh | 17 +++++++++++++++++ scripts/rollout-eval-verified.sh | 23 +++++++++++++++++++++++ scripts/rollout-swe-train-full.sh | 2 +- 4 files changed, 42 insertions(+), 2 deletions(-) create mode 100755 scripts/eval-swebench-verified.sh create mode 100755 scripts/rollout-eval-verified.sh diff --git a/scripts/eval-swebench-lite.sh b/scripts/eval-swebench-lite.sh index 5a0172dd91174..3f6d88d910d7a 100755 --- a/scripts/eval-swebench-lite.sh +++ b/scripts/eval-swebench-lite.sh @@ -1,7 +1,7 @@ #!/bin/bash OUTPUT_FILE=$1 -NUM_WORKERS=$2 +NUM_WORKERS=${2:-64} DATASET="princeton-nlp/SWE-bench_Lite" SPLIT="test" diff --git a/scripts/eval-swebench-verified.sh b/scripts/eval-swebench-verified.sh new file mode 100755 index 0000000000000..f6909f6515c5d --- /dev/null +++ b/scripts/eval-swebench-verified.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Verified" +SPLIT="test" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-eval-verified.sh b/scripts/rollout-eval-verified.sh new file mode 100755 index 0000000000000..b46acc93d74cd --- /dev/null +++ b/scripts/rollout-eval-verified.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Verified" +SPLIT="test" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=500 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-swe-train-full.sh b/scripts/rollout-swe-train-full.sh index b07940c89269a..fcd3d0272a0b9 100755 --- a/scripts/rollout-swe-train-full.sh +++ b/scripts/rollout-swe-train-full.sh @@ -2,12 +2,12 @@ MODEL=$1 EXP_NAME=$2 # "train-t0" +N_RUNS=${3:-20} export EXP_NAME=$EXP_NAME echo "MODEL: $MODEL" echo "EXP_NAME: $EXP_NAME" DATASET="swe-train/swe-train-dev-v1" SPLIT="train.v2.success" -N_RUNS=1 if [ -z "$ALLHANDS_API_KEY" ]; then echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
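Since pod states are now compared in lower case, a condensed sketch of the resulting decision table may help; the helper name is ours, and the real method raises or retries rather than returning labels:

def classify_pod_status(raw: str) -> str:
    status = raw.lower()  # the runtime API's casing is not guaranteed, hence the fix above
    if status == 'ready':
        return 'ready'
    if status in ('not found', 'pending', 'running'):
        return 'retry'  # RuntimeNotReadyError, re-polled by the caller
    if status in ('failed', 'unknown', 'crashloopbackoff'):
        return 'fail'  # runtime is closed and a RuntimeError is raised
    return 'unexpected'  # fallback for states not shown in this diff

assert classify_pod_status('Running') == 'retry'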
From 160e99594e51440d354ed76ed7f30d35670b857e Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Tue, 26 Nov 2024 18:35:10 +0000 Subject: [PATCH 36/40] support multiple rounds for lite & verifed --- scripts/rollout-eval-lite.sh | 8 ++++++-- scripts/rollout-eval-verified.sh | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/rollout-eval-lite.sh b/scripts/rollout-eval-lite.sh index 21113f3f870c1..2d111b9910036 100755 --- a/scripts/rollout-eval-lite.sh +++ b/scripts/rollout-eval-lite.sh @@ -3,9 +3,13 @@ export EXP_NAME="t0" # use this to differentiate between different runs MODEL=$1 NUM_WORKERS=${2:-64} +N_RUNS=${3:-1} DATASET="princeton-nlp/SWE-bench_Lite" SPLIT="test" -N_RUNS=1 + +echo "MODEL: $MODEL" +echo "NUM_WORKERS: $NUM_WORKERS" +echo "N_RUNS: $N_RUNS" if [ -z "$ALLHANDS_API_KEY" ]; then echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." @@ -20,4 +24,4 @@ export EXP_NAME=$EXP_NAME EVAL_LIMIT=300 MAX_ITER=100 -./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-eval-verified.sh b/scripts/rollout-eval-verified.sh index b46acc93d74cd..ecffc1ac108af 100755 --- a/scripts/rollout-eval-verified.sh +++ b/scripts/rollout-eval-verified.sh @@ -3,9 +3,13 @@ export EXP_NAME="t0" # use this to differentiate between different runs MODEL=$1 NUM_WORKERS=${2:-64} +N_RUNS=${3:-1} DATASET="princeton-nlp/SWE-bench_Verified" SPLIT="test" -N_RUNS=1 + +echo "MODEL: $MODEL" +echo "NUM_WORKERS: $NUM_WORKERS" +echo "N_RUNS: $N_RUNS" if [ -z "$ALLHANDS_API_KEY" ]; then echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." @@ -20,4 +24,4 @@ export EXP_NAME=$EXP_NAME EVAL_LIMIT=500 MAX_ITER=100 -./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT $N_RUNS From 57071a0c501ea8105c8b59eadd7a1f1e424a8b8d Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Wed, 18 Dec 2024 04:23:22 +0000 Subject: [PATCH 37/40] fix run_infer for occasional cd error --- evaluation/swe_bench/run_infer.py | 116 +++++++++++++++++++----------- 1 file changed, 73 insertions(+), 43 deletions(-) diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 9cb9dd77f4983..5ae044edbc3c4 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -32,7 +32,7 @@ ) from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller -from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.action import CmdRunAction, IPythonRunCellAction, MessageAction from openhands.events.observation import CmdOutputObservation, ErrorObservation from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime @@ -303,6 +303,7 @@ def initialize_runtime( def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name + n_retries: int = 5, ) -> dict[str, Any]: """Complete the runtime for the agent. 
@@ -321,55 +322,84 @@ def complete_runtime( logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}', - ) - action = CmdRunAction(command='git config --global core.pager ""') - action.timeout = 600 - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to git config --global core.pager "": {str(obs)}', - ) + if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0: + action = CmdRunAction(command='git config --global core.pager ""') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, + f'Failed to git config --global core.pager "": {str(obs)}', + ) - action = CmdRunAction(command='git add -A') - action.timeout = 600 - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to git add -A: {str(obs)}', - ) + action = CmdRunAction(command='git add -A') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, + f'Failed to git add -A: {str(obs)}', + ) - n_retries = 0 - git_patch = None - while n_retries < 5: - action = CmdRunAction( - command=f'git diff --no-color --cached {instance["base_commit"]}', - keep_prompt=False, + n_retries = 0 + git_patch = None + while n_retries < 5: + action = CmdRunAction( + command=f'git diff --no-color --cached {instance["base_commit"]}', + keep_prompt=False, + ) + action.timeout = 600 + 100 * n_retries + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + n_retries += 1 + if isinstance(obs, CmdOutputObservation): + if obs.exit_code == 0: + git_patch = obs.content.strip() + break + else: + logger.info('Failed to get git diff, retrying...') + sleep_if_should_continue(10) + elif isinstance(obs, ErrorObservation): + logger.error(f'Error occurred: {obs.content}. Retrying...') + sleep_if_should_continue(10) + else: + assert_and_raise(False, f'Unexpected observation type: {str(obs)}') + else: + logger.warning( + f'Failed to cd to /workspace/{workspace_dir_name}... 
Trying to use IPython to get git diff' ) - action.timeout = 600 + 100 * n_retries + # Git configuration and diff using IPython + cell_code = f""" + import subprocess + + def run_git_cmd(cmd): + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, cwd='/workspace/{workspace_dir_name}') + return result.stdout, result.returncode + + # Configure git + run_git_cmd('git config --global core.pager ""') + run_git_cmd('git add -A') + + # Get the diff + stdout, exit_code = run_git_cmd('git diff --no-color --cached {instance["base_commit"]}') + git_patch = stdout.strip() + """ + action = IPythonRunCellAction(code=cell_code) + action.timeout = 600 logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - n_retries += 1 - if isinstance(obs, CmdOutputObservation): - if obs.exit_code == 0: - git_patch = obs.content.strip() - break - else: - logger.info('Failed to get git diff, retrying...') - sleep_if_should_continue(10) - elif isinstance(obs, ErrorObservation): - logger.error(f'Error occurred: {obs.content}. Retrying...') - sleep_if_should_continue(10) - else: - assert_and_raise(False, f'Unexpected observation type: {str(obs)}') + + # Get the git_patch from IPython's namespace + cell_code = 'print(git_patch)' + action = IPythonRunCellAction(code=cell_code) + action.timeout = 600 + obs = runtime.run_action(action) + git_patch = obs.content.strip() assert_and_raise(git_patch is not None, 'Failed to get git diff (None)') From 1a25763a048f42c525a70a0db03ed73ba11f6e32 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 19 Dec 2024 18:40:21 +0000 Subject: [PATCH 38/40] add standard error to swebench summarize outputs --- .../scripts/eval/summarize_outputs.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 0da6eb25b54c6..5546ce2252966 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -6,6 +6,8 @@ from collections import Counter import pandas as pd +import random +import numpy as np from openhands.events.serialization import event_from_dict from openhands.events.utils import get_pairs_from_events @@ -18,6 +20,16 @@ ] +def get_bootstrap_accuracy_std(values: float | int | bool, num_samples: int = 1000) -> float: + # Borrow from https://github.com/openai/evals/blob/cdb8ce9547e68b8e5e4520b6a162294c06865c0f/evals/metrics.py#L21 + return np.std( + [ + np.mean(random.sample(values, len(values) // 2)) + for _ in range(num_samples) + ] + ) + + def process_file(file_path): with open(file_path, 'r') as file: lines = file.readlines() @@ -26,6 +38,7 @@ def process_file(file_path): num_error_lines = 0 num_agent_stuck_in_loop = 0 num_resolved = 0 + resolved_arr = [] num_empty_patch = 0 num_unfinished_runs = 0 error_counter = Counter() @@ -74,6 +87,9 @@ def process_file(file_path): resolved = report.get('resolved', False) if resolved: num_resolved += 1 + resolved_arr.append(1) + else: + resolved_arr.append(0) # Error error = _d.get('error', None) @@ -100,6 +116,7 @@ def process_file(file_path): 'resolved': { 'count': num_resolved, 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0, + 'std': get_bootstrap_accuracy_std(resolved_arr) * 100, }, 'empty_patches': { 'count': num_empty_patch, @@ -169,6 +186,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: ) 
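The bootstrap estimate added in this patch is easy to sanity-check in isolation; a small sketch with a toy resolved/unresolved vector (values are illustrative):

import random
import numpy as np

resolved = [1, 0, 1, 1, 0, 1, 0, 1]  # toy per-instance outcomes
std = np.std(
    [np.mean(random.sample(resolved, len(resolved) // 2)) for _ in range(1000)]
)
print(f'{100 * np.mean(resolved):.1f}% ± {100 * std:.1f}%')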
df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) + df['resolve_rate_std'] = df['resolved'].apply(lambda x: x['std']) df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) @@ -237,7 +255,7 @@ def aggregate_directory(input_path) -> pd.DataFrame: # Print detailed results for single file print(f'\nResults for {args.input_path}:') print( - f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)" + f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% ± {result['resolved']['std']:.2f}%)" ) print( f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)" From 74b32f5c1bec6f30bbb3aa3b78982b7ad25258f3 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 23 Dec 2024 04:00:36 +0000 Subject: [PATCH 39/40] add misc --- evaluation/swe_bench/eval_infer.py | 6 +++++- .../swe_bench/scripts/eval/summarize_outputs.py | 13 ++++++------- .../scripts/eval/update_output_with_eval.py | 5 ++++- evaluation/swe_bench/scripts/run_infer.sh | 6 ++++++ openhands/core/config/llm_config.py | 1 + openhands/llm/llm.py | 4 +++- 6 files changed, 25 insertions(+), 10 deletions(-) diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index d40f984fca9c8..ea650999c2be6 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -407,7 +407,11 @@ def process_instance( fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation'] def count_report_field(row, field): - return row['test_result']['report'][field] + return ( + row['test_result']['report'][field] + if 'report' in row['test_result'] + else False + ) report = {} for field in fields: diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py index 5546ce2252966..d76882cb2415e 100755 --- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py +++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py @@ -3,11 +3,11 @@ import glob import json import os +import random from collections import Counter -import pandas as pd -import random import numpy as np +import pandas as pd from openhands.events.serialization import event_from_dict from openhands.events.utils import get_pairs_from_events @@ -20,13 +20,12 @@ ] -def get_bootstrap_accuracy_std(values: float | int | bool, num_samples: int = 1000) -> float: +def get_bootstrap_accuracy_std( + values: float | int | bool, num_samples: int = 1000 +) -> float: # Borrow from https://github.com/openai/evals/blob/cdb8ce9547e68b8e5e4520b6a162294c06865c0f/evals/metrics.py#L21 return np.std( - [ - np.mean(random.sample(values, len(values) // 2)) - for _ in range(num_samples) - ] + [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)] ) diff --git a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/swe_bench/scripts/eval/update_output_with_eval.py index 662e640ca752f..ae196ccf57ac9 100644 --- a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py +++ b/evaluation/swe_bench/scripts/eval/update_output_with_eval.py @@ -108,7 +108,10 @@ def apply_report(row): ), 'There are duplicate instance ids in the eval report which is not allowed' for _, 
row in df_eval.iterrows(): - instance_id_to_status[row['instance_id']] = row['test_result']['report'] + if 'report' in row['test_result']: + instance_id_to_status[row['instance_id']] = row['test_result']['report'] + else: + instance_id_to_status[row['instance_id']] = {} df['report'] = df.apply(apply_report, axis=1) _n_instances = len(df) diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/swe_bench/scripts/run_infer.sh index 520003635a4e9..e8c1df9f3c24c 100755 --- a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/swe_bench/scripts/run_infer.sh @@ -108,7 +108,13 @@ if [ -z "$N_RUNS" ]; then echo "N_RUNS not specified, use default $N_RUNS" fi +# SKIP_RUNS is a comma-separated list of run numbers to skip for i in $(seq 1 $N_RUNS); do + # skip if i in SKIP_RUNS + if [[ "$SKIP_RUNS" == *"$i"* ]]; then + continue + fi + current_eval_note="$EVAL_NOTE-run_$i" echo "EVAL_NOTE: $current_eval_note" run_eval $current_eval_note diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py index 477b47ccdbe12..dfcb3b62c5b9f 100644 --- a/openhands/core/config/llm_config.py +++ b/openhands/core/config/llm_config.py @@ -77,6 +77,7 @@ class LLMConfig: log_completions: bool = False log_completions_folder: str = os.path.join(LOG_DIR, 'completions') draft_editor: Optional['LLMConfig'] = None + nonfncall_mode_add_in_context_learning_example: bool = True def defaults_to_dict(self) -> dict: """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index 7712457a73ad0..42b3d34a64a69 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -186,7 +186,9 @@ def wrapper(*args, **kwargs): 'tools' in kwargs ), "'tools' must be in kwargs when mock_function_calling is True" messages = convert_fncall_messages_to_non_fncall_messages( - messages, kwargs['tools'] + messages, + kwargs['tools'], + add_in_context_learning_example=self.config.nonfncall_mode_add_in_context_learning_example, ) kwargs['messages'] = messages kwargs['stop'] = STOP_WORDS From 6d0d061f92f2410f872d62cea1e47d8f365537ec Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 23 Dec 2024 04:11:16 +0000 Subject: [PATCH 40/40] update pyproject for new fork --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7a13d28ec2196..b95c8ec8c7afb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8571,9 +8571,9 @@ inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "s [package.source] type = "git" -url = "https://github.com/DevRL-Agent/SWE-Bench.git" +url = "https://github.com/SWE-Gym/SWE-Bench-Fork.git" reference = "HEAD" -resolved_reference = "c1f15036a6ee37f0915af8ab2da2d32114739570" +resolved_reference = "e440817248cf29bee71ca9ab3c3a762177ab8f7c" [[package]] name = "sympy" @@ -10211,4 +10211,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "42d1414d8f6db6a4040c07b3ba8da850ed4fdbfd8e5cae0d55598216b309fbe2" +content-hash = "6c58d155c879a1469ba187666f010e13ec50e3a5c85c9461f3316878d0019074" diff --git a/pyproject.toml b/pyproject.toml index 0ec86829ea945..fc2aa1304c3ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,7 +132,7 @@ streamlit = "*" whatthepatch = "*" retry = "*" evaluate = "*" -swebench = { git = "https://github.com/DevRL-Agent/SWE-Bench.git" } +swebench = { git = 
"https://github.com/SWE-Gym/SWE-Bench-Fork.git" } func_timeout = "*" sympy = "*" gdown = "*"