diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index d6c4e6ba938d..ea650999c2be 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -263,23 +263,29 @@ def process_instance( test_output_path = os.path.join(log_dir, 'test_output.txt') with open(test_output_path, 'w') as f: f.write(test_output) - - _report = get_eval_report( - test_spec=test_spec, - prediction={ - 'model_patch': model_patch, - 'instance_id': instance_id, - }, - log_path=test_output_path, - include_tests_status=True, - ) - report = _report[instance_id] - logger.info( - f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}" - ) - instance['test_result']['report']['resolved'] = report[ - 'resolved' - ] + try: + _report = get_eval_report( + test_spec=test_spec, + prediction={ + 'model_patch': model_patch, + 'instance_id': instance_id, + }, + log_path=test_output_path, + include_tests_status=True, + ) + report = _report[instance_id] + logger.info( + f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}" + ) + instance['test_result']['report']['resolved'] = report[ + 'resolved' + ] + except Exception as e: + logger.error( + f'[{instance_id}] Error when getting eval report: {e}' + ) + instance['test_result']['report']['resolved'] = False + instance['test_result']['report']['error_eval'] = True else: logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}') instance['test_result']['report']['error_eval'] = True @@ -355,7 +361,7 @@ def process_instance( if 'model_patch' not in predictions.columns: predictions['model_patch'] = predictions['test_result'].apply( - lambda x: x['git_patch'] + lambda x: x.get('git_patch', '') ) assert {'instance_id', 'model_patch'}.issubset( set(predictions.columns) @@ -401,7 +407,11 @@ def process_instance( fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation'] def count_report_field(row, field): - return row['test_result']['report'][field] + return ( + row['test_result']['report'][field] + if 'report' in row['test_result'] + else False + ) report = {} for field in fields: diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index a5d3db08ef46..5ae044edbc3c 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -32,7 +32,7 @@ ) from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller -from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.action import CmdRunAction, IPythonRunCellAction, MessageAction from openhands.events.observation import CmdOutputObservation, ErrorObservation from openhands.events.serialization.event import event_to_dict from openhands.runtime.base import Runtime @@ -145,7 +145,7 @@ def get_config( platform='linux/amd64', api_key=os.environ.get('ALLHANDS_API_KEY', None), remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), - keep_remote_runtime_alive=False, + keep_runtime_alive=False, remote_runtime_init_timeout=3600, ), # do not mount workspace @@ -303,6 +303,7 @@ def initialize_runtime( def complete_runtime( runtime: Runtime, instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name + n_retries: int = 5, ) -> dict[str, Any]: """Complete the runtime for the agent. 
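Note on the defensive accessors introduced in eval_infer.py above: `x.get('git_patch', '')` and the `'report' in row['test_result']` guard let the summary code tolerate instances whose evaluation died before producing a patch or a report, instead of raising KeyError. A minimal, self-contained sketch of the same pattern on hypothetical rows (the instance ids and patch strings below are made up, not taken from this patch):

```python
import pandas as pd

# Two hypothetical instances: the second run crashed before producing a patch or report.
predictions = pd.DataFrame(
    {
        'instance_id': ['instance-a', 'instance-b'],
        'test_result': [
            {'git_patch': 'diff --git a/x.py b/x.py', 'report': {'resolved': True}},
            {},  # no 'git_patch', no 'report'
        ],
    }
)

# Mirrors the guarded accessors in the patched code: missing keys fall back to '' / False.
predictions['model_patch'] = predictions['test_result'].apply(
    lambda x: x.get('git_patch', '')
)
resolved = predictions['test_result'].apply(
    lambda x: x['report']['resolved'] if 'report' in x else False
)
print(predictions['model_patch'].tolist())  # ['diff --git a/x.py b/x.py', '']
print(resolved.tolist())  # [True, False]
```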
@@ -321,55 +322,84 @@ def complete_runtime( logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}', - ) - action = CmdRunAction(command='git config --global core.pager ""') - action.timeout = 600 - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to git config --global core.pager "": {str(obs)}', - ) + if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0: + action = CmdRunAction(command='git config --global core.pager ""') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, + f'Failed to git config --global core.pager "": {str(obs)}', + ) - action = CmdRunAction(command='git add -A') - action.timeout = 600 - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert_and_raise( - isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, - f'Failed to git add -A: {str(obs)}', - ) + action = CmdRunAction(command='git add -A') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + isinstance(obs, CmdOutputObservation) and obs.exit_code == 0, + f'Failed to git add -A: {str(obs)}', + ) - n_retries = 0 - git_patch = None - while n_retries < 5: - action = CmdRunAction( - command=f'git diff --no-color --cached {instance["base_commit"]}', - keep_prompt=False, + n_retries = 0 + git_patch = None + while n_retries < 5: + action = CmdRunAction( + command=f'git diff --no-color --cached {instance["base_commit"]}', + keep_prompt=False, + ) + action.timeout = 600 + 100 * n_retries + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + n_retries += 1 + if isinstance(obs, CmdOutputObservation): + if obs.exit_code == 0: + git_patch = obs.content.strip() + break + else: + logger.info('Failed to get git diff, retrying...') + sleep_if_should_continue(10) + elif isinstance(obs, ErrorObservation): + logger.error(f'Error occurred: {obs.content}. Retrying...') + sleep_if_should_continue(10) + else: + assert_and_raise(False, f'Unexpected observation type: {str(obs)}') + else: + logger.warning( + f'Failed to cd to /workspace/{workspace_dir_name}... 
Trying to use IPython to get git diff'
        )
-        action.timeout = 600 + 100 * n_retries
+        # Git configuration and diff using IPython
+        cell_code = f"""
+import subprocess
+
+def run_git_cmd(cmd):
+    result = subprocess.run(cmd, shell=True, capture_output=True, text=True, cwd='/workspace/{workspace_dir_name}')
+    return result.stdout, result.returncode
+
+# Configure git
+run_git_cmd('git config --global core.pager ""')
+run_git_cmd('git add -A')
+
+# Get the diff
+stdout, exit_code = run_git_cmd('git diff --no-color --cached {instance["base_commit"]}')
+git_patch = stdout.strip()
+"""
+        action = IPythonRunCellAction(code=cell_code)
+        action.timeout = 600
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        n_retries += 1
-        if isinstance(obs, CmdOutputObservation):
-            if obs.exit_code == 0:
-                git_patch = obs.content.strip()
-                break
-            else:
-                logger.info('Failed to get git diff, retrying...')
-                sleep_if_should_continue(10)
-        elif isinstance(obs, ErrorObservation):
-            logger.error(f'Error occurred: {obs.content}. Retrying...')
-            sleep_if_should_continue(10)
-        else:
-            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
+
+        # Get the git_patch from IPython's namespace
+        cell_code = 'print(git_patch)'
+        action = IPythonRunCellAction(code=cell_code)
+        action.timeout = 600
+        obs = runtime.run_action(action)
+        git_patch = obs.content.strip()

     assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
@@ -534,5 +564,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
         instances[col] = instances[col].apply(lambda x: str(x))

     run_evaluation(
-        instances, metadata, output_file, args.eval_num_workers, process_instance
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        timeout_seconds=120 * 60,  # 2 hours PER instance should be more than enough
     )
diff --git a/evaluation/swe_bench/scripts/eval/summarize_outputs.py b/evaluation/swe_bench/scripts/eval/summarize_outputs.py
index 5d5dbbf2a3bd..d76882cb2415 100755
--- a/evaluation/swe_bench/scripts/eval/summarize_outputs.py
+++ b/evaluation/swe_bench/scripts/eval/summarize_outputs.py
@@ -1,8 +1,14 @@
 #!/usr/bin/env python3
 import argparse
+import glob
 import json
+import os
+import random
 from collections import Counter

+import numpy as np
+import pandas as pd
+
 from openhands.events.serialization import event_from_dict
 from openhands.events.utils import get_pairs_from_events

@@ -10,25 +16,31 @@
     'Agent encountered an error while processing the last action',
     'APIError',
     'Action execution failed',
+    'litellm.Timeout: APITimeoutError',
 ]

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('output_file', type=str, help='The file to summarize')
-    args = parser.parse_args()
-    with open(args.output_file, 'r') as file:
+def get_bootstrap_accuracy_std(
+    values: list[float | int | bool], num_samples: int = 1000
+) -> float:
+    # Borrowed from https://github.com/openai/evals/blob/cdb8ce9547e68b8e5e4520b6a162294c06865c0f/evals/metrics.py#L21
+    return np.std(
+        [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
+    )
+
+
+def process_file(file_path):
+    with open(file_path, 'r') as file:
         lines = file.readlines()

     num_lines = len(lines)
     num_error_lines = 0
     num_agent_stuck_in_loop = 0
     num_resolved = 0
+    resolved_arr = []
     num_empty_patch = 0
-
+    num_unfinished_runs = 0
     error_counter = Counter()
-
     main_agent_cost = []
     editor_cost = []
     num_turns = []
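The get_bootstrap_accuracy_std helper added above estimates the uncertainty of the resolve rate by repeatedly averaging a random half of the per-instance 0/1 outcomes and taking the standard deviation of those means. A self-contained sketch of the same scheme with a made-up outcome vector (30 resolved out of 100 instances is illustrative only):

```python
import random

import numpy as np


def get_bootstrap_accuracy_std(values: list[int], num_samples: int = 1000) -> float:
    # Std of the mean over many half-sized random resamples (same idea as above).
    return np.std(
        [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
    )


resolved_arr = [1] * 30 + [0] * 70  # hypothetical: 30 resolved out of 100 instances
rate = np.mean(resolved_arr) * 100
std = get_bootstrap_accuracy_std(resolved_arr) * 100
print(f'resolve rate: {rate:.2f}% ± {std:.2f}%')
```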
@@ -36,6 +48,11 @@ for line in lines: _d = json.loads(line) + if 'metrics' not in _d or _d['metrics'] is None: + # this is a failed run + num_unfinished_runs += 1 + continue + # Cost costs = _d['metrics'].get('costs', []) _cur_main_agent_cost = 0 @@ -69,6 +86,9 @@ resolved = report.get('resolved', False) if resolved: num_resolved += 1 + resolved_arr.append(1) + else: + resolved_arr.append(0) # Error error = _d.get('error', None) @@ -89,30 +109,182 @@ num_error_lines += 1 break - # print the error counter (with percentage) - print( - f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)' - ) - print( - f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)' - ) - print( - f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)' + return { + 'file_path': file_path, + 'total_instances': num_lines, + 'resolved': { + 'count': num_resolved, + 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0, + 'std': get_bootstrap_accuracy_std(resolved_arr) * 100, + }, + 'empty_patches': { + 'count': num_empty_patch, + 'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0, + }, + 'unfinished_runs': { + 'count': num_unfinished_runs, + 'percentage': (num_unfinished_runs / num_lines * 100) + if num_lines > 0 + else 0, + }, + 'errors': { + 'total': num_error_lines, + 'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0, + 'stuck_in_loop': { + 'count': num_agent_stuck_in_loop, + 'percentage': (num_agent_stuck_in_loop / num_lines * 100) + if num_lines > 0 + else 0, + }, + 'breakdown': { + str(error): { + 'count': count, + 'percentage': (count / num_lines * 100) if num_lines > 0 else 0, + } + for error, count in error_counter.items() + }, + }, + 'statistics': { + 'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0, + 'costs': { + 'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0, + 'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0, + 'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines + if num_lines > 0 + else 0, + }, + }, + } + + +def aggregate_directory(input_path) -> pd.DataFrame: + # Process all output.jsonl files in subdirectories + pattern = os.path.join(input_path, '**/output.jsonl') + files = glob.glob(pattern, recursive=True) + print(f'Processing {len(files)} files from directory {input_path}') + + # Process each file silently and collect results + results = [] + for file_path in files: + try: + result = process_file(file_path) + results.append(result) + except Exception as e: + print(f'Error processing {file_path}: {str(e)}') + import traceback + + traceback.print_exc() + continue + + # Convert results to pandas DataFrame and sort by resolve rate + df = pd.DataFrame(results) + + # Extract directory name from file path + df['directory'] = df['file_path'].apply( + lambda x: os.path.basename(os.path.dirname(x)) ) - print( - f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)' + + df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage']) + df['resolve_rate_std'] = df['resolved'].apply(lambda x: x['std']) + df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage']) + df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage']) + df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns']) + df['error_rate'] = df['errors'].apply(lambda x: 
x['percentage']) + df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total']) + + df = df.sort_values('resolve_rate', ascending=False) + + return df + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + 'input_path', type=str, help='The file or directory to summarize' ) - assert len(num_turns) == num_lines - assert len(main_agent_cost) == num_lines - assert len(editor_cost) == num_lines - print('## Statistics') - print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}') - print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD') - print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD') - print( - f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD' + parser.add_argument( + '--output', + type=str, + help='Output JSONL file for results', + default='summary_results.jsonl', ) + args = parser.parse_args() + + if os.path.isdir(args.input_path): + df = aggregate_directory(args.input_path) + # Create the summary string + columns = [ + 'directory', + 'resolve_rate', + 'empty_patch_rate', + 'unfinished_rate', + 'error_rate', + 'avg_turns', + 'avg_cost', + 'total_instances', + ] + summary_str = df[columns].to_string( + float_format=lambda x: '{:.2f}'.format(x), + formatters={ + 'directory': lambda x: x[:90] + }, # Truncate directory names to 20 chars + index=False, + ) + + # Print to console + print('\nResults summary (sorted by resolve rate):') + print(summary_str) + + # Save to text file + txt_output = args.output.rsplit('.', 1)[0] + '.txt' + with open(txt_output, 'w') as f: + f.write('Results summary (sorted by resolve rate):\n') + f.write(summary_str) + + # Save + df.to_json(args.output, lines=True, orient='records') + df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False) + else: + # Process single file with detailed output + results = [] + try: + result = process_file(args.input_path) + results.append(result) + + # Print detailed results for single file + print(f'\nResults for {args.input_path}:') + print( + f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% ± {result['resolved']['std']:.2f}%)" + ) + print( + f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)" + ) + print( + f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)" + ) + print( + f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)" + ) + print( + f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)" + ) + print('## Statistics') + print( + f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}" + ) + print( + f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD" + ) + print( + f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD" + ) + print( + f"Avg. 
total cost per instance: {result['statistics']['costs']['total']:.2f} USD" + ) + + print('## Detailed error breakdown:') + for error, data in result['errors']['breakdown'].items(): + print(f"{error}: {data['count']} ({data['percentage']:.2f}%)") - print('## Detailed error breakdown:') - for error, count in error_counter.items(): - print(f'{error}: {count} ({count / num_lines * 100:.2f}%)') + except Exception as e: + print(f'Error processing {args.input_path}: {str(e)}') diff --git a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/swe_bench/scripts/eval/update_output_with_eval.py index 662e640ca752..ae196ccf57ac 100644 --- a/evaluation/swe_bench/scripts/eval/update_output_with_eval.py +++ b/evaluation/swe_bench/scripts/eval/update_output_with_eval.py @@ -108,7 +108,10 @@ def apply_report(row): ), 'There are duplicate instance ids in the eval report which is not allowed' for _, row in df_eval.iterrows(): - instance_id_to_status[row['instance_id']] = row['test_result']['report'] + if 'report' in row['test_result']: + instance_id_to_status[row['instance_id']] = row['test_result']['report'] + else: + instance_id_to_status[row['instance_id']] = {} df['report'] = df.apply(apply_report, axis=1) _n_instances = len(df) diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/swe_bench/scripts/run_infer.sh index 520003635a4e..e8c1df9f3c24 100755 --- a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/swe_bench/scripts/run_infer.sh @@ -108,7 +108,13 @@ if [ -z "$N_RUNS" ]; then echo "N_RUNS not specified, use default $N_RUNS" fi +# SKIP_RUNS is a comma-separated list of run numbers to skip for i in $(seq 1 $N_RUNS); do + # skip if i in SKIP_RUNS + if [[ "$SKIP_RUNS" == *"$i"* ]]; then + continue + fi + current_eval_note="$EVAL_NOTE-run_$i" echo "EVAL_NOTE: $current_eval_note" run_eval $current_eval_note diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 847eb16bb32e..517ecc523581 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -3,9 +3,11 @@ import multiprocessing as mp import os import pathlib +import signal import subprocess import time import traceback +from contextlib import contextmanager from typing import Any, Awaitable, Callable, TextIO import pandas as pd @@ -92,6 +94,27 @@ class EvalException(Exception): pass +class EvalTimeoutException(Exception): + pass + + +@contextmanager +def timeout(seconds: int): + def timeout_handler(signum, frame): + raise EvalTimeoutException(f'Function timed out after {seconds} seconds') + + # Set up the signal handler + original_handler = signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(seconds) + + try: + yield + finally: + # Restore the original handler and disable the alarm + signal.alarm(0) + signal.signal(signal.SIGALRM, original_handler) + + def codeact_user_response( state: State, encapsulate_solution: bool = False, @@ -280,15 +303,33 @@ def _process_instance_wrapper( metadata: EvalMetadata, use_mp: bool, max_retries: int = 5, + timeout_seconds: int | None = None, ) -> EvalOutput: - """Wrap the process_instance_func to handle retries and errors. - - Retry an instance up to max_retries times if it fails (e.g., due to transient network/runtime issues). 
- """ + """Wrap the process_instance_func to handle retries and errors.""" for attempt in range(max_retries + 1): try: - result = process_instance_func(instance, metadata, use_mp) + if timeout_seconds is not None: + with timeout(timeout_seconds): + result = process_instance_func(instance, metadata, use_mp) + else: + result = process_instance_func(instance, metadata, use_mp) return result + except EvalTimeoutException as e: + error = f'Timeout after {timeout_seconds} seconds' + stacktrace = traceback.format_exc() + msg = ( + '-' * 10 + + '\n' + + f'Timeout ({timeout_seconds} seconds) in instance [{instance.instance_id}], Stopped evaluation for this instance.' + + '\n' + + '-' * 10 + ) + logger.exception(e) + return EvalOutput( + instance_id=instance.instance_id, + test_result={}, + error=error, + ) except Exception as e: error = str(e) stacktrace = traceback.format_exc() @@ -337,6 +378,7 @@ def run_evaluation( [pd.Series, EvalMetadata, bool], Awaitable[EvalOutput] ], max_retries: int = 5, # number of retries for each instance + timeout_seconds: int | None = None, ): use_multiprocessing = num_workers > 1 @@ -357,7 +399,14 @@ def run_evaluation( if use_multiprocessing: with mp.Pool(num_workers) as pool: args_iter = ( - (process_instance_func, instance, metadata, True, max_retries) + ( + process_instance_func, + instance, + metadata, + True, + max_retries, + timeout_seconds, + ) for _, instance in dataset.iterrows() ) results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter) diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py index 477b47ccdbe1..dfcb3b62c5b9 100644 --- a/openhands/core/config/llm_config.py +++ b/openhands/core/config/llm_config.py @@ -77,6 +77,7 @@ class LLMConfig: log_completions: bool = False log_completions_folder: str = os.path.join(LOG_DIR, 'completions') draft_editor: Optional['LLMConfig'] = None + nonfncall_mode_add_in_context_learning_example: bool = True def defaults_to_dict(self) -> dict: """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" diff --git a/openhands/llm/fn_call_converter.py b/openhands/llm/fn_call_converter.py index da595ec4f364..057401b12c2c 100644 --- a/openhands/llm/fn_call_converter.py +++ b/openhands/llm/fn_call_converter.py @@ -307,6 +307,7 @@ def convert_tools_to_description(tools: list[dict]) -> str: def convert_fncall_messages_to_non_fncall_messages( messages: list[dict], tools: list[ChatCompletionToolParam], + add_in_context_learning_example: bool = True, ) -> list[dict]: """Convert function calling messages to non-function calling messages.""" messages = copy.deepcopy(messages) @@ -319,7 +320,8 @@ def convert_fncall_messages_to_non_fncall_messages( converted_messages = [] first_user_message_encountered = False for message in messages: - role, content = message['role'], message['content'] + role = message['role'] + content = message.get('content', '') if content is None: content = '' @@ -341,7 +343,7 @@ def convert_fncall_messages_to_non_fncall_messages( # 2. 
USER MESSAGES (no change) elif role == 'user': # Add in-context learning example for the first user message - if not first_user_message_encountered: + if not first_user_message_encountered and add_in_context_learning_example: first_user_message_encountered = True # Check tools if not ( @@ -430,7 +432,7 @@ def convert_fncall_messages_to_non_fncall_messages( tool_content = convert_tool_call_to_string(message['tool_calls'][0]) except FunctionCallConversionError as e: raise FunctionCallConversionError( - f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}' + f'Failed to convert tool call to string.\nCurrent tool call: {message["tool_calls"][0]}.\nRaw messages: {json.dumps(messages, indent=2)}' ) from e if isinstance(content, str): content += '\n\n' + tool_content @@ -751,6 +753,7 @@ def convert_non_fncall_messages_to_fncall_messages( def convert_from_multiple_tool_calls_to_single_tool_call_messages( messages: list[dict], + ignore_final_tool_result: bool = False, ) -> list[dict]: """Break one message with multiple tool calls into multiple messages.""" converted_messages = [] @@ -787,7 +790,7 @@ def convert_from_multiple_tool_calls_to_single_tool_call_messages( ), f'Found pending tool calls but not expect to handle it with role {role}: {pending_tool_calls=}, {message=}' converted_messages.append(message) - if len(pending_tool_calls) > 0: + if not ignore_final_tool_result and len(pending_tool_calls) > 0: raise FunctionCallConversionError( f'Found pending tool calls but no tool result: {pending_tool_calls=}' ) diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index 0590945995c1..42b3d34a64a6 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -121,7 +121,9 @@ def __init__( top_p=self.config.top_p, drop_params=self.config.drop_params, ) - + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + self.init_model_info() if self.vision_is_active(): logger.debug('LLM: model has vision enabled') if self.is_caching_prompt_active(): @@ -143,16 +145,6 @@ def __init__( drop_params=self.config.drop_params, ) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - self.init_model_info() - if self.vision_is_active(): - logger.debug('LLM: model has vision enabled') - if self.is_caching_prompt_active(): - logger.debug('LLM: caching prompt enabled') - if self.is_function_calling_active(): - logger.debug('LLM: model supports function calling') - self._completion_unwrapped = self._completion @self.retry_decorator( @@ -194,7 +186,9 @@ def wrapper(*args, **kwargs): 'tools' in kwargs ), "'tools' must be in kwargs when mock_function_calling is True" messages = convert_fncall_messages_to_non_fncall_messages( - messages, kwargs['tools'] + messages, + kwargs['tools'], + add_in_context_learning_example=self.config.nonfncall_mode_add_in_context_learning_example, ) kwargs['messages'] = messages kwargs['stop'] = STOP_WORDS diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index 4191a047b1c2..db685eb98962 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -327,13 +327,13 @@ def _wait_until_alive_impl(self): assert 'runtime_id' in runtime_data assert runtime_data['runtime_id'] == self.runtime_id assert 'pod_status' in runtime_data - pod_status = runtime_data['pod_status'] + pod_status = runtime_data['pod_status'].lower() self.log('debug', f'Pod status: {pod_status}') # FIXME: We should fix it at the backend of /start 
endpoint, make sure # the pod is created before returning the response. # Retry a period of time to give the cluster time to start the pod - if pod_status == 'Ready': + if pod_status == 'ready': try: with self._send_request( 'GET', @@ -349,14 +349,14 @@ def _wait_until_alive_impl(self): ) return elif ( - pod_status == 'Not Found' - or pod_status == 'Pending' - or pod_status == 'Running' + pod_status == 'not found' + or pod_status == 'pending' + or pod_status == 'running' ): # nb: Running is not yet Ready raise RuntimeNotReadyError( f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}' ) - elif pod_status in ('Failed', 'Unknown'): + elif pod_status in ('failed', 'unknown', 'crashloopbackoff'): # clean up the runtime self.close() raise RuntimeError( diff --git a/poetry.lock b/poetry.lock index 80d6b20882c8..b95c8ec8c7af 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aenum" @@ -8571,9 +8571,9 @@ inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "s [package.source] type = "git" -url = "https://github.com/All-Hands-AI/SWE-bench.git" +url = "https://github.com/SWE-Gym/SWE-Bench-Fork.git" reference = "HEAD" -resolved_reference = "c807c112edc3dcb4fdf5ddac63b34706912d5cdb" +resolved_reference = "e440817248cf29bee71ca9ab3c3a762177ab8f7c" [[package]] name = "sympy" @@ -10211,4 +10211,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "8718ffe2ed836fca6c646c37bdad2c9c8e63ebd7ec881f420148fef5095d19e4" +content-hash = "6c58d155c879a1469ba187666f010e13ec50e3a5c85c9461f3316878d0019074" diff --git a/pyproject.toml b/pyproject.toml index 68bc044a79fc..fc2aa1304c3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,7 +132,7 @@ streamlit = "*" whatthepatch = "*" retry = "*" evaluate = "*" -swebench = { git = "https://github.com/All-Hands-AI/SWE-bench.git" } +swebench = { git = "https://github.com/SWE-Gym/SWE-Bench-Fork.git" } func_timeout = "*" sympy = "*" gdown = "*" diff --git a/scripts/eval-swebench-lite.sh b/scripts/eval-swebench-lite.sh new file mode 100755 index 000000000000..3f6d88d910d7 --- /dev/null +++ b/scripts/eval-swebench-lite.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-swebench-lite100.sh b/scripts/eval-swebench-lite100.sh new file mode 100755 index 000000000000..b2842f7f7b68 --- /dev/null +++ b/scripts/eval-swebench-lite100.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/SWE-bench_lite" +SPLIT="test100" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-swebench-verified.sh b/scripts/eval-swebench-verified.sh new file mode 100755 index 000000000000..f6909f6515c5 --- /dev/null +++ b/scripts/eval-swebench-verified.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=${2:-64} +DATASET="princeton-nlp/SWE-bench_Verified" +SPLIT="test" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-swetrain-full-rollout.sh b/scripts/eval-swetrain-full-rollout.sh new file mode 100755 index 000000000000..9441fb2f53aa --- /dev/null +++ b/scripts/eval-swetrain-full-rollout.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/eval-swetrain-lite-rollout.sh b/scripts/eval-swetrain-lite-rollout.sh new file mode 100755 index 000000000000..1f2558839062 --- /dev/null +++ b/scripts/eval-swetrain-lite-rollout.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +OUTPUT_FILE=$1 +NUM_WORKERS=$2 +DATASET="swe-train/swe-train-dev-v1-lite" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FILE $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-eval-lite.sh b/scripts/rollout-eval-lite.sh new file mode 100755 index 000000000000..2d111b991003 --- /dev/null +++ b/scripts/rollout-eval-lite.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +N_RUNS=${3:-1} +DATASET="princeton-nlp/SWE-bench_Lite" +SPLIT="test" + +echo "MODEL: $MODEL" +echo "NUM_WORKERS: $NUM_WORKERS" +echo "N_RUNS: $N_RUNS" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=300 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-eval-verified.sh b/scripts/rollout-eval-verified.sh new file mode 100755 index 000000000000..ecffc1ac108a --- /dev/null +++ b/scripts/rollout-eval-verified.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +N_RUNS=${3:-1} +DATASET="princeton-nlp/SWE-bench_Verified" +SPLIT="test" + +echo "MODEL: $MODEL" +echo "NUM_WORKERS: $NUM_WORKERS" +echo "N_RUNS: $N_RUNS" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=500 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-eval.sh b/scripts/rollout-eval.sh new file mode 100755 index 000000000000..ac46effca15e --- /dev/null +++ b/scripts/rollout-eval.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +export EXP_NAME="t0" # use this to differentiate between different runs +MODEL=$1 +NUM_WORKERS=${2:-64} +DATASET="swe-train/SWE-bench_lite" +SPLIT="test100" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" +export EXP_NAME=$EXP_NAME + +EVAL_LIMIT=300 +MAX_ITER=100 + +./evaluation/swe_bench/scripts/run_infer.sh $MODEL HEAD CodeActAgent $EVAL_LIMIT $MAX_ITER $NUM_WORKERS $DATASET $SPLIT diff --git a/scripts/rollout-swe-train-full.sh b/scripts/rollout-swe-train-full.sh new file mode 100755 index 000000000000..fcd3d0272a0b --- /dev/null +++ b/scripts/rollout-swe-train-full.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +MODEL=$1 +EXP_NAME=$2 # "train-t0" +N_RUNS=${3:-20} +export EXP_NAME=$EXP_NAME +echo "MODEL: $MODEL" +echo "EXP_NAME: $EXP_NAME" +DATASET="swe-train/swe-train-dev-v1" +SPLIT="train.v2.success" + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." 
+ exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +EVAL_LIMIT=3000 +MAX_ITER=50 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS diff --git a/scripts/rollout-swe-train-lite.sh b/scripts/rollout-swe-train-lite.sh new file mode 100755 index 000000000000..77173ade2b30 --- /dev/null +++ b/scripts/rollout-swe-train-lite.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +MODEL=$1 +EXP_NAME=$2 # "train-t0" +export EXP_NAME=$EXP_NAME +echo "MODEL: $MODEL" +echo "EXP_NAME: $EXP_NAME" +DATASET="swe-train/swe-train-dev-v1-lite" +SPLIT="train.v2.success" +N_RUNS=1 + +if [ -z "$ALLHANDS_API_KEY" ]; then + echo "ALLHANDS_API_KEY is not set. Please set it and run the script again." + exit 1 +fi + +export RUNTIME=remote +export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" + +EVAL_LIMIT=2000 +MAX_ITER=50 +NUM_WORKERS=64 + +./evaluation/swe_bench/scripts/run_infer.sh \ + $MODEL HEAD CodeActAgent \ + $EVAL_LIMIT $MAX_ITER $NUM_WORKERS \ + $DATASET $SPLIT $N_RUNS
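Usage note for the new wrapper scripts under scripts/: all of them expect ALLHANDS_API_KEY to be exported and point at the remote runtime at https://runtime.eval.all-hands.dev. An illustrative sequence (the model config name and output path below are placeholders, not values from this patch): first roll out with ./scripts/rollout-eval-lite.sh llm.my-model 64 1, where llm.my-model names an LLM config section, 64 is the worker count, and 1 the number of runs; then score the resulting output file with ./scripts/eval-swebench-lite.sh <path-to>/output.jsonl 64. The rollout-* scripts forward the model, iteration limit, worker count, dataset, and split to evaluation/swe_bench/scripts/run_infer.sh, while the eval-* scripts forward the output file, worker count, dataset, and split to evaluation/swe_bench/scripts/eval_infer_remote.sh.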