From 33b79a7f3d664ca90fa4f396c89dd8dbc2da090b Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 16 Jan 2025 12:42:48 -0500 Subject: [PATCH 1/3] feat(eval): misc SWE-Bench improvement - use different resources for different instances --- evaluation/benchmarks/swe_bench/eval_infer.py | 105 +++++----- .../benchmarks/swe_bench/resource/mapping.py | 40 ++++ ...rinceton-nlp__SWE-bench_Verified-test.json | 1 + evaluation/benchmarks/swe_bench/run_infer.py | 25 ++- .../scripts/eval/combine_final_completions.py | 69 +++++++ .../eval/convert_oh_output_to_swe_json.py | 3 +- .../scripts/eval/update_output_with_eval.py | 191 +++++++++++------- .../benchmarks/swe_bench/scripts/run_infer.sh | 11 + 8 files changed, 317 insertions(+), 128 deletions(-) create mode 100644 evaluation/benchmarks/swe_bench/resource/mapping.py create mode 100644 evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json create mode 100644 evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py index 7beacf344408..3c222edd6b9e 100644 --- a/evaluation/benchmarks/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -1,3 +1,4 @@ +import json import os import tempfile import time @@ -11,7 +12,11 @@ ) from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec from swebench.harness.utils import load_swebench_dataset +from tqdm import tqdm +from evaluation.benchmarks.swe_bench.resource.mapping import ( + get_instance_resource_factor, +) from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image from evaluation.utils.shared import ( EvalMetadata, @@ -81,10 +86,14 @@ def get_config(instance: pd.Series) -> AppConfig: base_container_image=base_container_image, use_host_network=False, # large enough timeout, since some testcases take very long to run - timeout=1800, + timeout=600, api_key=os.environ.get('ALLHANDS_API_KEY', None), remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), remote_runtime_init_timeout=3600, + remote_runtime_resource_factor=get_instance_resource_factor( + dataset_name=metadata.dataset, + instance_id=instance['instance_id'], + ), ), # do not mount workspace workspace_base=None, @@ -151,52 +160,52 @@ def process_instance( if runtime_failure_count > 0: config.sandbox.remote_runtime_resource_factor = min( config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count), - 4, # hardcode maximum resource factor to 4 + 8, ) logger.warning( - f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' + f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' ) - runtime = create_runtime(config) - call_async_from_sync(runtime.connect) - # Get patch and save it to /tmp/patch.diff - with tempfile.TemporaryDirectory() as temp_dir: - # Patch file - patch_file_path = os.path.join(temp_dir, 'patch.diff') - with open(patch_file_path, 'w') as f: - f.write(model_patch) - runtime.copy_to(patch_file_path, '/tmp') - # Eval script - eval_script_path = os.path.join(temp_dir, 'eval.sh') - with open(eval_script_path, 'w') as f: - f.write(test_spec.eval_script) - runtime.copy_to(eval_script_path, '/tmp') - - # Set +x - action = CmdRunAction(command='chmod +x /tmp/eval.sh') - action.timeout = 600 - 
logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert obs.exit_code == 0 - - # Apply patch - exec_command = ( - 'cd /testbed && ' - "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || " - "(echo 'Failed to apply patch with git apply, trying with patch command...' && " - "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || " - "echo 'APPLY_PATCH_FAIL')))" - ) - action = CmdRunAction(command=exec_command) - action.timeout = 600 - obs = runtime.run_action(action) - assert isinstance(obs, CmdOutputObservation) - apply_patch_output = obs.content - assert isinstance(apply_patch_output, str) - instance['test_result']['apply_patch_output'] = apply_patch_output - try: + runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + # Get patch and save it to /tmp/patch.diff + with tempfile.TemporaryDirectory() as temp_dir: + # Patch file + patch_file_path = os.path.join(temp_dir, 'patch.diff') + with open(patch_file_path, 'w') as f: + f.write(model_patch) + runtime.copy_to(patch_file_path, '/tmp') + # Eval script + eval_script_path = os.path.join(temp_dir, 'eval.sh') + with open(eval_script_path, 'w') as f: + f.write(test_spec.eval_script) + runtime.copy_to(eval_script_path, '/tmp') + + # Set +x + action = CmdRunAction(command='chmod +x /tmp/eval.sh') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + + # Apply patch + exec_command = ( + 'cd /testbed && ' + "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || " + "(echo 'Failed to apply patch with git apply, trying with patch command...' && " + "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || " + "echo 'APPLY_PATCH_FAIL')))" + ) + action = CmdRunAction(command=exec_command) + action.timeout = 600 + obs = runtime.run_action(action) + assert isinstance(obs, CmdOutputObservation) + apply_patch_output = obs.content + assert isinstance(apply_patch_output, str) + instance['test_result']['apply_patch_output'] = apply_patch_output + if 'APPLY_PATCH_FAIL' in apply_patch_output: logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}') instance['test_result']['report']['failed_apply_patch'] = True @@ -212,7 +221,7 @@ def process_instance( # Run eval script in background and save output to log file log_file = '/tmp/eval_output.log' action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!') - action.timeout = 60 # Short timeout just to get the process ID + action.timeout = 300 # Short timeout just to get the process ID obs = runtime.run_action(action) if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0: @@ -235,7 +244,7 @@ def process_instance( check_action = CmdRunAction( command=f'ps -p {pid} > /dev/null; echo $?' ) - check_action.timeout = 60 + check_action.timeout = 300 check_obs = runtime.run_action(check_action) if ( isinstance(check_obs, CmdOutputObservation) @@ -352,7 +361,13 @@ def process_instance( # Load predictions assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.' 
- predictions = pd.read_json(args.input_file, lines=True) + required_fields = ['instance_id', 'model_patch', 'test_result'] + predictions = pd.DataFrame.from_records( + [ + {k: v for k, v in json.loads(line).items() if k in required_fields} + for line in tqdm(open(args.input_file), desc='Loading predictions') + ] + ) assert ( 'instance_id' in predictions.columns ), 'Input file must contain instance_id column.' diff --git a/evaluation/benchmarks/swe_bench/resource/mapping.py b/evaluation/benchmarks/swe_bench/resource/mapping.py new file mode 100644 index 000000000000..755591238116 --- /dev/null +++ b/evaluation/benchmarks/swe_bench/resource/mapping.py @@ -0,0 +1,40 @@ +"""Mapping instance_id to resource_factor. + +Different instances may have different resource requirements. +e.g., some instances may require more memory/CPU to run inference. +This file tracks the resource requirements of different instances. +""" + +import json +import os +from typing import Dict + +from openhands.core.logger import openhands_logger as logger + +CUR_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_RUNTIME_RESOURCE_FACTOR = int( + os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1) +) + +# dataset to resource mapping +_global_resource_mapping: Dict[str, Dict[str, float]] = {} + + +def get_resource_mapping(dataset_name: str) -> Dict[str, float]: + if dataset_name not in _global_resource_mapping: + file_path = os.path.join(CUR_DIR, f'{dataset_name}.json') + if not os.path.exists(file_path): + logger.warning(f'Resource mapping for {dataset_name} not found.') + return None + + with open(file_path, 'r') as f: + _global_resource_mapping[dataset_name] = json.load(f) + logger.info(f'Loaded resource mapping for {dataset_name}') + return _global_resource_mapping[dataset_name] + + +def get_instance_resource_factor(dataset_name: str, instance_id: str) -> int: + resource_mapping = get_resource_mapping(dataset_name) + if resource_mapping is None: + return DEFAULT_RUNTIME_RESOURCE_FACTOR + return int(resource_mapping.get(instance_id, DEFAULT_RUNTIME_RESOURCE_FACTOR)) diff --git a/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json b/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json new file mode 100644 index 000000000000..161ab736da08 --- /dev/null +++ b/evaluation/benchmarks/swe_bench/resource/princeton-nlp__SWE-bench_Verified-test.json @@ -0,0 +1 @@ +{"pydata__xarray-6721": 8, "pytest-dev__pytest-7236": 8, "matplotlib__matplotlib-24627": 4, "django__django-15561": 4, "django__django-15098": 4, "django__django-14771": 4, "sympy__sympy-21612": 4, "sympy__sympy-15345": 4, "psf__requests-5414": 4, "astropy__astropy-14508": 2, "django__django-11451": 2, "django__django-11477": 2, "django__django-10880": 2, "django__django-11163": 2, "django__django-11815": 2, "astropy__astropy-14369": 2, "django__django-10097": 2, "django__django-10554": 2, "django__django-12304": 2, "django__django-12325": 2, "django__django-11551": 2, "django__django-11734": 2, "django__django-13109": 2, "django__django-13089": 2, "django__django-13343": 2, "django__django-13363": 2, "django__django-13809": 2, "django__django-13810": 2, "django__django-13786": 2, "django__django-13807": 2, "django__django-14493": 2, "django__django-11820": 2, "django__django-11951": 2, "django__django-11964": 2, "astropy__astropy-14309": 2, "astropy__astropy-14365": 2, "astropy__astropy-12907": 2, "astropy__astropy-14182": 2, "django__django-15161": 2, "django__django-15128": 2, 
"django__django-14999": 2, "django__django-14915": 2, "django__django-14752": 2, "django__django-14765": 2, "django__django-14089": 2, "django__django-15252": 2, "django__django-15380": 2, "django__django-15382": 2, "django__django-15499": 2, "django__django-15467": 2, "django__django-15280": 2, "django__django-15315": 2, "django__django-15277": 2, "django__django-15268": 2, "django__django-15629": 2, "django__django-15695": 2, "django__django-15732": 2, "django__django-15863": 2, "django__django-16082": 2, "django__django-16145": 2, "django__django-16256": 2, "django__django-16429": 2, "django__django-16454": 2, "django__django-16493": 2, "matplotlib__matplotlib-13989": 2, "matplotlib__matplotlib-20488": 2, "django__django-15503": 2, "django__django-15525": 2, "django__django-15375": 2, "django__django-15278": 2, "matplotlib__matplotlib-21568": 2, "matplotlib__matplotlib-20859": 2, "matplotlib__matplotlib-20826": 2, "matplotlib__matplotlib-20676": 2, "matplotlib__matplotlib-23412": 2, "matplotlib__matplotlib-22719": 2, "matplotlib__matplotlib-23299": 2, "matplotlib__matplotlib-22865": 2, "matplotlib__matplotlib-24149": 2, "matplotlib__matplotlib-24177": 2, "matplotlib__matplotlib-24570": 2, "matplotlib__matplotlib-24637": 2, "matplotlib__matplotlib-24970": 2, "matplotlib__matplotlib-23476": 2, "matplotlib__matplotlib-24026": 2, "matplotlib__matplotlib-23314": 2, "matplotlib__matplotlib-25332": 2, "matplotlib__matplotlib-25311": 2, "matplotlib__matplotlib-25122": 2, "matplotlib__matplotlib-25479": 2, "matplotlib__matplotlib-26342": 2, "psf__requests-2317": 2, "matplotlib__matplotlib-25960": 2, "matplotlib__matplotlib-25775": 2, "pydata__xarray-4356": 2, "pydata__xarray-4075": 2, "pydata__xarray-6461": 2, "pydata__xarray-4687": 2, "pydata__xarray-6599": 2, "pylint-dev__pylint-4661": 2, "django__django-15554": 2, "django__django-15563": 2, "pytest-dev__pytest-5262": 2, "pytest-dev__pytest-10081": 2, "scikit-learn__scikit-learn-12973": 2, "scikit-learn__scikit-learn-13124": 2, "scikit-learn__scikit-learn-13779": 2, "scikit-learn__scikit-learn-14141": 2, "scikit-learn__scikit-learn-13439": 2, "scikit-learn__scikit-learn-13496": 2, "scikit-learn__scikit-learn-15100": 2, "scikit-learn__scikit-learn-25102": 2, "scikit-learn__scikit-learn-25232": 2, "scikit-learn__scikit-learn-25747": 2, "scikit-learn__scikit-learn-26323": 2, "scikit-learn__scikit-learn-9288": 2, "scikit-learn__scikit-learn-14496": 2, "scikit-learn__scikit-learn-14629": 2, "sphinx-doc__sphinx-8265": 2, "sphinx-doc__sphinx-8548": 2, "sphinx-doc__sphinx-8593": 2, "sphinx-doc__sphinx-8595": 2, "sphinx-doc__sphinx-8621": 2, "sphinx-doc__sphinx-8638": 2, "sphinx-doc__sphinx-9229": 2, "sphinx-doc__sphinx-9281": 2, "sphinx-doc__sphinx-9461": 2, "sphinx-doc__sphinx-9591": 2, "sphinx-doc__sphinx-9658": 2, "sphinx-doc__sphinx-9673": 2, "sympy__sympy-12096": 2, "sympy__sympy-12481": 2, "sphinx-doc__sphinx-10323": 2, "sphinx-doc__sphinx-7590": 2, "sympy__sympy-13877": 2, "sympy__sympy-12489": 2, "sympy__sympy-15809": 2, "sympy__sympy-14711": 2, "sympy__sympy-16597": 2, "sympy__sympy-16766": 2, "sympy__sympy-16792": 2, "sympy__sympy-15875": 2, "sympy__sympy-17655": 2, "sympy__sympy-18189": 2, "sympy__sympy-18763": 2, "sympy__sympy-19040": 2, "sympy__sympy-19495": 2, "sympy__sympy-19637": 2, "sympy__sympy-19783": 2, "sympy__sympy-17630": 2, "sympy__sympy-20428": 2, "sympy__sympy-20590": 2, "sympy__sympy-20801": 2, "sympy__sympy-21379": 2, "sympy__sympy-21847": 2, "sympy__sympy-22456": 2, "sympy__sympy-22714": 2, "sympy__sympy-22914": 2, 
"sympy__sympy-23262": 2, "sympy__sympy-23413": 2, "sympy__sympy-23534": 2, "sympy__sympy-24066": 2, "sympy__sympy-24213": 2, "sympy__sympy-24443": 2, "sympy__sympy-24562": 2, "sympy__sympy-24661": 2} diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index bf065ada9734..4b07fde45128 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -9,6 +9,9 @@ from datasets import load_dataset import openhands.agenthub +from evaluation.benchmarks.swe_bench.resource.mapping import ( + get_instance_resource_factor, +) from evaluation.utils.shared import ( EvalException, EvalMetadata, @@ -41,9 +44,10 @@ from openhands.utils.shutdown_listener import sleep_if_should_continue USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true' -USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true' +USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'true').lower() == 'true' RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true' + AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { 'CodeActAgent': codeact_user_response, } @@ -135,6 +139,10 @@ def get_config( remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), keep_runtime_alive=False, remote_runtime_init_timeout=3600, + remote_runtime_resource_factor=get_instance_resource_factor( + dataset_name=metadata.dataset, + instance_id=instance['instance_id'], + ), ), # do not mount workspace workspace_base=None, @@ -239,7 +247,7 @@ def initialize_runtime( assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}') action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh') - action.timeout = 3600 + action.timeout = 600 logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -351,7 +359,7 @@ def complete_runtime( action = CmdRunAction( command=f'git diff --no-color --cached {instance["base_commit"]}' ) - action.timeout = 600 + 100 * n_retries + action.timeout = max(300 + 100 * n_retries, 600) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -399,7 +407,7 @@ def process_instance( 8, ) logger.warning( - f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' + f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' ) runtime = create_runtime(config) call_async_from_sync(runtime.connect) @@ -479,6 +487,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: subset = dataset[dataset[filter_column].isin(selected_ids)] logger.info(f'Retained {subset.shape[0]} tasks after filtering') return subset + skip_ids = os.environ.get('SKIP_IDS', '').split(',') + if len(skip_ids) > 0: + logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...') + return dataset[~dataset[filter_column].isin(skip_ids)] return dataset @@ -501,8 +513,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing # so we don't need to manage file uploading to OpenHands's repo dataset = load_dataset(args.dataset, split=args.split) - logger.info(f'Loaded dataset {args.dataset} 
with split {args.split}') swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id') + logger.info( + f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks' + ) llm_config = None if args.llm_config: @@ -531,6 +545,7 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: ) output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + print(f'### OUTPUT FILE: {output_file} ###') instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit) if len(instances) > 0 and not isinstance( diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py b/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py new file mode 100644 index 000000000000..6fa5aeda83f7 --- /dev/null +++ b/evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py @@ -0,0 +1,69 @@ +import argparse +import gzip +import json +import os +from glob import glob + +from tqdm import tqdm + +tqdm.pandas() + + +# Load trajectories for resolved instances +def load_completions(output_dir: str, instance_id: str): + glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json') + files = sorted(glob(glob_path)) # this is ascending order + # pick the last file (last turn) + try: + file_path = files[-1] + except IndexError: + # print(f'No files found for instance {instance_id}: files={files}') + return None + with open(file_path, 'r') as f: + result = json.load(f) + # create messages + messages = result['messages'] + messages.append(result['response']['choices'][0]['message']) + tools = result['kwargs']['tools'] + return { + 'messages': messages, + 'tools': tools, + } + + +parser = argparse.ArgumentParser() +parser.add_argument('jsonl_path', type=str) +args = parser.parse_args() + +output_dir = os.path.dirname(args.jsonl_path) +output_path = os.path.join(output_dir, 'output.with_completions.jsonl.gz') + +# Check if output would be different from input +needs_update = False +with open(args.jsonl_path, 'r') as f_in: + for line in tqdm(f_in, desc='Checking for changes'): + data = json.loads(line) + new_completions = load_completions(output_dir, data['instance_id']) + current_completions = data.get('raw_completions') + if current_completions != new_completions: + needs_update = True + break + +if not needs_update: + print('No updates required. Skipping file update.') + exit(0) + +if os.path.exists(output_path): + print(f'Output file already exists at {output_path}, overwriting? 
(y/n)') + if input() != 'y': + print('Exiting...') + exit(0) + +# Process line by line +with open(args.jsonl_path, 'r') as f_in, gzip.open(output_path, 'wt') as f_out: + for line in tqdm(f_in): + data = json.loads(line) + data['raw_completions'] = load_completions(output_dir, data['instance_id']) + f_out.write(json.dumps(data) + '\n') + +print(f'Saved compressed output to {output_path}') diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py index f333012f489a..69000106c6c2 100644 --- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py @@ -22,7 +22,8 @@ def convert_row_to_swebench_format(row): elif 'test_result' in row and 'git_patch' in row['test_result']: model_patch = row['test_result']['git_patch'] else: - raise ValueError(f'Row {row} does not have a git_patch') + print(f'WARNING: Row {row} does not have a git_patch') + model_patch = '' return { 'instance_id': row['instance_id'], diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py index d9c5c540f24b..f8527acd7a6c 100644 --- a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py @@ -3,7 +3,7 @@ import os from collections import defaultdict -import pandas as pd +from tqdm import tqdm parser = argparse.ArgumentParser() parser.add_argument('input_file', type=str) @@ -11,8 +11,7 @@ dirname = os.path.dirname(args.input_file) -df = pd.read_json(args.input_file, lines=True) - +# Initialize counters and data structures instance_id_to_status = defaultdict( lambda: { 'empty_generation': False, @@ -23,15 +22,7 @@ } ) - -# Apply the status to the dataframe -def apply_report(row): - instance_id = row['instance_id'] - if instance_id in instance_id_to_status: - return dict(instance_id_to_status[instance_id]) - return row.get('report', {}) - - +# Process official report if it exists swebench_official_report_json = os.path.join(dirname, 'report.json') openhands_remote_report_jsonl = args.input_file.replace( '.jsonl', '.swebench_eval.jsonl' @@ -90,113 +81,159 @@ def apply_report(row): f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n' ) - df['report'] = df.apply(apply_report, axis=1) - with open(output_md_filepath, 'w') as f: f.write(output_md) elif os.path.exists(openhands_remote_report_jsonl): output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md') - df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records') - - assert len(df['instance_id'].unique()) == len( - df - ), 'There are duplicate instance ids in the original output which is not allowed' - assert len(df_eval['instance_id'].unique()) == len( - df_eval - ), 'There are duplicate instance ids in the eval report which is not allowed' - - for _, row in df_eval.iterrows(): - instance_id_to_status[row['instance_id']] = row['test_result']['report'] - df['report'] = df.apply(apply_report, axis=1) - - report_is_dict = df['report'].apply(lambda x: isinstance(x, dict)) - if not report_is_dict.all(): - print(df[~report_is_dict]) - raise ValueError(f'Report is not a dict, but a {type(row["report"])}') - - _n_instances = len(df) - _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))]) - _n_unresolved 
= _n_instances - _n_resolved - _n_empty_patch = len( - df[df['report'].apply(lambda x: x.get('empty_generation', False))] - ) - _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))]) + # First pass: Read eval report and count instances + instance_ids = set() + eval_instance_ids = set() + + # Count instances in original file + n_instances = 0 + with open(args.input_file, 'r') as f: + for line in tqdm(f, desc='Counting instances in original file'): + data = json.loads(line) + instance_ids.add(data['instance_id']) + n_instances += 1 + print(f'Total instances in original file: {n_instances}') + + # Process eval report + n_eval_instances = 0 + with open(openhands_remote_report_jsonl, 'r') as f: + for line in tqdm(f, desc='Processing eval report'): + data = json.loads(line) + instance_id = data['instance_id'] + eval_instance_ids.add(instance_id) + n_eval_instances += 1 + instance_id_to_status[instance_id] = data['test_result']['report'] + print(f'Total instances in eval report: {n_eval_instances}') + + # Verify no duplicates + assert ( + len(instance_ids) == n_instances + ), 'Duplicate instance ids found in original output' + assert ( + len(eval_instance_ids) == n_eval_instances + ), 'Duplicate instance ids found in eval report' + + # Initialize counters + stats = {'total': len(instance_ids), 'resolved': 0, 'empty_patch': 0, 'error': 0} + + # Collect instance IDs by category + resolved_ids = [] + unresolved_ids = [] + error_ids = [] + empty_patch_ids = [] + timeout_ids = [] + + # Process original file and categorize instances + with open(args.input_file, 'r') as f: + for line in f: + data = json.loads(line) + instance_id = data['instance_id'] + report = instance_id_to_status[instance_id] + + if report.get('resolved', False): + stats['resolved'] += 1 + resolved_ids.append(instance_id) + else: + unresolved_ids.append(instance_id) + + if report.get('empty_generation', False): + stats['empty_patch'] += 1 + empty_patch_ids.append(instance_id) + if report.get('error_eval', False): + stats['error'] += 1 + error_ids.append(instance_id) + if report.get('test_timeout', False): + timeout_ids.append(instance_id) + + # Generate markdown report + def _instance_id_to_log_path(instance_id): + path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log" + return os.path.relpath(path, start=dirname) + + # ... rest of markdown generation code remains the same ... 
output_md = ( '# SWE-bench Report\n' 'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n' '## Summary\n' - f'- submitted instances: {_n_instances}\n' - f'- empty patch instances: {_n_empty_patch}\n' - f'- resolved instances: {_n_resolved}\n' - f'- unresolved instances: {_n_unresolved}\n' - f'- error instances: {_n_error}\n' + f'- submitted instances: {stats["total"]}\n' + f'- empty patch instances: {stats["empty_patch"]}\n' + f'- resolved instances: {stats["resolved"]}\n' + f'- unresolved instances: {len(unresolved_ids)}\n' + f'- error instances: {stats["error"]}\n' ) - def _instance_id_to_log_path(instance_id): - path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log" - # make it relative path - path = os.path.relpath(path, start=dirname) - return path - output_md += '\n## Resolved Instances\n' # instance_id to status - for instance_id in sorted( - df[df['report'].apply(lambda x: x.get('resolved', False))][ - 'instance_id' - ].unique() - ): + for instance_id in resolved_ids: instance_id_to_status[instance_id]['resolved'] = True output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' output_md += '\n## Unresolved Instances\n' - for instance_id in sorted( - df[~df['report'].apply(lambda x: x.get('resolved', False))][ - 'instance_id' - ].unique() - ): + for instance_id in unresolved_ids: output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' output_md += '\n## Error Instances\n' - for instance_id in sorted( - df[df['report'].apply(lambda x: x.get('error_eval', False))][ - 'instance_id' - ].unique() - ): + for instance_id in error_ids: instance_id_to_status[instance_id]['error_eval'] = True output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' output_md += '\n## Empty Patch Instances\n' - for instance_id in sorted( - df[df['report'].apply(lambda x: x.get('empty_generation', False))][ - 'instance_id' - ].unique() - ): + for instance_id in empty_patch_ids: instance_id_to_status[instance_id]['empty_generation'] = True output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' output_md += '\n## Incomplete Instances\n' - for instance_id in sorted( - df[df['report'].apply(lambda x: x.get('test_timeout', False))][ - 'instance_id' - ].unique() - ): + for instance_id in timeout_ids: output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n' + with open(output_md_filepath, 'w') as f: f.write(output_md) + else: print( f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.' ) exit() +# Before backup and update, check if any changes would be made +needs_update = False +with open(args.input_file, 'r') as infile: + for line in tqdm(infile, desc='Checking for changes'): + data = json.loads(line) + instance_id = data['instance_id'] + if instance_id in instance_id_to_status: + current_report = data.get('report', {}) + new_report = instance_id_to_status[instance_id] + if current_report != new_report: + needs_update = True + break + +if not needs_update: + print('No updates detected. Skipping file update.') + exit() + +# Backup and update the original file row by row if os.path.exists(args.input_file + '.bak'): conf = input('Existing backup file found. Do you want to overwrite it? 
(y/n)') if conf != 'y': exit() os.remove(args.input_file + '.bak') -# backup the original file os.rename(args.input_file, args.input_file + '.bak') -df.to_json(args.input_file, orient='records', lines=True) + +# Process and write file row by row +with open(args.input_file + '.bak', 'r') as infile, open( + args.input_file, 'w' +) as outfile: + for line in tqdm(infile, desc='Updating output file'): + data = json.loads(line) + instance_id = data['instance_id'] + if instance_id in instance_id_to_status: + data['report'] = instance_id_to_status[instance_id] + outfile.write(json.dumps(data) + '\n') diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh index b1d375152dc4..d16b548ebacd 100755 --- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -100,6 +100,10 @@ function run_eval() { # Run the command eval $COMMAND + # if exit code is not 0, exit the script + if [ $? -ne 0 ]; then + exit 1 + fi } unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push @@ -108,7 +112,14 @@ if [ -z "$N_RUNS" ]; then echo "N_RUNS not specified, use default $N_RUNS" fi +# Skip runs if the run number is in the SKIP_RUNS list +# read from env variable SKIP_RUNS as a comma separated list of run numbers +SKIP_RUNS=(${SKIP_RUNS//,/ }) for i in $(seq 1 $N_RUNS); do + if [[ " ${SKIP_RUNS[@]} " =~ " $i " ]]; then + echo "Skipping run $i" + continue + fi current_eval_note="$EVAL_NOTE-run_$i" echo "EVAL_NOTE: $current_eval_note" run_eval $current_eval_note From d84c8dc4c0d60caee2098c7db682bed8cdffe6cc Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Thu, 16 Jan 2025 12:48:12 -0500 Subject: [PATCH 2/3] remove extra --- evaluation/benchmarks/swe_bench/scripts/run_infer.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh index d16b548ebacd..73e8bd3a3e55 100755 --- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -100,10 +100,6 @@ function run_eval() { # Run the command eval $COMMAND - # if exit code is not 0, exit the script - if [ $? -ne 0 ]; then - exit 1 - fi } unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push From 9c912b62a178ae678438aa6ac778eea0f4368780 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 16 Jan 2025 18:22:56 +0000 Subject: [PATCH 3/3] Fix pr #6313: feat(eval): misc SWE-Bench improvement - use different resources for different instances --- evaluation/benchmarks/swe_bench/eval_infer.py | 13 +++++++------ evaluation/benchmarks/swe_bench/resource/mapping.py | 6 ++---- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py index f8fd662c958b..52972a920e8e 100644 --- a/evaluation/benchmarks/swe_bench/eval_infer.py +++ b/evaluation/benchmarks/swe_bench/eval_infer.py @@ -362,12 +362,13 @@ def process_instance( # Load predictions assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.' 
required_fields = ['instance_id', 'model_patch', 'test_result'] - predictions = pd.DataFrame.from_records( - [ - {k: v for k, v in json.loads(line).items() if k in required_fields} - for line in tqdm(open(args.input_file), desc='Loading predictions') - ] - ) + with open(args.input_file) as f: + predictions = pd.DataFrame.from_records( + [ + {k: v for k, v in json.loads(line).items() if k in required_fields} + for line in tqdm(f, desc='Loading predictions') + ] + ) assert ( 'instance_id' in predictions.columns ), 'Input file must contain instance_id column.' diff --git a/evaluation/benchmarks/swe_bench/resource/mapping.py b/evaluation/benchmarks/swe_bench/resource/mapping.py index 755591238116..ed2f433c262b 100644 --- a/evaluation/benchmarks/swe_bench/resource/mapping.py +++ b/evaluation/benchmarks/swe_bench/resource/mapping.py @@ -7,8 +7,6 @@ import json import os -from typing import Dict - from openhands.core.logger import openhands_logger as logger CUR_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -17,10 +15,10 @@ ) # dataset to resource mapping -_global_resource_mapping: Dict[str, Dict[str, float]] = {} +_global_resource_mapping: dict[str, dict[str, float]] = {} -def get_resource_mapping(dataset_name: str) -> Dict[str, float]: +def get_resource_mapping(dataset_name: str) -> dict[str, float]: if dataset_name not in _global_resource_mapping: file_path = os.path.join(CUR_DIR, f'{dataset_name}.json') if not os.path.exists(file_path):
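For readers of this patch, the per-instance resource mechanism is spread across three files (resource/mapping.py, the dataset JSON resource/princeton-nlp__SWE-bench_Verified-test.json, and the process_instance retry logic in run_infer.py / eval_infer.py). The following is a minimal standalone sketch of that flow, for illustration only: the resource_dir parameter and the resource_factor_for_attempt helper are introduced here and are not part of the patch (the real mapping.py resolves the JSON next to its own file, and the doubling-with-cap lives inline in process_instance); how metadata.dataset is turned into the JSON file name is also not asserted here, the file-name stem is passed directly.

import json
import os

# Default used when no per-instance mapping applies (same env var as in mapping.py).
DEFAULT_RUNTIME_RESOURCE_FACTOR = int(
    os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1)
)


def get_instance_resource_factor(
    resource_dir: str, dataset_name: str, instance_id: str
) -> int:
    """Look up the per-instance factor; fall back to the default when the
    dataset has no mapping file or the instance is not listed."""
    file_path = os.path.join(resource_dir, f'{dataset_name}.json')
    if not os.path.exists(file_path):
        return DEFAULT_RUNTIME_RESOURCE_FACTOR
    with open(file_path) as f:
        mapping = json.load(f)
    return int(mapping.get(instance_id, DEFAULT_RUNTIME_RESOURCE_FACTOR))


def resource_factor_for_attempt(base_factor: int, runtime_failure_count: int) -> int:
    """Double the base factor for every previous runtime failure, capped at 8
    (the hardcoded maximum used in this patch)."""
    if runtime_failure_count <= 0:
        return base_factor
    return min(base_factor * (2**runtime_failure_count), 8)


if __name__ == '__main__':
    base = get_instance_resource_factor(
        resource_dir='evaluation/benchmarks/swe_bench/resource',
        dataset_name='princeton-nlp__SWE-bench_Verified-test',
        instance_id='django__django-11451',  # listed with factor 2 in the shipped JSON
    )
    # e.g. with a base factor of 2, the first retry yields 4 and the second hits the cap of 8.
    print(resource_factor_for_attempt(base, runtime_failure_count=1))

The same doubling-on-retry behavior is applied in both run_infer.py and eval_infer.py, so a runtime that fails for lack of resources is automatically rescheduled with a larger factor instead of failing the whole instance.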