Skip to content

Commit

Permalink
Merge branch 'main' into kevin
Browse files Browse the repository at this point in the history
  • Loading branch information
SmartManoj committed Sep 5, 2024
2 parents 4394c41 + 688068a commit caafcde
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 23 deletions.
17 changes: 14 additions & 3 deletions evaluation/swe_bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,22 @@ then your command would be:
./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
```

**Evaluate on `RemoteRuntime` (alpha)** (contact Xingyao over slack if you want to try this out!)
### Run Inference on `RemoteRuntime`

This is in limited beta. Contact Xingyao over Slack if you want to try this out!

```bash
# ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
# This example evaluates CodeActAgent on 300 instances from the test split of "princeton-nlp/SWE-bench_Lite", with a maximum of 30 iterations per instance and 16 workers running in parallel
```

To clean up all existing runtimes you've already started, run:

```bash
SANDBOX_API_KEY="CONTACT-XINGYAO-TO-GET-A-TESTING-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300
ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
```
Multi-processing is still WIP.

### Specify a subset of tasks to run infer

Expand Down
12 changes: 2 additions & 10 deletions evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
SandboxConfig,
get_llm_config_arg,
get_parser,
load_from_env,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
Expand Down Expand Up @@ -123,26 +122,19 @@ def get_config(
run_as_openhands=False,
max_budget_per_task=4,
max_iterations=metadata.max_iterations,
runtime=os.environ.get('RUNTIME', 'eventstream'),
sandbox=SandboxConfig(
base_container_image=base_container_image,
enable_auto_lint=True,
use_host_network=False,
# large enough timeout, since some testcases take very long to run
timeout=300,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
),
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
selected_env_vars = {'runtime', 'sandbox_api_key'}
selected_env_vars = {
k: v for k, v in os.environ.items() if k.lower() in selected_env_vars
}
if selected_env_vars:
logger.info(
f'Loading config keys from env vars: {list(selected_env_vars.keys())}'
)
load_from_env(config, selected_env_vars)
config.set_llm_config(metadata.llm_config)
return config

Expand Down
21 changes: 21 additions & 0 deletions evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Stop every remote runtime returned by the runtime list endpoint for the
# given API key.
#
# Usage:
#   ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
#
# Requires: curl, jq
#
# Exit status: 0 when every listed runtime was stopped, non-zero when the API
# key is missing, the list request fails, or any stop request fails.

# Abort on errors, on unset variables, and on a failure anywhere in a pipeline
# (so a failed curl in front of jq is not silently ignored).
set -euo pipefail

# Fail early with a clear message instead of sending an empty X-API-Key header.
if [ -z "${ALLHANDS_API_KEY:-}" ]; then
    echo "Error: ALLHANDS_API_KEY is not set." >&2
    exit 1
fi

# API base URL
BASE_URL="https://api.all-hands.dev/v0"

# Get the list of runtimes.
# --fail turns HTTP error statuses into a non-zero exit code, and --show-error
# keeps the error message visible even though --silent hides the progress bar.
runtimes=$(curl --silent --show-error --fail --location --request GET "${BASE_URL}/runtime/list" \
    --header "X-API-Key: ${ALLHANDS_API_KEY}" | jq -r '.runtimes | .[].runtime_id')

# Loop through each runtime and stop it.
# A failed stop request is recorded but does not abort the loop, so the
# remaining runtimes still get a stop attempt.
failed=0
for runtime_id in $runtimes; do
    echo "Stopping runtime: ${runtime_id}"
    if ! curl --silent --show-error --fail --location --request POST "${BASE_URL}/runtime/stop" \
        --header "X-API-Key: ${ALLHANDS_API_KEY}" \
        --header "Content-Type: application/json" \
        --data-raw "{\"runtime_id\": \"${runtime_id}\"}"; then
        echo "Failed to stop runtime: ${runtime_id}" >&2
        failed=1
    fi
    echo
done

# Only report success when every stop request actually succeeded.
if [ "${failed}" -ne 0 ]; then
    echo "Some runtimes could not be stopped." >&2
    exit 1
fi

echo "All runtimes have been stopped."
25 changes: 20 additions & 5 deletions openhands/runtime/utils/runtime_build.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import hashlib
import os
import shutil
import subprocess
Expand Down Expand Up @@ -185,11 +186,25 @@ def get_runtime_image_repo_and_tag(base_image: str) -> tuple[str, str]:
if ':' not in base_image:
base_image = base_image + ':latest'
[repo, tag] = base_image.split(':')
# replace '/' with '_s_' to avoid '/' in the image name
# while make it a valid docker image name
repo = repo.replace('/', '_s_')
od_version = _get_package_version()
return get_runtime_image_repo(), f'od_v{od_version}_image_{repo}_tag_{tag}'
oh_version = _get_package_version()

# Hash the repo if it's too long
if len(repo) > 32:
repo_hash = hashlib.md5(repo[:-24].encode()).hexdigest()[:8]
repo = f'{repo_hash}_{repo[-24:]}' # Use 8 char hash + last 24 chars
else:
repo = repo.replace('/', '_s_')

new_tag = f'oh_v{oh_version}_image_{repo}_tag_{tag}'

# if it's still too long, hash the entire image name
if len(new_tag) > 128:
new_tag = f'oh_v{oh_version}_image_{hashlib.md5(new_tag.encode()).hexdigest()[:64]}'
logger.warning(
f'The new tag [{new_tag}] is still too long, so we use an hash of the entire image name: {new_tag}'
)

return get_runtime_image_repo(), new_tag


def build_runtime_image(
Expand Down
10 changes: 5 additions & 5 deletions tests/unit/test_runtime_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
prep_docker_build_folder,
)

OD_VERSION = f'od_v{_get_package_version()}'
OH_VERSION = f'oh_v{_get_package_version()}'


@pytest.fixture
Expand Down Expand Up @@ -176,22 +176,22 @@ def test_get_runtime_image_repo_and_tag_eventstream():
img_repo, img_tag = get_runtime_image_repo_and_tag(base_image)
assert (
img_repo == f'{get_runtime_image_repo()}'
and img_tag == f'{OD_VERSION}_image_debian_tag_11'
and img_tag == f'{OH_VERSION}_image_debian_tag_11'
)

base_image = 'nikolaik/python-nodejs:python3.11-nodejs22'
img_repo, img_tag = get_runtime_image_repo_and_tag(base_image)
assert (
img_repo == f'{get_runtime_image_repo()}'
and img_tag
== f'{OD_VERSION}_image_nikolaik_s_python-nodejs_tag_python3.11-nodejs22'
== f'{OH_VERSION}_image_nikolaik_s_python-nodejs_tag_python3.11-nodejs22'
)

base_image = 'ubuntu'
img_repo, img_tag = get_runtime_image_repo_and_tag(base_image)
assert (
img_repo == f'{get_runtime_image_repo()}'
and img_tag == f'{OD_VERSION}_image_ubuntu_tag_latest'
and img_tag == f'{OH_VERSION}_image_ubuntu_tag_latest'
)


Expand All @@ -215,7 +215,7 @@ def test_build_runtime_image_from_scratch(temp_dir):
path=ANY,
tags=[
f'{get_runtime_image_repo()}:{from_scratch_hash}',
f'{get_runtime_image_repo()}:{OD_VERSION}_image_debian_tag_11',
f'{get_runtime_image_repo()}:{OH_VERSION}_image_debian_tag_11',
],
)
assert image_name == f'{get_runtime_image_repo()}:{from_scratch_hash}'
Expand Down

0 comments on commit caafcde

Please sign in to comment.