Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: incorrect soft-timeout implementation & fix hard-timeout follow-up command #6280

Merged
merged 18 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions evaluation/benchmarks/commit0_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def initialize_runtime(
action = CmdRunAction(
command=f'git clone -b commit0_combined https://github.com/{instance["repo"]}.git'
)
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -181,7 +181,7 @@ def initialize_runtime(
)

action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -191,7 +191,7 @@ def initialize_runtime(
)

action = CmdRunAction(command='git checkout -b openhands')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -201,7 +201,7 @@ def initialize_runtime(

# Install commit0
action = CmdRunAction(command='/root/.cargo/bin/uv pip install commit0')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down Expand Up @@ -231,7 +231,7 @@ def complete_runtime(
workspace_dir_name = _get_commit0_workspace_dir_name(instance)

action = CmdRunAction(command='git add .')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -241,7 +241,7 @@ def complete_runtime(
)

action = CmdRunAction(command='git commit -m "openhands edits"')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -258,7 +258,7 @@ def complete_runtime(
action = CmdRunAction(
command=f"git diff {instance['base_commit']} HEAD -- . ':(exclude)spec.pdf.bz2'"
)
action.timeout = 600 + 100 * n_retries
action.set_hard_timeout(600 + 100 * n_retries)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -282,7 +282,7 @@ def complete_runtime(
action = CmdRunAction(
command=f"{instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1"
)
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -292,7 +292,7 @@ def complete_runtime(
)
# Read test output
action = CmdRunAction(command='cat test_output.txt')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -305,7 +305,7 @@ def complete_runtime(

# Save pytest exit code
action = CmdRunAction(command='echo $?')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -318,7 +318,7 @@ def complete_runtime(

# Read the test report
action = CmdRunAction(command='cat report.json')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -330,7 +330,7 @@ def complete_runtime(
repo_name = instance['repo'].split('/')[1]
repo_name = repo_name.replace('.', '-')
action = CmdRunAction(command=f'commit0 get-tests {repo_name}')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
# logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down
10 changes: 5 additions & 5 deletions evaluation/benchmarks/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def process_instance(

# Set +x
action = CmdRunAction(command='chmod +x /tmp/eval.sh')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -189,7 +189,7 @@ def process_instance(
"echo 'APPLY_PATCH_FAIL')))"
)
action = CmdRunAction(command=exec_command)
action.timeout = 600
action.set_hard_timeout(600)
obs = runtime.run_action(action)
assert isinstance(obs, CmdOutputObservation)
apply_patch_output = obs.content
Expand All @@ -212,7 +212,7 @@ def process_instance(
# Run eval script in background and save output to log file
log_file = '/tmp/eval_output.log'
action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!')
action.timeout = 60 # Short timeout just to get the process ID
action.set_hard_timeout(60) # Short timeout just to get the process ID
obs = runtime.run_action(action)

if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
Expand All @@ -235,7 +235,7 @@ def process_instance(
check_action = CmdRunAction(
command=f'ps -p {pid} > /dev/null; echo $?'
)
check_action.timeout = 60
check_action.set_hard_timeout(60)
check_obs = runtime.run_action(check_action)
if (
isinstance(check_obs, CmdOutputObservation)
Expand All @@ -252,7 +252,7 @@ def process_instance(

# Read the log file
cat_action = CmdRunAction(command=f'cat {log_file}')
cat_action.timeout = 300
cat_action.set_hard_timeout(300)
cat_obs = runtime.run_action(cat_action)

# Grade answer
Expand Down
30 changes: 15 additions & 15 deletions evaluation/benchmarks/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def initialize_runtime(
action = CmdRunAction(
command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
)
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -182,7 +182,7 @@ def initialize_runtime(
)

action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -194,7 +194,7 @@ def initialize_runtime(

# inject the instance info
action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down Expand Up @@ -223,14 +223,14 @@ def initialize_runtime(
'/swe_util/',
)
action = CmdRunAction(command='cat ~/.bashrc')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')

action = CmdRunAction(command='source ~/.bashrc')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -239,7 +239,7 @@ def initialize_runtime(
assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
action.timeout = 3600
action.set_hard_timeout(3600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -249,7 +249,7 @@ def initialize_runtime(
)
else:
action = CmdRunAction(command='source /swe_util/swe_entry.sh')
action.timeout = 1800
action.set_hard_timeout(1800)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -259,7 +259,7 @@ def initialize_runtime(
)

action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -269,7 +269,7 @@ def initialize_runtime(
)

action = CmdRunAction(command='git reset --hard')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -278,14 +278,14 @@ def initialize_runtime(
action = CmdRunAction(
command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
)
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

action = CmdRunAction(command='which python')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down Expand Up @@ -316,7 +316,7 @@ def complete_runtime(
workspace_dir_name = _get_swebench_workspace_dir_name(instance)

action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -326,7 +326,7 @@ def complete_runtime(
)

action = CmdRunAction(command='git config --global core.pager ""')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -336,7 +336,7 @@ def complete_runtime(
)

action = CmdRunAction(command='git add -A')
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand All @@ -351,7 +351,7 @@ def complete_runtime(
action = CmdRunAction(
command=f'git diff --no-color --cached {instance["base_commit"]}'
)
action.timeout = 600 + 100 * n_retries
action.set_hard_timeout(600 + 100 * n_retries)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/the_agent_company/browsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def pre_login(
instruction = action.to_instruction()

browser_action = BrowseInteractiveAction(browser_actions=instruction)
browser_action.timeout = 10000
browser_action.set_hard_timeout(10000)
logger.info(browser_action, extra={'msg_type': 'ACTION'})
obs: BrowserOutputObservation = runtime.run_action(browser_action)
logger.debug(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down
4 changes: 2 additions & 2 deletions evaluation/benchmarks/the_agent_company/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def init_task_env(runtime: Runtime, hostname: str, env_llm_config: LLMConfig):
'bash /utils/init.sh'
)
action = CmdRunAction(command=command)
action.timeout = 900
action.set_hard_timeout(900)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down Expand Up @@ -172,7 +172,7 @@ def run_evaluator(
f'python_default /utils/eval.py --trajectory_path {trajectory_path} --result_path {result_path}'
)
action = CmdRunAction(command=command)
action.timeout = 600
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down
17 changes: 16 additions & 1 deletion frontend/src/state/chat-slice.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,22 @@
content = `${
causeMessage.content
}\n\nOutput:\n\`\`\`\n${content.trim() || "[Command finished execution with no output]"}\n\`\`\``;
causeMessage.content = content; // Observation content includes the action

// Only add metadata for 'run' observations
if (observationID === "run") {
const metadata = (observation.payload as CommandObservation).extras.metadata;

Check failure on line 170 in frontend/src/state/chat-slice.ts

View workflow job for this annotation

GitHub Actions / Lint frontend

Use object destructuring

Check failure on line 170 in frontend/src/state/chat-slice.ts

View workflow job for this annotation

GitHub Actions / Lint frontend

Insert `⏎············`
// if metadata.prefix exists, add it to the content
if (metadata.prefix) {
content = `Additional prefix the agent received:\n\`\`\`${metadata.prefix}\n\`\`\`\n\n${content}`;
}
// if metadata.suffix exists, add it to the content
if (metadata.suffix) {
content += `\n\nAdditional suffix the agent received:\n\`\`\`${metadata.suffix}\n\`\`\``;
}
causeMessage.content = content;
} else {
causeMessage.content = content;
}
} else if (observationID === "read" || observationID === "edit") {
const { content } = observation.payload;
causeMessage.content = `\`\`\`${observationID === "edit" ? "diff" : "python"}\n${content}\n\`\`\``; // Content is already truncated by the ACI
Expand Down
4 changes: 2 additions & 2 deletions openhands/core/config/sandbox_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class SandboxConfig:

remote_runtime_api_url: str = 'http://localhost:8000'
local_runtime_url: str = 'http://localhost'
keep_runtime_alive: bool = False
keep_runtime_alive: bool = True
rm_all_containers: bool = False
api_key: str | None = None
base_container_image: str = 'nikolaik/python-nodejs:python3.12-nodejs22' # default to nikolaik/python-nodejs:python3.12-nodejs22 for eventstream runtime
Expand All @@ -60,7 +60,7 @@ class SandboxConfig:
runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
browsergym_eval_env: str | None = None
platform: str | None = None
close_delay: int = 15
close_delay: int = 900
remote_runtime_resource_factor: int = 1
enable_gpu: bool = False
docker_runtime_kwargs: str | None = None
Expand Down
10 changes: 7 additions & 3 deletions openhands/events/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,12 @@ def timeout(self) -> int | None:
return self._timeout # type: ignore[attr-defined]
return None

@timeout.setter
def timeout(self, value: int | None) -> None:
def set_hard_timeout(self, value: int | None, blocking: bool = True) -> None:
"""Set the timeout for the event.

NOTE, this is a hard timeout, meaning that the event will be blocked
until the timeout is reached.
"""
self._timeout = value
if value is not None and value > 600:
from openhands.core.logger import openhands_logger as logger
Expand All @@ -78,7 +82,7 @@ def timeout(self, value: int | None) -> None:
# Check if .blocking is an attribute of the event
if hasattr(self, 'blocking'):
# .blocking needs to be set to True if .timeout is set
self.blocking = True
self.blocking = blocking

# optional metadata, LLM call cost of the edit
@property
Expand Down
3 changes: 2 additions & 1 deletion openhands/events/serialization/action.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ def action_from_dict(action: dict) -> Action:
try:
decoded_action = action_class(**args)
if 'timeout' in action:
decoded_action.timeout = action['timeout']
blocking = args.get('blocking', False)
decoded_action.set_hard_timeout(action['timeout'], blocking=blocking)

# Set timestamp if it was provided
if timestamp:
Expand Down
2 changes: 1 addition & 1 deletion openhands/resolver/resolve_issue.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ async def complete_runtime(
git_patch = None
while n_retries < 5:
action = CmdRunAction(command=f'git diff --no-color --cached {base_commit}')
action.timeout = 600 + 100 * n_retries
action.set_hard_timeout(600 + 100 * n_retries)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
Expand Down
Loading
Loading