diff --git a/.github/workflows/dummy-agent-test.yml b/.github/workflows/dummy-agent-test.yml index bac3fd8bb8da..9772dfda4a88 100644 --- a/.github/workflows/dummy-agent-test.yml +++ b/.github/workflows/dummy-agent-test.yml @@ -36,6 +36,8 @@ jobs: - name: Set up Docker Buildx id: buildx uses: docker/setup-buildx-action@v3 + - name: Install tmux + run: sudo apt-get update && sudo apt-get install -y tmux - name: Install poetry via pipx run: pipx install poetry - name: Set up Python diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml index 68463a47ae06..6efc51acbfa7 100644 --- a/.github/workflows/eval-runner.yml +++ b/.github/workflows/eval-runner.yml @@ -29,6 +29,8 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Install tmux + run: sudo apt-get update && sudo apt-get install -y tmux - name: Install poetry via pipx run: pipx install poetry diff --git a/.github/workflows/py-unit-tests-mac.yml b/.github/workflows/py-unit-tests-mac.yml index 93897000c4a9..4d9e8bf5cbd4 100644 --- a/.github/workflows/py-unit-tests-mac.yml +++ b/.github/workflows/py-unit-tests-mac.yml @@ -31,6 +31,8 @@ jobs: key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }} restore-keys: | ${{ runner.os }}-poetry- + - name: Install tmux + run: brew install tmux - name: Install poetry via pipx run: pipx install poetry - name: Install Python dependencies using Poetry diff --git a/.github/workflows/py-unit-tests.yml b/.github/workflows/py-unit-tests.yml index ebf159b8c3f7..1e42ccc2d82c 100644 --- a/.github/workflows/py-unit-tests.yml +++ b/.github/workflows/py-unit-tests.yml @@ -30,6 +30,8 @@ jobs: - name: Set up Docker Buildx id: buildx uses: docker/setup-buildx-action@v3 + - name: Install tmux + run: sudo apt-get update && sudo apt-get install -y tmux - name: Install poetry via pipx run: pipx install poetry - name: Set up Python diff --git a/docs/static/img/backend_architecture.puml b/docs/static/img/backend_architecture.puml index 9f5564a52a8d..c96651ab469b 100644 --- a/docs/static/img/backend_architecture.puml +++ b/docs/static/img/backend_architecture.puml @@ -123,7 +123,6 @@ class openhands.state.State { updated_info: List[Tuple[Action, Observation]] } class openhands.observation.CmdOutputObservation { - command_id: int command: str exit_code: int observation: str diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py index cf1dc6bba97c..554ddc66488f 100644 --- a/evaluation/benchmarks/agent_bench/run_infer.py +++ b/evaluation/benchmarks/agent_bench/run_infer.py @@ -137,7 +137,6 @@ def complete_runtime( action = CmdRunAction( command=f'chmod +x ./{script_name} && ./{script_name}', - keep_prompt=False, ) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) @@ -164,8 +163,7 @@ def complete_runtime( logger.info(f'Running get ground truth cmd: {script_name}') action = CmdRunAction( - command=f'chmod +x ./{script_name} && ./{script_name}', - keep_prompt=False, + command=f'chmod +x ./{script_name} && ./{script_name}' ) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py index f0e86f30380e..51179724e239 100644 --- a/evaluation/benchmarks/aider_bench/run_infer.py +++ b/evaluation/benchmarks/aider_bench/run_infer.py @@ -145,10 +145,7 @@ def complete_runtime( ) logger.info(f'Running test file: {script_name}') - action = CmdRunAction( - command=f'python3 -m unittest {script_name}', - keep_prompt=False, - ) + action = CmdRunAction(command=f'python3 -m unittest {script_name}') logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py index ba8eb4d17b20..dfc0eaf1f912 100644 --- a/evaluation/benchmarks/biocoder/run_infer.py +++ b/evaluation/benchmarks/biocoder/run_infer.py @@ -199,7 +199,7 @@ def complete_runtime( if obs.exit_code == 0: test_result['metadata']['1_copy_change_success'] = True - action = CmdRunAction(command=f'cat {generated_path}', keep_prompt=False) + action = CmdRunAction(command=f'cat {generated_path}') logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) assert obs.exit_code == 0 @@ -223,9 +223,7 @@ def complete_runtime( logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 - action = CmdRunAction( - command='cat /testing_files/results_biocoder.json', keep_prompt=False - ) + action = CmdRunAction(command='cat /testing_files/results_biocoder.json') logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) if obs.exit_code == 0: diff --git a/evaluation/benchmarks/bird/README.md b/evaluation/benchmarks/bird/README.md index 41874fe99f59..b13df87f22a3 100644 --- a/evaluation/benchmarks/bird/README.md +++ b/evaluation/benchmarks/bird/README.md @@ -127,7 +127,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili "observation": "run", "content": "california_schools/california_schools.sqlite\r\n[(1.0,)]", "extras": { - "command_id": -1, "command": "python3 0.py", "exit_code": 0 } diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py index 45ddf582dc64..3254e4a2c791 100644 --- a/evaluation/benchmarks/bird/run_infer.py +++ b/evaluation/benchmarks/bird/run_infer.py @@ -268,10 +268,7 @@ def initialize_runtime( runtime.copy_to(db_file, '/workspace') # Check the database is copied - action = CmdRunAction( - command='cd /workspace && ls -l', - keep_prompt=False, - ) + action = CmdRunAction(command='cd /workspace && ls -l') obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.exit_code == 0 @@ -300,10 +297,7 @@ def complete_runtime( instance_id = instance.instance_id.replace('/', '__') path = os.path.join('/workspace', f'{instance_id}.py') - action = CmdRunAction( - command=f'cat {path}', - keep_prompt=False, - ) + action = CmdRunAction(command=f'cat {path}') obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) diff --git a/evaluation/benchmarks/humanevalfix/README.md b/evaluation/benchmarks/humanevalfix/README.md index 60dabef1f609..6d355e8093fd 100644 --- a/evaluation/benchmarks/humanevalfix/README.md +++ b/evaluation/benchmarks/humanevalfix/README.md @@ -71,7 +71,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili "observation": "run", "content": "[File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2: return number % 1.0 + 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10: assert truncate_number(3.5) == 0.5\r\n11: assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12: assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)", "extras": { - "command_id": -1, "command": "open Python__2.py", "exit_code": 0 } @@ -98,7 +97,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili "observation": "run", "content": "> > [File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2: return number % 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10: assert truncate_number(3.5) == 0.5\r\n11: assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12: assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)\r\nFile updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.", "extras": { - "command_id": -1, "command": "edit 2:2 < EvalOutput: """ Evaluate agent performance on a SWE-bench problem instance. @@ -146,6 +147,16 @@ def process_instance( metadata=metadata, ) + # Increase resource_factor with increasing attempt_id + if runtime_failure_count > 0: + config.sandbox.remote_runtime_resource_factor = min( + config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count), + 4, # hardcode maximum resource factor to 4 + ) + logger.warning( + f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' + ) + runtime = create_runtime(config) call_async_from_sync(runtime.connect) # Get patch and save it to /tmp/patch.diff @@ -177,7 +188,7 @@ def process_instance( "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || " "echo 'APPLY_PATCH_FAIL')))" ) - action = CmdRunAction(command=exec_command, keep_prompt=False) + action = CmdRunAction(command=exec_command) action.timeout = 600 obs = runtime.run_action(action) assert isinstance(obs, CmdOutputObservation) @@ -200,9 +211,7 @@ def process_instance( # Run eval script in background and save output to log file log_file = '/tmp/eval_output.log' - action = CmdRunAction( - command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False - ) + action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!') action.timeout = 60 # Short timeout just to get the process ID obs = runtime.run_action(action) @@ -224,7 +233,7 @@ def process_instance( instance['test_result']['report']['test_timeout'] = True break check_action = CmdRunAction( - command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False + command=f'ps -p {pid} > /dev/null; echo $?' ) check_action.timeout = 60 check_obs = runtime.run_action(check_action) @@ -242,7 +251,7 @@ def process_instance( time.sleep(30) # Wait for 30 seconds before checking again # Read the log file - cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False) + cat_action = CmdRunAction(command=f'cat {log_file}') cat_action.timeout = 300 cat_obs = runtime.run_action(cat_action) diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py index 61c045037bbb..4f0dff6508f4 100644 --- a/evaluation/benchmarks/swe_bench/run_infer.py +++ b/evaluation/benchmarks/swe_bench/run_infer.py @@ -282,6 +282,16 @@ def initialize_runtime( logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}') + action = CmdRunAction(command='which python') + action.timeout = 600 + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert_and_raise( + obs.exit_code == 0 and 'testbed' in obs.content, + f'Expected to find python interpreter from testbed, but got: {str(obs)}', + ) + logger.info('-' * 30) logger.info('END Runtime Initialization Fn') logger.info('-' * 30) @@ -337,8 +347,7 @@ def complete_runtime( git_patch = None while n_retries < 5: action = CmdRunAction( - command=f'git diff --no-color --cached {instance["base_commit"]}', - keep_prompt=False, + command=f'git diff --no-color --cached {instance["base_commit"]}' ) action.timeout = 600 + 100 * n_retries logger.info(action, extra={'msg_type': 'ACTION'}) @@ -385,7 +394,7 @@ def process_instance( if runtime_failure_count > 0: config.sandbox.remote_runtime_resource_factor = min( config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count), - 2, # hardcode maximum resource factor to 2 + 8, ) logger.warning( f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}' @@ -535,4 +544,5 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame: args.eval_num_workers, process_instance, timeout_seconds=120 * 60, # 2 hour PER instance should be more than enough + max_retries=5, ) diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py index cec92e153d84..0dc9552a25ea 100755 --- a/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py @@ -104,9 +104,9 @@ def _get_resolved(report): # Determine if this repo has a significant diff is_significant = diff >= threshold repo_color = 'red' if is_significant else 'yellow' - print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold'])) print(f"\n{colored(repo, repo_color, attrs=['bold'])}:") + print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold'])) print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green')) if x_instances: print(' ' + str(x_instances)) diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py index 8e9fc407d93b..97698feba3c7 100755 --- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py @@ -20,6 +20,13 @@ print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}') oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True) + +swebench_eval_file = args.oh_output_file.replace('.jsonl', '.swebench_eval.jsonl') +if os.path.exists(swebench_eval_file): + eval_output_df = pd.read_json(swebench_eval_file, orient='records', lines=True) +else: + eval_output_df = None + # model name is the folder name of oh_output_file model_name = os.path.basename(os.path.dirname(args.oh_output_file)) @@ -50,7 +57,7 @@ def convert_history_to_str(history): return ret -def write_row_to_md_file(row): +def write_row_to_md_file(row, instance_id_to_test_result): if 'git_patch' in row: model_patch = row['git_patch'] elif 'test_result' in row and 'git_patch' in row['test_result']: @@ -58,8 +65,21 @@ def write_row_to_md_file(row): else: raise ValueError(f'Row {row} does not have a git_patch') - if 'report' in row: - resolved = row['report'].get('resolved', False) + test_output = None + if row['instance_id'] in instance_id_to_test_result: + report = instance_id_to_test_result[row['instance_id']].get('report', {}) + resolved = report.get('resolved', False) + test_output = instance_id_to_test_result[row['instance_id']].get( + 'test_output', None + ) + elif 'report' in row and row['report'] is not None: + if not isinstance(row['report'], dict): + resolved = None + print( + f'ERROR: Report is not a dict, but a {type(row["report"])}. Row: {row}' + ) + else: + resolved = row['report'].get('resolved', False) else: resolved = None @@ -84,5 +104,18 @@ def write_row_to_md_file(row): f.write('## Model Patch\n') f.write(f'{process_git_patch(model_patch)}\n') + f.write('## Test Output\n') + f.write(str(test_output)) + + +instance_id_to_test_result = {} +if eval_output_df is not None: + instance_id_to_test_result = ( + eval_output_df[['instance_id', 'test_result']] + .set_index('instance_id')['test_result'] + .to_dict() + ) -oh_format.progress_apply(write_row_to_md_file, axis=1) +oh_format.progress_apply( + write_row_to_md_file, axis=1, instance_id_to_test_result=instance_id_to_test_result +) diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py index 662e640ca752..d9c5c540f24b 100644 --- a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py +++ b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py @@ -111,6 +111,11 @@ def apply_report(row): instance_id_to_status[row['instance_id']] = row['test_result']['report'] df['report'] = df.apply(apply_report, axis=1) + report_is_dict = df['report'].apply(lambda x: isinstance(x, dict)) + if not report_is_dict.all(): + print(df[~report_is_dict]) + raise ValueError(f'Report is not a dict, but a {type(row["report"])}') + _n_instances = len(df) _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))]) _n_unresolved = _n_instances - _n_resolved diff --git a/evaluation/integration_tests/tests/t01_fix_simple_typo.py b/evaluation/integration_tests/tests/t01_fix_simple_typo.py index 4cfa331df1b5..532d5d5b38fd 100644 --- a/evaluation/integration_tests/tests/t01_fix_simple_typo.py +++ b/evaluation/integration_tests/tests/t01_fix_simple_typo.py @@ -24,7 +24,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None: @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: # check if the file /workspace/bad.txt has been fixed - action = CmdRunAction(command='cat /workspace/bad.txt', keep_prompt=False) + action = CmdRunAction(command='cat /workspace/bad.txt') obs = runtime.run_action(action) if obs.exit_code != 0: return TestResult( diff --git a/evaluation/integration_tests/tests/t02_add_bash_hello.py b/evaluation/integration_tests/tests/t02_add_bash_hello.py index ac82e89bac05..88384a87f248 100644 --- a/evaluation/integration_tests/tests/t02_add_bash_hello.py +++ b/evaluation/integration_tests/tests/t02_add_bash_hello.py @@ -10,14 +10,14 @@ class Test(BaseIntegrationTest): @classmethod def initialize_runtime(cls, runtime: Runtime) -> None: - action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False) + action = CmdRunAction(command='mkdir -p /workspace') obs = runtime.run_action(action) assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}') @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: # check if the file /workspace/hello.sh exists - action = CmdRunAction(command='cat /workspace/hello.sh', keep_prompt=False) + action = CmdRunAction(command='cat /workspace/hello.sh') obs = runtime.run_action(action) if obs.exit_code != 0: return TestResult( @@ -26,7 +26,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: ) # execute the script - action = CmdRunAction(command='bash /workspace/hello.sh', keep_prompt=False) + action = CmdRunAction(command='bash /workspace/hello.sh') obs = runtime.run_action(action) if obs.exit_code != 0: return TestResult( diff --git a/evaluation/integration_tests/tests/t03_jupyter_write_file.py b/evaluation/integration_tests/tests/t03_jupyter_write_file.py index e1ed6c27c4a6..2f88e1228b58 100644 --- a/evaluation/integration_tests/tests/t03_jupyter_write_file.py +++ b/evaluation/integration_tests/tests/t03_jupyter_write_file.py @@ -10,14 +10,14 @@ class Test(BaseIntegrationTest): @classmethod def initialize_runtime(cls, runtime: Runtime) -> None: - action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False) + action = CmdRunAction(command='mkdir -p /workspace') obs = runtime.run_action(action) assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}') @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: # check if the file /workspace/hello.sh exists - action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False) + action = CmdRunAction(command='cat /workspace/test.txt') obs = runtime.run_action(action) if obs.exit_code != 0: return TestResult( @@ -26,7 +26,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: ) # execute the script - action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False) + action = CmdRunAction(command='cat /workspace/test.txt') obs = runtime.run_action(action) if obs.exit_code != 0: diff --git a/evaluation/integration_tests/tests/t04_git_staging.py b/evaluation/integration_tests/tests/t04_git_staging.py index aadb861203e7..5d9459ec4a40 100644 --- a/evaluation/integration_tests/tests/t04_git_staging.py +++ b/evaluation/integration_tests/tests/t04_git_staging.py @@ -10,31 +10,29 @@ class Test(BaseIntegrationTest): @classmethod def initialize_runtime(cls, runtime: Runtime) -> None: - action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False) + action = CmdRunAction(command='mkdir -p /workspace') obs = runtime.run_action(action) assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}') # git init - action = CmdRunAction(command='git init', keep_prompt=False) + action = CmdRunAction(command='git init') obs = runtime.run_action(action) assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}') # create README.md - action = CmdRunAction( - command='echo \'print("hello world")\' > hello.py', keep_prompt=False - ) + action = CmdRunAction(command='echo \'print("hello world")\' > hello.py') obs = runtime.run_action(action) assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}') # git add README.md - action = CmdRunAction(command='git add hello.py', keep_prompt=False) + action = CmdRunAction(command='git add hello.py') obs = runtime.run_action(action) assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}') @classmethod def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: # check if the file /workspace/hello.py exists - action = CmdRunAction(command='cat /workspace/hello.py', keep_prompt=False) + action = CmdRunAction(command='cat /workspace/hello.py') obs = runtime.run_action(action) if obs.exit_code != 0: return TestResult( @@ -43,7 +41,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult: ) # check if the staging area is empty - action = CmdRunAction(command='git status', keep_prompt=False) + action = CmdRunAction(command='git status') obs = runtime.run_action(action) if obs.exit_code != 0: return TestResult( diff --git a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py index 96bb47875aec..8542e50d80a3 100644 --- a/evaluation/integration_tests/tests/t05_simple_browsing.py +++ b/evaluation/integration_tests/tests/t05_simple_browsing.py @@ -83,11 +83,11 @@ class Test(BaseIntegrationTest): @classmethod def initialize_runtime(cls, runtime: Runtime) -> None: - action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False) + action = CmdRunAction(command='mkdir -p /workspace') obs = runtime.run_action(action) assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}') - action = CmdRunAction(command='mkdir -p /tmp/server', keep_prompt=False) + action = CmdRunAction(command='mkdir -p /tmp/server') obs = runtime.run_action(action) assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}') @@ -101,8 +101,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None: # create README.md action = CmdRunAction( - command='cd /tmp/server && nohup python3 -m http.server 8000 &', - keep_prompt=False, + command='cd /tmp/server && nohup python3 -m http.server 8000 &' ) obs = runtime.run_action(action) diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts index e235218df5af..b19fc92383b7 100644 --- a/frontend/src/services/actions.ts +++ b/frontend/src/services/actions.ts @@ -16,6 +16,7 @@ import { StatusMessage, } from "#/types/message"; import { handleObservationMessage } from "./observations"; +import { appendInput } from "#/state/command-slice"; const messageActions = { [ActionType.BROWSE]: (message: ActionMessage) => { @@ -62,6 +63,10 @@ export function handleActionMessage(message: ActionMessage) { return; } + if (message.action === ActionType.RUN) { + store.dispatch(appendInput(message.args.command)); + } + if ("args" in message && "security_risk" in message.args) { store.dispatch(appendSecurityAnalyzerInput(message)); } diff --git a/frontend/src/services/observations.ts b/frontend/src/services/observations.ts index f2f1de60c45a..1ec6a784d910 100644 --- a/frontend/src/services/observations.ts +++ b/frontend/src/services/observations.ts @@ -80,8 +80,7 @@ export function handleObservationMessage(message: ObservationMessage) { observation: "run" as const, extras: { command: String(message.extras.command || ""), - command_id: Number(message.extras.command_id || 0), - exit_code: Number(message.extras.exit_code || 0), + metadata: message.extras.metadata, hidden: Boolean(message.extras.hidden), }, }), diff --git a/frontend/src/state/chat-slice.ts b/frontend/src/state/chat-slice.ts index aefcaf6972ea..87ce8e52dee4 100644 --- a/frontend/src/state/chat-slice.ts +++ b/frontend/src/state/chat-slice.ts @@ -93,7 +93,7 @@ export const chatSlice = createSlice({ const translationID = `ACTION_MESSAGE$${actionID.toUpperCase()}`; let text = ""; if (actionID === "run") { - text = `\`${action.payload.args.command}\``; + text = `Command:\n\`${action.payload.args.command}\``; } else if (actionID === "run_ipython") { text = `\`\`\`\n${action.payload.args.code}\n\`\`\``; } else if (actionID === "write") { @@ -144,7 +144,7 @@ export const chatSlice = createSlice({ // Set success property based on observation type if (observationID === "run") { const commandObs = observation.payload as CommandObservation; - causeMessage.success = commandObs.extras.exit_code === 0; + causeMessage.success = commandObs.extras.metadata.exit_code === 0; } else if (observationID === "run_ipython") { // For IPython, we consider it successful if there's no error message const ipythonObs = observation.payload as IPythonObservation; @@ -158,7 +158,9 @@ export const chatSlice = createSlice({ if (content.length > MAX_CONTENT_LENGTH) { content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`; } - content = `\`\`\`\n${content}\n\`\`\``; + content = `${ + causeMessage.content + }\n\nOutput:\n\`\`\`\n${content.trim() || "[Command finished execution with no output]"}\n\`\`\``; causeMessage.content = content; // Observation content includes the action } else if (observationID === "read" || observationID === "edit") { const { content } = observation.payload; diff --git a/frontend/src/types/core/observations.ts b/frontend/src/types/core/observations.ts index 9a020d93654e..8f815b61f5a4 100644 --- a/frontend/src/types/core/observations.ts +++ b/frontend/src/types/core/observations.ts @@ -13,9 +13,8 @@ export interface CommandObservation extends OpenHandsObservationEvent<"run"> { source: "agent"; extras: { command: string; - command_id: number; - exit_code: number; hidden?: boolean; + metadata: Record; }; } diff --git a/frontend/src/types/message.tsx b/frontend/src/types/message.tsx index f7ef68e3fa13..eb1e1bfcf9a5 100644 --- a/frontend/src/types/message.tsx +++ b/frontend/src/types/message.tsx @@ -27,8 +27,11 @@ export interface ObservationMessage { // The observed data content: string; - // Additional structured data - extras: Record; + extras: { + metadata: Record; + error_id: string; + [key: string]: string | Record; + }; // A friendly message that can be put in the chat log message: string; diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py index 904af0450dfd..f811723dfbd5 100644 --- a/openhands/agenthub/codeact_agent/codeact_agent.py +++ b/openhands/agenthub/codeact_agent/codeact_agent.py @@ -277,7 +277,9 @@ def get_observation_message( ) else: text = truncate_content( - obs.content + obs.interpreter_details, max_message_chars + obs.content + + f'\n[Python Interpreter: {obs.metadata.py_interpreter_path}]', + max_message_chars, ) text += f'\n[Command finished with exit code {obs.exit_code}]' message = Message(role='user', content=[TextContent(text=text)]) diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py index 5038f95934ea..d06cf01cd3e4 100644 --- a/openhands/agenthub/codeact_agent/function_calling.py +++ b/openhands/agenthub/codeact_agent/function_calling.py @@ -31,8 +31,7 @@ _BASH_DESCRIPTION = """Execute a bash command in the terminal. * Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`. -* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process. -* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background. +* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command like `C-c` (Ctrl+C) to interrupt the process. """ CmdRunTool = ChatCompletionToolParam( @@ -45,7 +44,7 @@ 'properties': { 'command': { 'type': 'string', - 'description': 'The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.', + 'description': 'The bash command to execute. Can be empty string to view additional logs when previous exit code is `-1`. Can be `C-c` (Ctrl+C) to interrupt the currently running process.', }, }, 'required': ['command'], diff --git a/openhands/agenthub/dummy_agent/agent.py b/openhands/agenthub/dummy_agent/agent.py index 06abacab3eb7..f7a654bf75b4 100644 --- a/openhands/agenthub/dummy_agent/agent.py +++ b/openhands/agenthub/dummy_agent/agent.py @@ -18,6 +18,7 @@ from openhands.events.observation import ( AgentStateChangedObservation, BrowserOutputObservation, + CmdOutputMetadata, CmdOutputObservation, FileReadObservation, FileWriteObservation, @@ -54,11 +55,7 @@ def __init__(self, llm: LLM, config: AgentConfig): }, { 'action': CmdRunAction(command='echo "foo"'), - 'observations': [ - CmdOutputObservation( - 'foo', command_id=-1, command='echo "foo"', exit_code=0 - ) - ], + 'observations': [CmdOutputObservation('foo', command='echo "foo"')], }, { 'action': FileWriteAction( @@ -81,9 +78,8 @@ def __init__(self, llm: LLM, config: AgentConfig): 'observations': [ CmdOutputObservation( 'bash: hello.sh: No such file or directory', - command_id=-1, command='bash workspace/hello.sh', - exit_code=127, + metadata=CmdOutputMetadata(exit_code=127), ) ], }, @@ -152,8 +148,6 @@ def step(self, state: State) -> Action: obs.pop('timestamp', None) obs.pop('cause', None) obs.pop('source', None) - if 'extras' in obs: - obs['extras'].pop('command_id', None) if hist_obs != expected_obs: print( diff --git a/openhands/controller/stuck.py b/openhands/controller/stuck.py index dbb18a9b4f2c..1bf7e410eb23 100644 --- a/openhands/controller/stuck.py +++ b/openhands/controller/stuck.py @@ -5,7 +5,7 @@ from openhands.events.action.empty import NullAction from openhands.events.action.message import MessageAction from openhands.events.event import Event, EventSource -from openhands.events.observation.commands import ( +from openhands.events.observation import ( CmdOutputObservation, IPythonRunCellObservation, ) diff --git a/openhands/core/utils/json.py b/openhands/core/utils/json.py index c0b22740bec4..e75d028f037e 100644 --- a/openhands/core/utils/json.py +++ b/openhands/core/utils/json.py @@ -6,6 +6,7 @@ from openhands.core.exceptions import LLMResponseError from openhands.events.event import Event +from openhands.events.observation import CmdOutputMetadata from openhands.events.serialization import event_to_dict from openhands.llm.metrics import Metrics @@ -20,6 +21,8 @@ def my_default_encoder(obj): return obj.get() if isinstance(obj, ModelResponse): return obj.model_dump() + if isinstance(obj, CmdOutputMetadata): + return obj.model_dump() return json.JSONEncoder().default(obj) diff --git a/openhands/events/action/commands.py b/openhands/events/action/commands.py index 83dd19f9d161..67b586f86ad9 100644 --- a/openhands/events/action/commands.py +++ b/openhands/events/action/commands.py @@ -12,19 +12,11 @@ @dataclass class CmdRunAction(Action): command: str + # When `command` is empty, it will be used to print the current tmux window thought: str = '' blocking: bool = False - # If False, the command will be run in a non-blocking / interactive way - # The partial command outputs will be returned as output observation. - # If True, the command will be run for max .timeout seconds. - keep_prompt: bool = True - # if True, the command prompt will be kept in the command output observation - # Example of command output: - # root@sandbox:~# ls - # file1.txt - # file2.txt - # root@sandbox:~# <-- this is the command prompt - + # If blocking is True, the command will be run in a blocking manner. + # e.g., it will NOT return early due to soft timeout. hidden: bool = False action: str = ActionType.RUN runnable: ClassVar[bool] = True diff --git a/openhands/events/action/files.py b/openhands/events/action/files.py index c903e4f00f7d..de2db691f146 100644 --- a/openhands/events/action/files.py +++ b/openhands/events/action/files.py @@ -48,6 +48,15 @@ class FileWriteAction(Action): def message(self) -> str: return f'Writing file: {self.path}' + def __repr__(self) -> str: + return ( + f'**FileWriteAction**\n' + f'Path: {self.path}\n' + f'Range: [L{self.start}:L{self.end}]\n' + f'Thought: {self.thought}\n' + f'Content:\n```\n{self.content}\n```\n' + ) + @dataclass class FileEditAction(Action): diff --git a/openhands/events/observation/__init__.py b/openhands/events/observation/__init__.py index 28525b09aabb..69a1853b268c 100644 --- a/openhands/events/observation/__init__.py +++ b/openhands/events/observation/__init__.py @@ -1,6 +1,7 @@ from openhands.events.observation.agent import AgentStateChangedObservation from openhands.events.observation.browse import BrowserOutputObservation from openhands.events.observation.commands import ( + CmdOutputMetadata, CmdOutputObservation, IPythonRunCellObservation, ) @@ -20,6 +21,7 @@ 'Observation', 'NullObservation', 'CmdOutputObservation', + 'CmdOutputMetadata', 'IPythonRunCellObservation', 'BrowserOutputObservation', 'FileReadObservation', diff --git a/openhands/events/observation/commands.py b/openhands/events/observation/commands.py index b522b5c47283..31b4472f7ee7 100644 --- a/openhands/events/observation/commands.py +++ b/openhands/events/observation/commands.py @@ -1,19 +1,136 @@ -from dataclasses import dataclass +import json +import re +import traceback +from dataclasses import dataclass, field +from typing import Self +from pydantic import BaseModel + +from openhands.core.logger import openhands_logger as logger from openhands.core.schema import ObservationType from openhands.events.observation.observation import Observation +CMD_OUTPUT_PS1_BEGIN = '\n###PS1JSON###\n' +CMD_OUTPUT_PS1_END = '\n###PS1END###' +CMD_OUTPUT_METADATA_PS1_REGEX = re.compile( + f'^{CMD_OUTPUT_PS1_BEGIN.strip()}(.*?){CMD_OUTPUT_PS1_END.strip()}', + re.DOTALL | re.MULTILINE, +) + + +class CmdOutputMetadata(BaseModel): + """Additional metadata captured from PS1""" + + exit_code: int = -1 + pid: int = -1 + username: str | None = None + hostname: str | None = None + working_dir: str | None = None + py_interpreter_path: str | None = None + prefix: str = '' # Prefix to add to command output + suffix: str = '' # Suffix to add to command output + + @classmethod + def to_ps1_prompt(cls) -> str: + """Convert the required metadata into a PS1 prompt.""" + prompt = CMD_OUTPUT_PS1_BEGIN + json_str = json.dumps( + { + 'pid': '$!', + 'exit_code': '$?', + 'username': r'\u', + 'hostname': r'\h', + 'working_dir': r'$(pwd)', + 'py_interpreter_path': r'$(which python 2>/dev/null || echo "")', + }, + indent=2, + ) + # Make sure we escape double quotes in the JSON string + # So that PS1 will keep them as part of the output + prompt += json_str.replace('"', r'\"') + prompt += CMD_OUTPUT_PS1_END + '\n' # Ensure there's a newline at the end + return prompt + + @classmethod + def matches_ps1_metadata(cls, string: str) -> list[re.Match[str]]: + matches = [] + for match in CMD_OUTPUT_METADATA_PS1_REGEX.finditer(string): + try: + json.loads(match.group(1).strip()) # Try to parse as JSON + matches.append(match) + except json.JSONDecodeError: + logger.warning( + f'Failed to parse PS1 metadata: {match.group(1)}. Skipping.' + + traceback.format_exc() + ) + continue # Skip if not valid JSON + return matches + + @classmethod + def from_ps1_match(cls, match: re.Match[str]) -> Self: + """Extract the required metadata from a PS1 prompt.""" + metadata = json.loads(match.group(1)) + # Create a copy of metadata to avoid modifying the original + processed = metadata.copy() + # Convert numeric fields + if 'pid' in metadata: + try: + processed['pid'] = int(float(str(metadata['pid']))) + except (ValueError, TypeError): + processed['pid'] = -1 + if 'exit_code' in metadata: + try: + processed['exit_code'] = int(float(str(metadata['exit_code']))) + except (ValueError, TypeError): + logger.warning( + f'Failed to parse exit code: {metadata["exit_code"]}. Setting to -1.' + ) + processed['exit_code'] = -1 + return cls(**processed) + @dataclass class CmdOutputObservation(Observation): """This data class represents the output of a command.""" - command_id: int command: str - exit_code: int = 0 - hidden: bool = False observation: str = ObservationType.RUN - interpreter_details: str = '' + # Additional metadata captured from PS1 + metadata: CmdOutputMetadata = field(default_factory=CmdOutputMetadata) + # Whether the command output should be hidden from the user + hidden: bool = False + + def __init__( + self, + content: str, + command: str, + observation: str = ObservationType.RUN, + metadata: dict | CmdOutputMetadata | None = None, + hidden: bool = False, + **kwargs, + ): + super().__init__(content) + self.command = command + self.observation = observation + self.hidden = hidden + if isinstance(metadata, dict): + self.metadata = CmdOutputMetadata(**metadata) + else: + self.metadata = metadata or CmdOutputMetadata() + + # Handle legacy attribute + if 'exit_code' in kwargs: + self.metadata.exit_code = kwargs['exit_code'] + if 'command_id' in kwargs: + self.metadata.pid = kwargs['command_id'] + + @property + def command_id(self) -> int: + return self.metadata.pid + + @property + def exit_code(self) -> int: + return self.metadata.exit_code @property def error(self) -> bool: @@ -28,7 +145,21 @@ def success(self) -> bool: return not self.error def __str__(self) -> str: - return f'**CmdOutputObservation (source={self.source}, exit code={self.exit_code})**\n{self.content}' + return ( + f'**CmdOutputObservation (source={self.source}, exit code={self.exit_code}, ' + f'metadata={json.dumps(self.metadata.model_dump(), indent=2)})**\n' + '--BEGIN AGENT OBSERVATION--\n' + f'{self._to_agent_observation()}\n' + '--END AGENT OBSERVATION--' + ) + + def _to_agent_observation(self) -> str: + ret = f'{self.metadata.prefix}{self.content}{self.metadata.suffix}' + if self.metadata.working_dir: + ret += f'\n[Current working directory: {self.metadata.working_dir}]' + if self.metadata.py_interpreter_path: + ret += f'\n[Python interpreter: {self.metadata.py_interpreter_path}]' + return ret @dataclass diff --git a/openhands/events/observation/files.py b/openhands/events/observation/files.py index b16804607d94..b939988fcebb 100644 --- a/openhands/events/observation/files.py +++ b/openhands/events/observation/files.py @@ -18,6 +18,9 @@ class FileReadObservation(Observation): def message(self) -> str: return f'I read the file {self.path}.' + def __str__(self) -> str: + return f'[Read from {self.path} is successful.]\n' f'{self.content}' + @dataclass class FileWriteObservation(Observation): @@ -30,6 +33,9 @@ class FileWriteObservation(Observation): def message(self) -> str: return f'I wrote to the file {self.path}.' + def __str__(self) -> str: + return f'[Write to {self.path} is successful.]\n' f'{self.content}' + @dataclass class FileEditObservation(Observation): diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py index 208400fe9213..90945c1d4dfd 100644 --- a/openhands/events/serialization/action.py +++ b/openhands/events/serialization/action.py @@ -67,6 +67,10 @@ def action_from_dict(action: dict) -> Action: if 'images_urls' in args: args['image_urls'] = args.pop('images_urls') + # keep_prompt has been deprecated in https://github.com/All-Hands-AI/OpenHands/pull/4881 + if 'keep_prompt' in args: + args.pop('keep_prompt') + try: decoded_action = action_class(**args) if 'timeout' in action: diff --git a/openhands/events/serialization/event.py b/openhands/events/serialization/event.py index 6ee82a1cc81c..cf2637ab667d 100644 --- a/openhands/events/serialization/event.py +++ b/openhands/events/serialization/event.py @@ -1,6 +1,8 @@ from dataclasses import asdict from datetime import datetime +from pydantic import BaseModel + from openhands.events import Event, EventSource from openhands.events.observation.observation import Observation from openhands.events.serialization.action import action_from_dict @@ -56,6 +58,12 @@ def event_from_dict(data) -> 'Event': return evt +def _convert_pydantic_to_dict(obj: BaseModel | dict) -> dict: + if isinstance(obj, BaseModel): + return obj.model_dump() + return obj + + def event_to_dict(event: 'Event') -> dict: props = asdict(event) d = {} @@ -82,7 +90,11 @@ def event_to_dict(event: 'Event') -> dict: d['timeout'] = event.timeout elif 'observation' in d: d['content'] = props.pop('content', '') - d['extras'] = props + + # props is a dict whose values can include a complex object like an instance of a BaseModel subclass + # such as CmdOutputMetadata + # we serialize it along with the rest + d['extras'] = {k: _convert_pydantic_to_dict(v) for k, v in props.items()} # Include success field for CmdOutputObservation if hasattr(event, 'success'): d['success'] = event.success @@ -109,7 +121,6 @@ def event_to_memory(event: 'Event', max_message_chars: int) -> dict: # runnable actions have some extra fields used in the BE/FE, which should not be sent to the LLM if 'args' in d: d['args'].pop('blocking', None) - d['args'].pop('keep_prompt', None) d['args'].pop('confirmation_state', None) if 'extras' in d: diff --git a/openhands/events/serialization/observation.py b/openhands/events/serialization/observation.py index d9d8dc51adaf..961c155e8916 100644 --- a/openhands/events/serialization/observation.py +++ b/openhands/events/serialization/observation.py @@ -1,6 +1,9 @@ +import copy + from openhands.events.observation.agent import AgentStateChangedObservation from openhands.events.observation.browse import BrowserOutputObservation from openhands.events.observation.commands import ( + CmdOutputMetadata, CmdOutputObservation, IPythonRunCellObservation, ) @@ -37,6 +40,26 @@ } +def _update_cmd_output_metadata( + metadata: dict | CmdOutputMetadata | None, **kwargs +) -> dict | CmdOutputMetadata: + """Update the metadata of a CmdOutputObservation. + + If metadata is None, create a new CmdOutputMetadata instance. + If metadata is a dict, update the dict. + If metadata is a CmdOutputMetadata instance, update the instance. + """ + if metadata is None: + return CmdOutputMetadata(**kwargs) + + if isinstance(metadata, dict): + metadata.update(**kwargs) + elif isinstance(metadata, CmdOutputMetadata): + for key, value in kwargs.items(): + setattr(metadata, key, value) + return metadata + + def observation_from_dict(observation: dict) -> Observation: observation = observation.copy() if 'observation' not in observation: @@ -49,6 +72,24 @@ def observation_from_dict(observation: dict) -> Observation: observation.pop('observation') observation.pop('message', None) content = observation.pop('content', '') - extras = observation.pop('extras', {}) + extras = copy.deepcopy(observation.pop('extras', {})) + + # Handle legacy attributes for CmdOutputObservation + if 'exit_code' in extras: + extras['metadata'] = _update_cmd_output_metadata( + extras.get('metadata', None), exit_code=extras.pop('exit_code') + ) + if 'command_id' in extras: + extras['metadata'] = _update_cmd_output_metadata( + extras.get('metadata', None), pid=extras.pop('command_id') + ) + # convert metadata to CmdOutputMetadata if it is a dict + if observation_class is CmdOutputObservation: + if 'metadata' in extras and isinstance(extras['metadata'], dict): + extras['metadata'] = CmdOutputMetadata(**extras['metadata']) + elif 'metadata' in extras and isinstance(extras['metadata'], CmdOutputMetadata): + pass + else: + extras['metadata'] = CmdOutputMetadata() return observation_class(content=content, **extras) diff --git a/openhands/events/utils.py b/openhands/events/utils.py index 6c8cc415f675..bf710edcd7bd 100644 --- a/openhands/events/utils.py +++ b/openhands/events/utils.py @@ -2,9 +2,11 @@ from openhands.events.action.action import Action from openhands.events.action.empty import NullAction from openhands.events.event import Event -from openhands.events.observation.commands import CmdOutputObservation -from openhands.events.observation.empty import NullObservation -from openhands.events.observation.observation import Observation +from openhands.events.observation import ( + CmdOutputObservation, + NullObservation, + Observation, +) def get_pairs_from_events(events: list[Event]) -> list[tuple[Action, Observation]]: diff --git a/openhands/resolver/resolve_issue.py b/openhands/resolver/resolve_issue.py index ed3bbf11178e..7617410ef429 100644 --- a/openhands/resolver/resolve_issue.py +++ b/openhands/resolver/resolve_issue.py @@ -122,10 +122,7 @@ async def complete_runtime( n_retries = 0 git_patch = None while n_retries < 5: - action = CmdRunAction( - command=f'git diff --no-color --cached {base_commit}', - keep_prompt=False, - ) + action = CmdRunAction(command=f'git diff --no-color --cached {base_commit}') action.timeout = 600 + 100 * n_retries logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py index 4f7cf3ee38a3..7c83a2f0f411 100644 --- a/openhands/runtime/action_execution_server.py +++ b/openhands/runtime/action_execution_server.py @@ -95,31 +95,34 @@ def __init__( browsergym_eval_env: str | None, ) -> None: self.plugins_to_load = plugins_to_load - self._initial_pwd = work_dir + self._initial_cwd = work_dir self.username = username self.user_id = user_id _updated_user_id = init_user_and_working_directory( - username=username, user_id=self.user_id, initial_pwd=work_dir + username=username, user_id=self.user_id, initial_cwd=work_dir ) if _updated_user_id is not None: self.user_id = _updated_user_id - self.bash_session = BashSession( - work_dir=work_dir, - username=username, - ) - + self.bash_session: BashSession | None = None self.lock = asyncio.Lock() self.plugins: dict[str, Plugin] = {} self.browser = BrowserEnv(browsergym_eval_env) self.start_time = time.time() self.last_execution_time = self.start_time + self._initialized = False @property - def initial_pwd(self): - return self._initial_pwd + def initial_cwd(self): + return self._initial_cwd async def ainit(self): + # bash needs to be initialized first + self.bash_session = BashSession( + work_dir=self._initial_cwd, + username=self.username, + ) + self.bash_session.initialize() await wait_all( (self._init_plugin(plugin) for plugin in self.plugins_to_load), timeout=30, @@ -138,8 +141,14 @@ async def ainit(self): await self._init_bash_commands() logger.debug('Runtime client initialized.') + self._initialized = True + + @property + def initialized(self) -> bool: + return self._initialized async def _init_plugin(self, plugin: Plugin): + assert self.bash_session is not None await plugin.initialize(self.username) self.plugins[plugin.name] = plugin logger.debug(f'Initializing plugin: {plugin.name}') @@ -147,7 +156,7 @@ async def _init_plugin(self, plugin: Plugin): if isinstance(plugin, JupyterPlugin): await self.run_ipython( IPythonRunCellAction( - code=f'import os; os.chdir("{self.bash_session.pwd}")' + code=f'import os; os.chdir("{self.bash_session.cwd}")' ) ) @@ -177,30 +186,32 @@ async def run_action(self, action) -> Observation: async def run( self, action: CmdRunAction ) -> CmdOutputObservation | ErrorObservation: - obs = await call_sync_from_async(self.bash_session.run, action) + assert self.bash_session is not None + obs = await call_sync_from_async(self.bash_session.execute, action) return obs async def run_ipython(self, action: IPythonRunCellAction) -> Observation: + assert self.bash_session is not None if 'jupyter' in self.plugins: _jupyter_plugin: JupyterPlugin = self.plugins['jupyter'] # type: ignore # This is used to make AgentSkills in Jupyter aware of the # current working directory in Bash - jupyter_pwd = getattr(self, '_jupyter_pwd', None) - if self.bash_session.pwd != jupyter_pwd: + jupyter_cwd = getattr(self, '_jupyter_cwd', None) + if self.bash_session.cwd != jupyter_cwd: logger.debug( - f'{self.bash_session.pwd} != {jupyter_pwd} -> reset Jupyter PWD' + f'{self.bash_session.cwd} != {jupyter_cwd} -> reset Jupyter PWD' ) - reset_jupyter_pwd_code = ( - f'import os; os.chdir("{self.bash_session.pwd}")' + reset_jupyter_cwd_code = ( + f'import os; os.chdir("{self.bash_session.cwd}")' ) - _aux_action = IPythonRunCellAction(code=reset_jupyter_pwd_code) + _aux_action = IPythonRunCellAction(code=reset_jupyter_cwd_code) _reset_obs: IPythonRunCellObservation = await _jupyter_plugin.run( _aux_action ) logger.debug( - f'Changed working directory in IPython to: {self.bash_session.pwd}. Output: {_reset_obs}' + f'Changed working directory in IPython to: {self.bash_session.cwd}. Output: {_reset_obs}' ) - self._jupyter_pwd = self.bash_session.pwd + self._jupyter_cwd = self.bash_session.cwd obs: IPythonRunCellObservation = await _jupyter_plugin.run(action) obs.content = obs.content.rstrip() @@ -266,7 +277,7 @@ async def run_ipython(self, action: IPythonRunCellAction) -> Observation: if action.include_extra: obs.content += ( - f'\n[Jupyter current working directory: {self.bash_session.pwd}]' + f'\n[Jupyter current working directory: {self.bash_session.cwd}]' ) obs.content += f'\n[Jupyter Python interpreter: {_jupyter_plugin.python_interpreter_path}]' return obs @@ -282,6 +293,7 @@ def _resolve_path(self, path: str, working_dir: str) -> str: return str(filepath) async def read(self, action: FileReadAction) -> Observation: + assert self.bash_session is not None if action.impl_source == FileReadSource.OH_ACI: return await self.run_ipython( IPythonRunCellAction( @@ -292,7 +304,7 @@ async def read(self, action: FileReadAction) -> Observation: # NOTE: the client code is running inside the sandbox, # so there's no need to check permission - working_dir = self.bash_session.workdir + working_dir = self.bash_session.cwd filepath = self._resolve_path(action.path, working_dir) try: if filepath.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')): @@ -339,7 +351,8 @@ async def read(self, action: FileReadAction) -> Observation: return FileReadObservation(path=filepath, content=code_view) async def write(self, action: FileWriteAction) -> Observation: - working_dir = self.bash_session.workdir + assert self.bash_session is not None + working_dir = self.bash_session.cwd filepath = self._resolve_path(action.path, working_dir) insert = action.content.split('\n') @@ -400,7 +413,8 @@ async def browse_interactive(self, action: BrowseInteractiveAction) -> Observati return await browse(action, self.browser) def close(self): - self.bash_session.close() + if self.bash_session is not None: + self.bash_session.close() self.browser.close() @@ -609,6 +623,8 @@ async def download_file(path: str): @app.get('/alive') async def alive(): + if client is None or not client.initialized: + return {'status': 'not initialized'} return {'status': 'ok'} # ================================ @@ -658,11 +674,11 @@ async def list_files(request: Request): # Get the full path of the requested directory if path is None: - full_path = client.initial_pwd + full_path = client.initial_cwd elif os.path.isabs(path): full_path = path else: - full_path = os.path.join(client.initial_pwd, path) + full_path = os.path.join(client.initial_cwd, path) if not os.path.exists(full_path): # if user just removed a folder, prevent server error 500 in UI diff --git a/openhands/runtime/impl/action_execution/action_execution_client.py b/openhands/runtime/impl/action_execution/action_execution_client.py index 00c847a23015..80f71ffe8ab1 100644 --- a/openhands/runtime/impl/action_execution/action_execution_client.py +++ b/openhands/runtime/impl/action_execution/action_execution_client.py @@ -264,7 +264,6 @@ def send_action_for_execution(self, action: Action) -> Observation: raise AgentRuntimeTimeoutError( f'Runtime failed to return execute_action before the requested timeout of {action.timeout}s' ) - return obs def run(self, action: CmdRunAction) -> Observation: diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index d146e71d198a..010ec8b0e938 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -219,7 +219,9 @@ def _start_runtime(self): 'image': self.container_image, 'command': command, 'working_dir': '/openhands/code/', - 'environment': {'DEBUG': 'true'} if self.config.debug else {}, + 'environment': {'DEBUG': 'true'} + if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true' + else {}, 'session_id': self.sid, 'resource_factor': self.config.sandbox.remote_runtime_resource_factor, } @@ -364,7 +366,10 @@ def _send_action_server_request(self, method, url, **kwargs): try: return super()._send_action_server_request(method, url, **kwargs) except requests.Timeout: - self.log('error', 'No response received within the timeout period.') + self.log( + 'error', + f'No response received within the timeout period for url: {url}', + ) raise except requests.HTTPError as e: if e.response.status_code == 404: diff --git a/openhands/runtime/utils/bash.py b/openhands/runtime/utils/bash.py index a5019315a038..84c5d0265957 100644 --- a/openhands/runtime/utils/bash.py +++ b/openhands/runtime/utils/bash.py @@ -1,18 +1,21 @@ import os import re +import time +import uuid +from enum import Enum import bashlex -import pexpect +import libtmux from openhands.core.logger import openhands_logger as logger from openhands.events.action import CmdRunAction -from openhands.events.event import EventSource -from openhands.events.observation import ( +from openhands.events.observation import ErrorObservation +from openhands.events.observation.commands import ( + CMD_OUTPUT_PS1_END, + CmdOutputMetadata, CmdOutputObservation, - ErrorObservation, ) - -SOFT_TIMEOUT_SECONDS = 5 +from openhands.utils.shutdown_listener import should_continue def split_bash_commands(commands): @@ -66,269 +69,493 @@ def split_bash_commands(commands): return result +def escape_bash_special_chars(command: str) -> str: + r""" + Escapes characters that have different interpretations in bash vs python. + Specifically handles escape sequences like \;, \|, \&, etc. + """ + if command.strip() == '': + return '' + + try: + parts = [] + last_pos = 0 + + def visit_node(node): + nonlocal last_pos + if ( + node.kind == 'redirect' + and hasattr(node, 'heredoc') + and node.heredoc is not None + ): + # We're entering a heredoc - preserve everything as-is until we see EOF + # Store the heredoc end marker (usually 'EOF' but could be different) + between = command[last_pos : node.pos[0]] + parts.append(between) + # Add the heredoc start marker + parts.append(command[node.pos[0] : node.heredoc.pos[0]]) + # Add the heredoc content as-is + parts.append(command[node.heredoc.pos[0] : node.heredoc.pos[1]]) + last_pos = node.pos[1] + return + + if node.kind == 'word': + # Get the raw text between the last position and current word + between = command[last_pos : node.pos[0]] + word_text = command[node.pos[0] : node.pos[1]] + + # Add the between text, escaping special characters + between = re.sub(r'\\([;&|><])', r'\\\\\1', between) + parts.append(between) + + # Check if word_text is a quoted string or command substitution + if ( + (word_text.startswith('"') and word_text.endswith('"')) + or (word_text.startswith("'") and word_text.endswith("'")) + or (word_text.startswith('$(') and word_text.endswith(')')) + or (word_text.startswith('`') and word_text.endswith('`')) + ): + # Preserve quoted strings, command substitutions, and heredoc content as-is + parts.append(word_text) + else: + # Escape special chars in unquoted text + word_text = re.sub(r'\\([;&|><])', r'\\\\\1', word_text) + parts.append(word_text) + + last_pos = node.pos[1] + return + + # Visit child nodes + if hasattr(node, 'parts'): + for part in node.parts: + visit_node(part) + + # Process all nodes in the AST + nodes = list(bashlex.parse(command)) + for node in nodes: + between = command[last_pos : node.pos[0]] + between = re.sub(r'\\([;&|><])', r'\\\\\1', between) + parts.append(between) + last_pos = node.pos[0] + visit_node(node) + + # Handle any remaining text after the last word + remaining = command[last_pos:] + parts.append(remaining) + return ''.join(parts) + except bashlex.errors.ParsingError: + # Fallback if parsing fails + logger.warning(f'Failed to parse command: {command}') + return command + + +class BashCommandStatus(Enum): + CONTINUE = 'continue' + COMPLETED = 'completed' + NO_CHANGE_TIMEOUT = 'no_change_timeout' + HARD_TIMEOUT = 'hard_timeout' + + +def _remove_command_prefix(command_output: str, command: str) -> str: + return command_output.lstrip().removeprefix(command.lstrip()).lstrip() + + class BashSession: - """A class that maintains a pexpect process and provides a simple interface for running commands and interacting with the shell.""" + POLL_INTERVAL = 0.5 + HISTORY_LIMIT = 10_000 + PS1 = CmdOutputMetadata.to_ps1_prompt() + + def __init__( + self, + work_dir: str, + username: str | None = None, + no_change_timeout_seconds: float = 30.0, + ): + self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds + self.work_dir = work_dir + self.username = username + self._initialized = False + + def initialize(self): + self.server = libtmux.Server() + window_command = '/bin/bash' + if self.username: + # This starts a non-login (new) shell for the given user + window_command = f'su {self.username} -' + + session_name = f'openhands-{self.username}-{uuid.uuid4()}' + self.session = self.server.new_session( + session_name=session_name, + window_name='bash', + window_command=window_command, + start_directory=self.work_dir, + kill_session=True, + x=1000, + y=1000, + ) - def __init__(self, work_dir: str, username: str): - self._pwd = work_dir + # Set history limit to a large number to avoid losing history + # https://unix.stackexchange.com/questions/43414/unlimited-history-in-tmux + self.session.set_option('history-limit', str(self.HISTORY_LIMIT), _global=True) + self.session.history_limit = self.HISTORY_LIMIT + # We need to create a new pane because the initial pane's history limit is (default) 2000 + _initial_window = self.session.attached_window + self.window = self.session.new_window( + window_shell=window_command, + start_directory=self.work_dir, + ) + self.pane = self.window.attached_pane + logger.debug(f'pane: {self.pane}; history_limit: {self.session.history_limit}') + _initial_window.kill_window() - self.shell = pexpect.spawn( - f'su {username}', - encoding='utf-8', - codec_errors='replace', - echo=False, + # Configure bash to use simple PS1 and disable PS2 + self.pane.send_keys( + f'export PROMPT_COMMAND=\'export PS1="{self.PS1}"\'; export PS2=""' + ) + time.sleep(0.1) # Wait for command to take effect + self._clear_screen() + + # Store the last command for interactive input handling + self.prev_status: BashCommandStatus | None = None + self.prev_output: str = '' + self._closed: bool = False + logger.debug(f'Bash session initialized with work dir: {self.work_dir}') + + # Maintain the current working directory + self._cwd = os.path.abspath(self.work_dir) + self._initialized = True + + def __del__(self): + """Ensure the session is closed when the object is destroyed.""" + self.close() + + def _get_pane_content(self) -> str: + """Capture the current pane content and update the buffer.""" + content = '\n'.join( + map( + # avoid double newlines + lambda line: line.rstrip(), + self.pane.cmd('capture-pane', '-J', '-pS', '-').stdout, + ) ) - self._init_bash_shell(work_dir) + return content def close(self): - self.shell.close() - - @property - def pwd(self): - return self._pwd + """Clean up the session.""" + if self._closed: + return + self.session.kill_session() + self._closed = True @property - def workdir(self): - return self._get_working_directory() - - def _get_working_directory(self): - # NOTE: this is part of initialization, so we hard code the timeout - result, exit_code = self._execute_bash('pwd', timeout=60, keep_prompt=False) - if exit_code != 0: - raise RuntimeError( - f'Failed to get working directory (exit code: {exit_code}): {result}' - ) - return result.strip() - - def _init_bash_shell(self, work_dir: str): - self.__bash_PS1 = ( - r'[PEXPECT_BEGIN]\n' - r'$(which python >/dev/null 2>&1 && echo "[Python Interpreter: $(which python)]\n")' - r'\u@\h:\w\n' - r'[PEXPECT_END]' + def cwd(self): + return self._cwd + + def _is_special_key(self, command: str) -> bool: + """Check if the command is a special key.""" + # Special keys are of the form C- + _command = command.strip() + return _command.startswith('C-') and len(_command) == 3 + + def _clear_screen(self): + """Clear the tmux pane screen and history.""" + self.pane.send_keys('C-l', enter=False) + time.sleep(0.1) + self.pane.cmd('clear-history') + + def _get_command_output( + self, + command: str, + raw_command_output: str, + metadata: CmdOutputMetadata, + continue_prefix: str = '', + ) -> str: + """Get the command output with the previous command output removed. + + Args: + command: The command that was executed. + raw_command_output: The raw output from the command. + metadata: The metadata object to store prefix/suffix in. + continue_prefix: The prefix to add to the command output if it's a continuation of the previous command. + """ + # remove the previous command output from the new output if any + if self.prev_output: + command_output = raw_command_output.removeprefix(self.prev_output) + metadata.prefix = continue_prefix + else: + command_output = raw_command_output + self.prev_output = raw_command_output # update current command output anyway + command_output = _remove_command_prefix(command_output, command) + return command_output.rstrip() + + def _handle_completed_command( + self, command: str, pane_content: str, ps1_matches: list[re.Match] + ) -> CmdOutputObservation: + is_special_key = self._is_special_key(command) + assert len(ps1_matches) >= 1, ( + f'Expected at least one PS1 metadata block, but got {len(ps1_matches)}.\n' + f'---FULL OUTPUT---\n{pane_content!r}\n---END OF OUTPUT---' + ) + metadata = CmdOutputMetadata.from_ps1_match(ps1_matches[-1]) + + # Special case where the previous command output is truncated due to history limit + # We should get the content BEFORE the last PS1 prompt + get_content_before_last_match = bool(len(ps1_matches) == 1) + + # Update the current working directory if it has changed + if metadata.working_dir != self._cwd and metadata.working_dir: + self._cwd = metadata.working_dir + + logger.debug(f'COMMAND OUTPUT: {pane_content}') + # Extract the command output between the two PS1 prompts + raw_command_output = self._combine_outputs_between_matches( + pane_content, + ps1_matches, + get_content_before_last_match=get_content_before_last_match, ) - # This should NOT match "PS1=\u@\h:\w [PEXPECT]$" when `env` is executed - self.__bash_expect_regex = r'\[PEXPECT_BEGIN\]\s*(.*?)\s*([a-z0-9_-]*)@([a-zA-Z0-9.-]*):(.+)\s*\[PEXPECT_END\]' - # Set umask to allow group write permissions - self.shell.sendline(f'umask 002; export PS1="{self.__bash_PS1}"; export PS2=""') - self.shell.expect(self.__bash_expect_regex) + if get_content_before_last_match: + # Count the number of lines in the truncated output + num_lines = len(raw_command_output.splitlines()) + metadata.prefix = f'[Previous command outputs are truncated. Showing the last {num_lines} lines of the output below.]\n' - self.shell.sendline( - f'if [ ! -d "{work_dir}" ]; then mkdir -p "{work_dir}"; fi && cd "{work_dir}"' + metadata.suffix = ( + f'\n[The command completed with exit code {metadata.exit_code}.]' + if not is_special_key + else f'\n[The command completed with exit code {metadata.exit_code}. CTRL+{command[-1].upper()} was sent.]' ) - self.shell.expect(self.__bash_expect_regex) - logger.debug( - f'Bash initialized. Working directory: {work_dir}. Output: [{self.shell.before}]' + command_output = self._get_command_output( + command, + raw_command_output, + metadata, + ) + self.prev_status = BashCommandStatus.COMPLETED + self.prev_output = '' # Reset previous command output + self._ready_for_next_command() + return CmdOutputObservation( + content=command_output, + command=command, + metadata=metadata, ) - # Ensure the group has write permissions on the working directory - self.shell.sendline(f'chmod g+rw "{work_dir}"') - self.shell.expect(self.__bash_expect_regex) - - def _get_bash_prompt_and_update_pwd(self): - ps1 = self.shell.after - if ps1 == pexpect.EOF: - logger.error(f'Bash shell EOF! {self.shell.after=}, {self.shell.before=}') - raise RuntimeError('Bash shell EOF') - if ps1 == pexpect.TIMEOUT: - logger.warning('Bash shell timeout') - return '' - - # begin at the last occurrence of '[PEXPECT_BEGIN]'. - # In multi-line bash commands, the prompt will be repeated - # and the matched regex captures all of them - # - we only want the last one (newest prompt) - _begin_pos = ps1.rfind('[PEXPECT_BEGIN]') - if _begin_pos != -1: - ps1 = ps1[_begin_pos:] - - # parse the ps1 to get username, hostname, and working directory - matched = re.match(self.__bash_expect_regex, ps1) - assert ( - matched is not None - ), f'Failed to parse bash prompt: {ps1}. This should not happen.' - other_info, username, hostname, working_dir = matched.groups() - working_dir = working_dir.rstrip() - self._pwd = os.path.expanduser(working_dir) - - # re-assemble the prompt - # ignore the hostname AND use 'openhands-workspace' - prompt = f'{other_info.strip()}\n{username}@openhands-workspace:{working_dir} ' - if username == 'root': - prompt += '#' - else: - prompt += '$' - return prompt + ' ' - def _execute_bash( + def _handle_nochange_timeout_command( self, command: str, - timeout: int, - keep_prompt: bool = True, - kill_on_timeout: bool = True, - ) -> tuple[str, int]: - logger.debug(f'Executing command: {command}') - self.shell.sendline(command) - return self._continue_bash( - timeout=timeout, keep_prompt=keep_prompt, kill_on_timeout=kill_on_timeout + pane_content: str, + ps1_matches: list[re.Match], + ) -> CmdOutputObservation: + self.prev_status = BashCommandStatus.NO_CHANGE_TIMEOUT + if len(ps1_matches) != 1: + logger.warning( + 'Expected exactly one PS1 metadata block BEFORE the execution of a command, ' + f'but got {len(ps1_matches)} PS1 metadata blocks:\n---\n{pane_content!r}\n---' + ) + raw_command_output = self._combine_outputs_between_matches( + pane_content, ps1_matches + ) + metadata = CmdOutputMetadata() # No metadata available + metadata.suffix = ( + f'\n[The command has no new output after {self.NO_CHANGE_TIMEOUT_SECONDS} seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' + ) + command_output = self._get_command_output( + command, + raw_command_output, + metadata, + continue_prefix='[Command output continued from previous command]\n', + ) + return CmdOutputObservation( + content=command_output, + command=command, + metadata=metadata, ) - def _interrupt_bash( + def _handle_hard_timeout_command( self, - action_timeout: int | None, - interrupt_timeout: int | None = None, - max_retries: int = 2, - ) -> tuple[str, int]: - interrupt_timeout = interrupt_timeout or 1 # default timeout for SIGINT - # try to interrupt the bash shell use SIGINT - while max_retries > 0: - self.shell.sendintr() # send SIGINT to the shell - logger.debug('Sent SIGINT to bash. Waiting for output...') - try: - self.shell.expect(self.__bash_expect_regex, timeout=interrupt_timeout) - output = self.shell.before - logger.debug(f'Received output after SIGINT: {output}') - exit_code = 130 # SIGINT - - _additional_msg = '' - if action_timeout is not None: - _additional_msg = ( - f'Command timed out after {action_timeout} seconds. ' - ) - output += ( - '\r\n\r\n' - + f'[{_additional_msg}SIGINT was sent to interrupt the command.]' - ) - return output, exit_code - except pexpect.TIMEOUT as e: - logger.warning(f'Bash pexpect.TIMEOUT while waiting for SIGINT: {e}') - max_retries -= 1 - - # fall back to send control-z - logger.error( - 'Failed to get output after SIGINT. Max retries reached. Sending control-z...' + command: str, + pane_content: str, + ps1_matches: list[re.Match], + timeout: float, + ) -> CmdOutputObservation: + self.prev_status = BashCommandStatus.HARD_TIMEOUT + if len(ps1_matches) != 1: + logger.warning( + 'Expected exactly one PS1 metadata block BEFORE the execution of a command, ' + f'but got {len(ps1_matches)} PS1 metadata blocks:\n---\n{pane_content!r}\n---' + ) + raw_command_output = self._combine_outputs_between_matches( + pane_content, ps1_matches + ) + metadata = CmdOutputMetadata() # No metadata available + metadata.suffix = ( + f'\n[The command timed out after {timeout} seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' ) - self.shell.sendcontrol('z') - self.shell.expect(self.__bash_expect_regex) - output = self.shell.before - logger.debug(f'Received output after control-z: {output}') - # Try to kill the job - self.shell.sendline('kill -9 %1') - self.shell.expect(self.__bash_expect_regex) - logger.debug(f'Received output after killing job %1: {self.shell.before}') - output += self.shell.before - - _additional_msg = '' - if action_timeout is not None: - _additional_msg = f'Command timed out after {action_timeout} seconds. ' - output += ( - '\r\n\r\n' - + f'[{_additional_msg}SIGINT was sent to interrupt the command, but failed. The command was killed.]' + command_output = self._get_command_output( + command, + raw_command_output, + metadata, + continue_prefix='[Command output continued from previous command]\n', ) - # Try to get the exit code again - self.shell.sendline('echo $?') - self.shell.expect(self.__bash_expect_regex) - _exit_code_output = self.shell.before - exit_code = self._parse_exit_code(_exit_code_output) - - return output, exit_code - - def _parse_exit_code(self, output: str) -> int: - try: - exit_code = int(output.strip().split()[0]) - except Exception: - logger.error('Error getting exit code from bash script') - # If we try to run an invalid shell script the output sometimes includes error text - # rather than the error code - we assume this is an error - exit_code = 2 - return exit_code - - def _continue_bash( + return CmdOutputObservation( + command=command, + content=command_output, + metadata=metadata, + ) + + def _ready_for_next_command(self): + """Reset the content buffer for a new command.""" + # Clear the current content + self._clear_screen() + + def _combine_outputs_between_matches( self, - timeout: int, - keep_prompt: bool = True, - kill_on_timeout: bool = True, - ) -> tuple[str, int]: - logger.debug(f'Continuing bash with timeout={timeout}') - try: - self.shell.expect(self.__bash_expect_regex, timeout=timeout) - - output = self.shell.before - - # Get exit code - self.shell.sendline('echo $?') - logger.debug('Requesting exit code...') - self.shell.expect(self.__bash_expect_regex, timeout=timeout) - _exit_code_output = self.shell.before - exit_code = self._parse_exit_code(_exit_code_output) - except pexpect.TIMEOUT as e: - logger.warning(f'Bash pexpect.TIMEOUT while executing bash command: {e}') - if kill_on_timeout: - output, exit_code = self._interrupt_bash(action_timeout=timeout) + pane_content: str, + ps1_matches: list[re.Match], + get_content_before_last_match: bool = False, + ) -> str: + """Combine all outputs between PS1 matches. + + Args: + pane_content: The full pane content containing PS1 prompts and command outputs + ps1_matches: List of regex matches for PS1 prompts + get_content_before_last_match: when there's only one PS1 match, whether to get + the content before the last PS1 prompt (True) or after the last PS1 prompt (False) + Returns: + Combined string of all outputs between matches + """ + if len(ps1_matches) == 1: + if get_content_before_last_match: + # The command output is the content before the last PS1 prompt + return pane_content[: ps1_matches[0].start()] else: - output = self.shell.before or '' - exit_code = -1 - finally: - bash_prompt = self._get_bash_prompt_and_update_pwd() - if keep_prompt: - output += '\r\n' + bash_prompt - return output, exit_code - - def run(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservation: - try: - assert ( - action.timeout is not None - ), f'Timeout argument is required for CmdRunAction: {action}' - commands = split_bash_commands(action.command) - all_output = '' - python_interpreter = '' - for command in commands: - if command == '': - output, exit_code = self._continue_bash( - timeout=SOFT_TIMEOUT_SECONDS, - keep_prompt=action.keep_prompt, - kill_on_timeout=False, - ) - elif command.lower() == 'ctrl+c': - output, exit_code = self._interrupt_bash( - action_timeout=None, # intentionally None - ) - else: - output, exit_code = self._execute_bash( - command, - timeout=SOFT_TIMEOUT_SECONDS - if not action.blocking - else action.timeout, - keep_prompt=action.keep_prompt, - kill_on_timeout=False if not action.blocking else True, - ) - # Get rid of the python interpreter string from each line of the output. - # We need it only once at the end. - parts = output.rsplit('[Python Interpreter: ', 1) - output = parts[0] - if len(parts) == 2: - python_interpreter = '[Python Interpreter: ' + parts[1] - if all_output: - # previous output already exists so we add a newline - all_output += '\r\n' - - # If the command originated with the agent, append the command that was run... - if action.source == EventSource.AGENT: - all_output += command + '\r\n' - - all_output += str(output) - if exit_code != 0: - break + # The command output is the content after the last PS1 prompt + return pane_content[ps1_matches[0].end() + 1 :] + combined_output = '' + for i in range(len(ps1_matches) - 1): + # Extract content between current and next PS1 prompt + output_segment = pane_content[ + ps1_matches[i].end() + 1 : ps1_matches[i + 1].start() + ] + combined_output += output_segment + '\n' + logger.debug(f'COMBINED OUTPUT: {combined_output}') + return combined_output + + def execute(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservation: + """Execute a command in the bash session.""" + if not self._initialized: + raise RuntimeError('Bash session is not initialized') + + # Strip the command of any leading/trailing whitespace + logger.debug(f'RECEIVED ACTION: {action}') + command = action.command.strip() + + if command == '' and self.prev_status not in { + BashCommandStatus.CONTINUE, + BashCommandStatus.NO_CHANGE_TIMEOUT, + BashCommandStatus.HARD_TIMEOUT, + }: return CmdOutputObservation( - command_id=-1, - content=all_output.rstrip('\r\n'), - command=action.command, - hidden=action.hidden, - exit_code=exit_code, - interpreter_details=python_interpreter, + content='ERROR: No previous command to continue from. ' + + 'Previous command has to be timeout to be continued.', + command='', + metadata=CmdOutputMetadata(), ) - except UnicodeDecodeError as e: + + splited_commands = split_bash_commands(command) + if len(splited_commands) > 1: return ErrorObservation( - f'Runtime bash execution failed: Command output could not be decoded as utf-8. {str(e)}', + content=( + f'ERROR: Cannot execute multiple commands at once.\n' + f'Please run each command separately OR chain them into a single command via && or ;\n' + f'Provided commands:\n{"\n".join(f"({i+1}) {cmd}" for i, cmd in enumerate(splited_commands))}' + ) ) + + start_time = time.time() + last_change_time = start_time + last_pane_output = self._get_pane_content() + + _ps1_matches = CmdOutputMetadata.matches_ps1_metadata(last_pane_output) + assert len(_ps1_matches) >= 1, ( + 'Expected at least one PS1 metadata block BEFORE the execution of a command, ' + f'but got {len(_ps1_matches)} PS1 metadata blocks:\n---\n{last_pane_output!r}\n---' + ) + if len(_ps1_matches) > 1: + logger.warning( + 'Found multiple PS1 metadata blocks BEFORE the execution of a command. ' + 'Only the last one will be used.' + ) + _ps1_matches = [_ps1_matches[-1]] + + if command != '': + # convert command to raw string + command = escape_bash_special_chars(command) + logger.debug(f'SENDING COMMAND: {command!r}') + self.pane.send_keys( + command, + enter=not self._is_special_key(command), + ) + + # Loop until the command completes or times out + while should_continue(): + _start_time = time.time() + logger.debug(f'GETTING PANE CONTENT at {_start_time}') + cur_pane_output = self._get_pane_content() + logger.debug( + f'PANE CONTENT GOT after {time.time() - _start_time:.2f} seconds' + ) + logger.debug(f'BEGIN OF PANE CONTENT: {cur_pane_output.split("\n")[:10]}') + logger.debug(f'END OF PANE CONTENT: {cur_pane_output.split("\n")[-10:]}') + ps1_matches = CmdOutputMetadata.matches_ps1_metadata(cur_pane_output) + if cur_pane_output != last_pane_output: + last_pane_output = cur_pane_output + last_change_time = time.time() + logger.debug(f'CONTENT UPDATED DETECTED at {last_change_time}') + + # 1) Execution completed + # if the last command output contains the end marker + if cur_pane_output.rstrip().endswith(CMD_OUTPUT_PS1_END.rstrip()): + return self._handle_completed_command( + command, + pane_content=cur_pane_output, + ps1_matches=ps1_matches, + ) + + # 2) Execution timed out since there's no change in output + # for a while (self.NO_CHANGE_TIMEOUT_SECONDS) + # We ignore this if the command is *blocking + time_since_last_change = time.time() - last_change_time + logger.debug( + f'CHECKING NO CHANGE TIMEOUT ({self.NO_CHANGE_TIMEOUT_SECONDS}s): elapsed {time_since_last_change}' + ) + if ( + not action.blocking + and time_since_last_change >= self.NO_CHANGE_TIMEOUT_SECONDS + ): + return self._handle_nochange_timeout_command( + command, + pane_content=cur_pane_output, + ps1_matches=ps1_matches, + ) + + # 3) Execution timed out due to hard timeout + logger.debug( + f'CHECKING HARD TIMEOUT ({action.timeout}s): elapsed {time.time() - start_time}' + ) + if action.timeout and time.time() - start_time >= action.timeout: + return self._handle_hard_timeout_command( + command, + pane_content=cur_pane_output, + ps1_matches=ps1_matches, + timeout=action.timeout, + ) + + logger.debug(f'SLEEPING for {self.POLL_INTERVAL} seconds for next poll') + time.sleep(self.POLL_INTERVAL) + raise RuntimeError('Bash session was likely interrupted...') diff --git a/openhands/runtime/utils/runtime_init.py b/openhands/runtime/utils/runtime_init.py index 9ebba67fcd31..b38ab7ca3495 100644 --- a/openhands/runtime/utils/runtime_init.py +++ b/openhands/runtime/utils/runtime_init.py @@ -4,7 +4,7 @@ def init_user_and_working_directory( - username: str, user_id: int, initial_pwd: str + username: str, user_id: int, initial_cwd: str ) -> int | None: """Create working directory and user if not exists. It performs the following steps effectively: @@ -26,23 +26,23 @@ def init_user_and_working_directory( Args: username (str): The username to create. user_id (int): The user ID to assign to the user. - initial_pwd (str): The initial working directory to create. + initial_cwd (str): The initial working directory to create. Returns: int | None: The user ID if it was updated, None otherwise. """ # First create the working directory, independent of the user - logger.debug(f'Client working directory: {initial_pwd}') - command = f'umask 002; mkdir -p {initial_pwd}' + logger.debug(f'Client working directory: {initial_cwd}') + command = f'umask 002; mkdir -p {initial_cwd}' output = subprocess.run(command, shell=True, capture_output=True) out_str = output.stdout.decode() - command = f'chown -R {username}:root {initial_pwd}' + command = f'chown -R {username}:root {initial_cwd}' output = subprocess.run(command, shell=True, capture_output=True) out_str += output.stdout.decode() - command = f'chmod g+rw {initial_pwd}' + command = f'chmod g+rw {initial_cwd}' output = subprocess.run(command, shell=True, capture_output=True) out_str += output.stdout.decode() logger.debug(f'Created working directory. Output: [{out_str}]') diff --git a/openhands/runtime/utils/runtime_templates/Dockerfile.j2 b/openhands/runtime/utils/runtime_templates/Dockerfile.j2 index 7423421252ab..8a97792d9de3 100644 --- a/openhands/runtime/utils/runtime_templates/Dockerfile.j2 +++ b/openhands/runtime/utils/runtime_templates/Dockerfile.j2 @@ -15,7 +15,7 @@ ENV POETRY_VIRTUALENVS_PATH=/openhands/poetry \ # Install base system dependencies RUN apt-get update && \ apt-get install -y --no-install-recommends \ - wget curl sudo apt-utils git jq \ + wget curl sudo apt-utils git jq tmux \ {% if 'ubuntu' in base_image and (base_image.endswith(':latest') or base_image.endswith(':24.04')) %} libgl1 \ {% else %} diff --git a/poetry.lock b/poetry.lock index b1263ea1f9ca..54f3fd9fd419 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -3711,6 +3711,17 @@ websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" [package.extras] adal = ["adal (>=1.0.2)"] +[[package]] +name = "libtmux" +version = "0.37.0" +description = "Typed library that provides an ORM wrapper for tmux, a terminal multiplexer." +optional = false +python-versions = "<4.0,>=3.8" +files = [ + {file = "libtmux-0.37.0-py3-none-any.whl", hash = "sha256:7e8cbab30b033d132b6fca5dddb575bb7f6a1fd802328e7174f9b49023556376"}, + {file = "libtmux-0.37.0.tar.gz", hash = "sha256:21955c5dce6332db41abad5e26ae8c4062ef2b9a89099bd57a36f52be1d5270f"}, +] + [[package]] name = "libwebarena" version = "0.0.3" @@ -8298,6 +8309,25 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] pymysql = ["pymysql"] sqlcipher = ["sqlcipher3_binary"] +[[package]] +name = "sse-starlette" +version = "2.1.3" +description = "SSE plugin for Starlette" +optional = false +python-versions = ">=3.8" +files = [ + {file = "sse_starlette-2.1.3-py3-none-any.whl", hash = "sha256:8ec846438b4665b9e8c560fcdea6bc8081a3abf7942faa95e5a744999d219772"}, + {file = "sse_starlette-2.1.3.tar.gz", hash = "sha256:9cd27eb35319e1414e3d2558ee7414487f9529ce3b3cf9b21434fd110e017169"}, +] + +[package.dependencies] +anyio = "*" +starlette = "*" +uvicorn = "*" + +[package.extras] +examples = ["fastapi"] + [[package]] name = "stack-data" version = "0.6.3" @@ -10054,4 +10084,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "691bdd0f64e3476858eb34ce6ed6d0b0e7d97458cfd69fd366cd9c1c4f4ec897" +content-hash = "db887f071f7dbb712cfba5d9b4de8938afbedee22fd166b4527f4aec40e37cfd" diff --git a/pyproject.toml b/pyproject.toml index be7cb9a3a7ce..037025e70ea4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "openhands-ai" -version = "0.18.0" +version = "0.18.0a0" description = "OpenHands: Code Less, Make More" authors = ["OpenHands"] license = "MIT" @@ -62,11 +62,13 @@ opentelemetry-api = "1.25.0" opentelemetry-exporter-otlp-proto-grpc = "1.25.0" modal = ">=0.66.26,<0.71.0" runloop-api-client = "0.11.0" +libtmux = "^0.37.0" pygithub = "^2.5.0" joblib = "*" openhands-aci = "0.1.5" python-socketio = "^5.11.4" redis = "^5.2.0" +sse-starlette = "^2.1.3" [tool.poetry.group.llama-index.dependencies] llama-index = "*" @@ -100,6 +102,7 @@ reportlab = "*" [tool.coverage.run] concurrency = ["gevent"] + [tool.poetry.group.runtime.dependencies] jupyterlab = "*" notebook = "*" @@ -129,6 +132,7 @@ ignore = ["D1"] [tool.ruff.lint.pydocstyle] convention = "google" + [tool.poetry.group.evaluation.dependencies] streamlit = "*" whatthepatch = "*" diff --git a/tests/runtime/conftest.py b/tests/runtime/conftest.py index 062fc2ed9fa1..83038fa28666 100644 --- a/tests/runtime/conftest.py +++ b/tests/runtime/conftest.py @@ -69,7 +69,7 @@ def _close_test_runtime(runtime: Runtime) -> None: time.sleep(1) -def _reset_pwd() -> None: +def _reset_cwd() -> None: global project_dir # Try to change back to project directory try: @@ -152,16 +152,16 @@ def get_run_as_openhands() -> list[bool]: @pytest.fixture(scope='module') # for xdist def runtime_setup_module(): - _reset_pwd() + _reset_cwd() yield - _reset_pwd() + _reset_cwd() @pytest.fixture(scope='session') # not for xdist def runtime_setup_session(): - _reset_pwd() + _reset_cwd() yield - _reset_pwd() + _reset_cwd() # This assures that all tests run together per runtime, not alternating between them, @@ -230,14 +230,14 @@ def _load_runtime( global test_mount_path if use_workspace: test_mount_path = os.path.join(config.workspace_base, 'rt') + elif temp_dir is not None: + test_mount_path = os.path.join(temp_dir, sid) else: - test_mount_path = os.path.join( - temp_dir, sid - ) # need a subfolder to avoid conflicts + test_mount_path = None config.workspace_mount_path = test_mount_path # Mounting folder specific for this test inside the sandbox - config.workspace_mount_path_in_sandbox = f'{sandbox_test_folder}/{sid}' + config.workspace_mount_path_in_sandbox = f'{sandbox_test_folder}' print('\nPaths used:') print(f'use_host_network: {config.sandbox.use_host_network}') print(f'workspace_base: {config.workspace_base}') diff --git a/tests/runtime/test_bash.py b/tests/runtime/test_bash.py index f8ff95d9a6b2..b97cdbc9266b 100644 --- a/tests/runtime/test_bash.py +++ b/tests/runtime/test_bash.py @@ -5,7 +5,6 @@ import pytest from conftest import ( - TEST_IN_CI, _close_test_runtime, _get_sandbox_folder, _load_runtime, @@ -13,7 +12,7 @@ from openhands.core.logger import openhands_logger as logger from openhands.events.action import CmdRunAction -from openhands.events.observation import CmdOutputObservation +from openhands.events.observation import CmdOutputObservation, ErrorObservation from openhands.runtime.base import Runtime # ============================================================================================================================ @@ -21,36 +20,19 @@ # ============================================================================================================================ -def _run_cmd_action(runtime, custom_command: str, keep_prompt=True): - action = CmdRunAction(command=custom_command, keep_prompt=keep_prompt) +def _run_cmd_action(runtime, custom_command: str): + action = CmdRunAction(command=custom_command) logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) - assert isinstance(obs, CmdOutputObservation) + assert isinstance(obs, (CmdOutputObservation, ErrorObservation)) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) return obs -def test_bash_command_pexcept(temp_dir, runtime_cls, run_as_openhands): +def test_bash_command_env(temp_dir, runtime_cls, run_as_openhands): runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) try: - # We set env var PS1="\u@\h:\w $" - # and construct the PEXCEPT prompt base on it. - # When run `env`, bad implementation of CmdRunAction will be pexcepted by this - # and failed to pexcept the right content, causing it fail to get error code. obs = runtime.run_action(CmdRunAction(command='env')) - - # For example: - # 02:16:13 - openhands:DEBUG: client.py:78 - Executing command: env - # 02:16:13 - openhands:DEBUG: client.py:82 - Command output: PYTHONUNBUFFERED=1 - # CONDA_EXE=/openhands/miniforge3/bin/conda - # [...] - # LC_CTYPE=C.UTF-8 - # PS1=\u@\h:\w $ - # 02:16:13 - openhands:DEBUG: client.py:89 - Executing command for exit code: env - # 02:16:13 - openhands:DEBUG: client.py:92 - Exit code Output: - # CONDA_DEFAULT_ENV=base - - # As long as the exit code is 0, the test will pass. assert isinstance( obs, CmdOutputObservation ), 'The observation should be a CmdOutputObservation.' @@ -59,62 +41,29 @@ def test_bash_command_pexcept(temp_dir, runtime_cls, run_as_openhands): _close_test_runtime(runtime) -def test_bash_timeout_and_keyboard_interrupt(temp_dir, runtime_cls, run_as_openhands): +def test_bash_server(temp_dir, runtime_cls, run_as_openhands): runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) try: - action = CmdRunAction(command='python -c "import time; time.sleep(10)"') + action = CmdRunAction(command='python3 -m http.server 8080') action.timeout = 1 obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == -1 + assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content assert ( - '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]' - in obs.content + "[The command timed out after 1 seconds. You may wait longer to see additional output by sending empty command '', send other commands to interact with the current process, or send keys to interrupt/kill the command.]" + in obs.metadata.suffix ) - assert 'KeyboardInterrupt' in obs.content - # follow up command should not be affected - action = CmdRunAction(command='ls') - action.timeout = 1 + action = CmdRunAction(command='C-c') + action.timeout = 30 obs = runtime.run_action(action) - assert isinstance(obs, CmdOutputObservation) - assert obs.exit_code == 0 logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - - # run it again! - action = CmdRunAction(command='python -c "import time; time.sleep(10)"') - action.timeout = 1 - obs = runtime.run_action(action) - assert isinstance(obs, CmdOutputObservation) - assert ( - '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]' - in obs.content - ) - assert 'KeyboardInterrupt' in obs.content - - # things should still work - action = CmdRunAction(command='ls') - action.timeout = 1 - obs = runtime.run_action(action) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 - assert '/workspace' in obs.interpreter_details - - finally: - _close_test_runtime(runtime) - - -def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands): - runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) - try: - action = CmdRunAction(command='python3 -m http.server 8080') - action.timeout = 1 - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert isinstance(obs, CmdOutputObservation) - assert obs.exit_code == 130 # script was killed by SIGINT - assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content assert 'Keyboard interrupt received, exiting.' in obs.content + assert '/workspace' in obs.metadata.working_dir action = CmdRunAction(command='ls') action.timeout = 1 @@ -122,7 +71,8 @@ def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands): logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) assert obs.exit_code == 0 - assert '/workspace' in obs.interpreter_details + assert 'Keyboard interrupt received, exiting.' not in obs.content + assert '/workspace' in obs.metadata.working_dir # run it again! action = CmdRunAction(command='python3 -m http.server 8080') @@ -130,122 +80,8 @@ def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands): obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert isinstance(obs, CmdOutputObservation) - assert obs.exit_code == 130 # script was killed by SIGINT + assert obs.exit_code == -1 assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content - assert 'Keyboard interrupt received, exiting.' in obs.content - - # things should still work - action = CmdRunAction(command='ls') - action.timeout = 1 - obs = runtime.run_action(action) - assert isinstance(obs, CmdOutputObservation) - assert obs.exit_code == 0 - assert '/workspace' in obs.interpreter_details - finally: - _close_test_runtime(runtime) - - -def test_process_resistant_to_one_sigint(temp_dir, runtime_cls, run_as_openhands): - runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) - try: - # Create a bash script that ignores SIGINT up to 1 times - script_content = """ -#!/bin/bash -trap_count=0 -trap 'echo "Caught SIGINT ($((++trap_count))/1), ignoring..."; [ $trap_count -ge 1 ] && trap - INT && exit' INT -while true; do - echo "Still running..." - sleep 1 -done - """.strip() - - with open(f'{temp_dir}/resistant_script.sh', 'w') as f: - f.write(script_content) - os.chmod(f'{temp_dir}/resistant_script.sh', 0o777) - - runtime.copy_to( - os.path.join(temp_dir, 'resistant_script.sh'), - runtime.config.workspace_mount_path_in_sandbox, - ) - - # Run the resistant script - action = CmdRunAction(command='sudo bash ./resistant_script.sh') - action.timeout = 5 - action.blocking = True - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert isinstance(obs, CmdOutputObservation) - assert obs.exit_code == 130 # script was killed by SIGINT - assert 'Still running...' in obs.content - assert 'Caught SIGINT (1/1), ignoring...' in obs.content - assert 'Stopped' not in obs.content - assert ( - '[Command timed out after 5 seconds. SIGINT was sent to interrupt the command.]' - in obs.content - ) - - # Normal command should still work - action = CmdRunAction(command='ls') - action.timeout = 10 - obs = runtime.run_action(action) - assert isinstance(obs, CmdOutputObservation) - assert obs.exit_code == 0 - assert '/workspace' in obs.interpreter_details - assert 'resistant_script.sh' in obs.content - - finally: - _close_test_runtime(runtime) - - -def test_process_resistant_to_multiple_sigint(temp_dir, runtime_cls, run_as_openhands): - runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) - try: - # Create a bash script that ignores SIGINT up to 2 times - script_content = """ -#!/bin/bash -trap_count=0 -trap 'echo "Caught SIGINT ($((++trap_count))/3), ignoring..."; [ $trap_count -ge 3 ] && trap - INT && exit' INT -while true; do - echo "Still running..." - sleep 1 -done - """.strip() - - with open(f'{temp_dir}/resistant_script.sh', 'w') as f: - f.write(script_content) - os.chmod(f'{temp_dir}/resistant_script.sh', 0o777) - - runtime.copy_to( - os.path.join(temp_dir, 'resistant_script.sh'), - runtime.config.workspace_mount_path_in_sandbox, - ) - - # Run the resistant script - action = CmdRunAction(command='sudo bash ./resistant_script.sh') - action.timeout = 2 - action.blocking = True - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert isinstance(obs, CmdOutputObservation) - assert obs.exit_code == 0 - assert 'Still running...' in obs.content - assert 'Caught SIGINT (1/3), ignoring...' in obs.content - assert '[1]+' and 'Stopped' in obs.content - assert ( - '[Command timed out after 2 seconds. SIGINT was sent to interrupt the command, but failed. The command was killed.]' - in obs.content - ) - - # Normal command should still work - action = CmdRunAction(command='ls') - action.timeout = 10 - obs = runtime.run_action(action) - assert isinstance(obs, CmdOutputObservation) - assert obs.exit_code == 0 - assert '/workspace' in obs.interpreter_details - assert 'resistant_script.sh' in obs.content finally: _close_test_runtime(runtime) @@ -262,12 +98,12 @@ def test_multiline_commands(temp_dir, runtime_cls): # test multiline echo obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"') assert obs.exit_code == 0, 'The exit code should be 0.' - assert 'hello\r\nworld' in obs.content + assert 'hello\nworld' in obs.content # test whitespace obs = _run_cmd_action(runtime, 'echo -e "a\\n\\n\\nz"') assert obs.exit_code == 0, 'The exit code should be 0.' - assert '\r\n\r\n\r\n' in obs.content + assert '\n\n\n' in obs.content finally: _close_test_runtime(runtime) @@ -276,43 +112,43 @@ def test_multiple_multiline_commands(temp_dir, runtime_cls, run_as_openhands): cmds = [ 'ls -l', 'echo -e "hello\nworld"', - """ -echo -e "hello it\\'s me" -""".strip(), - """ -echo \\ + """echo -e "hello it's me\"""", + """echo \\ -e 'hello' \\ - -v -""".strip(), - """ -echo -e 'hello\\nworld\\nare\\nyou\\nthere?' -""".strip(), - """ -echo -e 'hello -world -are -you\\n -there?' -""".strip(), - """ -echo -e 'hello -world " -' -""".strip(), + -v""", + """echo -e 'hello\\nworld\\nare\\nyou\\nthere?'""", + """echo -e 'hello\nworld\nare\nyou\n\nthere?'""", + """echo -e 'hello\nworld "'""", ] joined_cmds = '\n'.join(cmds) runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) try: + # First test that running multiple commands at once fails obs = _run_cmd_action(runtime, joined_cmds) - assert obs.exit_code == 0, 'The exit code should be 0.' - - assert 'total 0' in obs.content - assert 'hello\r\nworld' in obs.content - assert "hello it\\'s me" in obs.content - assert 'hello -v' in obs.content - assert 'hello\r\nworld\r\nare\r\nyou\r\nthere?' in obs.content - assert 'hello\r\nworld\r\nare\r\nyou\r\n\r\nthere?' in obs.content + assert isinstance(obs, ErrorObservation) + assert 'Cannot execute multiple commands at once' in obs.content + + # Now run each command individually and verify they work + results = [] + for cmd in cmds: + obs = _run_cmd_action(runtime, cmd) + assert isinstance(obs, CmdOutputObservation) + assert obs.exit_code == 0 + results.append(obs.content) + + # Verify all expected outputs are present + assert 'total 0' in results[0] # ls -l + assert 'hello\nworld' in results[1] # echo -e "hello\nworld" + assert "hello it's me" in results[2] # echo -e "hello it\'s me" + assert 'hello -v' in results[3] # echo -e 'hello' -v + assert ( + 'hello\nworld\nare\nyou\nthere?' in results[4] + ) # echo -e 'hello\nworld\nare\nyou\nthere?' + assert ( + 'hello\nworld\nare\nyou\n\nthere?' in results[5] + ) # echo -e with literal newlines + assert 'hello\nworld "' in results[6] # echo -e with quote finally: _close_test_runtime(runtime) @@ -324,7 +160,7 @@ def test_no_ps2_in_output(temp_dir, runtime_cls, run_as_openhands): obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"') assert obs.exit_code == 0, 'The exit code should be 0.' - assert 'hello\r\nworld' in obs.content + assert 'hello\nworld' in obs.content assert '>' not in obs.content finally: _close_test_runtime(runtime) @@ -332,21 +168,17 @@ def test_no_ps2_in_output(temp_dir, runtime_cls, run_as_openhands): def test_multiline_command_loop(temp_dir, runtime_cls): # https://github.com/All-Hands-AI/OpenHands/issues/3143 - init_cmd = """ -mkdir -p _modules && \ + init_cmd = """mkdir -p _modules && \ for month in {01..04}; do for day in {01..05}; do touch "_modules/2024-${month}-${day}-sample.md" done -done -echo "created files" +done && echo "created files" """ - follow_up_cmd = """ -for file in _modules/*.md; do + follow_up_cmd = """for file in _modules/*.md; do new_date=$(echo $file | sed -E 's/2024-(01|02|03|04)-/2024-/;s/2024-01/2024-08/;s/2024-02/2024-09/;s/2024-03/2024-10/;s/2024-04/2024-11/') mv "$file" "$new_date" -done -echo "success" +done && echo "success" """ runtime = _load_runtime(temp_dir, runtime_cls) try: @@ -424,7 +256,6 @@ def test_multi_cmd_run_in_single_line(temp_dir, runtime_cls): def test_stateful_cmd(temp_dir, runtime_cls): runtime = _load_runtime(temp_dir, runtime_cls) - sandbox_dir = _get_sandbox_folder(runtime) try: obs = _run_cmd_action(runtime, 'mkdir -p test') assert obs.exit_code == 0, 'The exit code should be 0.' @@ -434,7 +265,7 @@ def test_stateful_cmd(temp_dir, runtime_cls): obs = _run_cmd_action(runtime, 'pwd') assert obs.exit_code == 0, 'The exit code should be 0.' - assert f'{sandbox_dir}/test' in obs.content + assert '/workspace/test' in obs.content finally: _close_test_runtime(runtime) @@ -532,7 +363,7 @@ def test_copy_to_non_existent_directory(temp_dir, runtime_cls): def test_overwrite_existing_file(temp_dir, runtime_cls): runtime = _load_runtime(temp_dir, runtime_cls) try: - sandbox_dir = _get_sandbox_folder(runtime) + sandbox_dir = '/openhands/workspace' obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}') assert obs.exit_code == 0 @@ -595,38 +426,13 @@ def test_copy_from_directory(temp_dir, runtime_cls): _close_test_runtime(runtime) -def test_keep_prompt(runtime_cls, temp_dir): - runtime = _load_runtime( - temp_dir, - runtime_cls=runtime_cls, - run_as_openhands=False, - ) - try: - sandbox_dir = _get_sandbox_folder(runtime) - - obs = _run_cmd_action(runtime, f'touch {sandbox_dir}/test_file.txt') - assert obs.exit_code == 0 - assert 'root@' in obs.interpreter_details - - obs = _run_cmd_action( - runtime, f'cat {sandbox_dir}/test_file.txt', keep_prompt=False - ) - assert obs.exit_code == 0 - assert 'root@' not in obs.interpreter_details - finally: - _close_test_runtime(runtime) - - -@pytest.mark.skipif( - TEST_IN_CI != 'True', - reason='This test is not working in WSL (file ownership)', -) def test_git_operation(runtime_cls): # do not mount workspace, since workspace mount by tests will be owned by root # while the user_id we get via os.getuid() is different from root # which causes permission issues runtime = _load_runtime( temp_dir=None, + use_workspace=False, runtime_cls=runtime_cls, # Need to use non-root user to expose issues run_as_openhands=True, @@ -634,12 +440,15 @@ def test_git_operation(runtime_cls): # this will happen if permission of runtime is not properly configured # fatal: detected dubious ownership in repository at '/workspace' try: + obs = _run_cmd_action(runtime, 'sudo chown -R openhands:root .') + assert obs.exit_code == 0 + # check the ownership of the current directory obs = _run_cmd_action(runtime, 'ls -alh .') assert obs.exit_code == 0 # drwx--S--- 2 openhands root 64 Aug 7 23:32 . # drwxr-xr-x 1 root root 4.0K Aug 7 23:33 .. - for line in obs.content.split('\r\n'): + for line in obs.content.split('\n'): if ' ..' in line: # parent directory should be owned by root assert 'root' in line @@ -663,8 +472,10 @@ def test_git_operation(runtime_cls): assert obs.exit_code == 0 # git diff - obs = _run_cmd_action(runtime, 'git diff') + obs = _run_cmd_action(runtime, 'git diff --no-color --cached') assert obs.exit_code == 0 + assert 'b/test_file.txt' in obs.content + assert '+hello' in obs.content # git commit obs = _run_cmd_action(runtime, 'git commit -m "test commit"') @@ -685,3 +496,276 @@ def test_python_version(temp_dir, runtime_cls, run_as_openhands): assert 'Python 3' in obs.content, 'The output should contain "Python 3".' finally: _close_test_runtime(runtime) + + +def test_pwd_property(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Create a subdirectory and verify pwd updates + obs = _run_cmd_action(runtime, 'mkdir -p random_dir') + assert obs.exit_code == 0 + + obs = _run_cmd_action(runtime, 'cd random_dir && pwd') + assert obs.exit_code == 0 + assert 'random_dir' in obs.content + finally: + _close_test_runtime(runtime) + + +def test_basic_command(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Test simple command + obs = _run_cmd_action(runtime, "echo 'hello world'") + assert 'hello world' in obs.content + assert obs.exit_code == 0 + + # Test command with error + obs = _run_cmd_action(runtime, 'nonexistent_command') + assert obs.exit_code == 127 + assert 'nonexistent_command: command not found' in obs.content + + # Test command with special characters + obs = _run_cmd_action(runtime, "echo 'hello world with\nspecial chars'") + assert 'hello world with\nspecial chars' in obs.content + assert obs.exit_code == 0 + + # Test multiple commands in sequence + obs = _run_cmd_action(runtime, 'echo "first" && echo "second" && echo "third"') + assert 'first' in obs.content + assert 'second' in obs.content + assert 'third' in obs.content + assert obs.exit_code == 0 + finally: + _close_test_runtime(runtime) + + +def test_interactive_command(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Test interactive command + action = CmdRunAction('read -p "Enter name: " name && echo "Hello $name"') + action.timeout = 1 + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + # assert 'Enter name:' in obs.content # FIXME: this is not working + assert '[The command timed out after 1 seconds.' in obs.metadata.suffix + + action = CmdRunAction('John') + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Hello John' in obs.content + assert '[The command completed with exit code 0.]' in obs.metadata.suffix + + # Test multiline command input with here document + action = CmdRunAction("""cat << EOF +line 1 +line 2 +EOF""") + obs = runtime.run_action(action) + assert 'line 1\nline 2' in obs.content + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '[The command completed with exit code 0.]' in obs.metadata.suffix + assert obs.exit_code == 0 + finally: + _close_test_runtime(runtime) + + +def test_long_output(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Generate a long output + action = CmdRunAction('for i in $(seq 1 5000); do echo "Line $i"; done') + action.timeout = 10 + obs = runtime.run_action(action) + assert obs.exit_code == 0 + assert 'Line 1' in obs.content + assert 'Line 5000' in obs.content + finally: + _close_test_runtime(runtime) + + +def test_long_output_exceed_history_limit(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Generate a long output + action = CmdRunAction('for i in $(seq 1 50000); do echo "Line $i"; done') + action.timeout = 30 + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + assert 'Previous command outputs are truncated' in obs.metadata.prefix + assert 'Line 40000' in obs.content + assert 'Line 50000' in obs.content + finally: + _close_test_runtime(runtime) + + +def test_long_output_from_nested_directories(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Create nested directories with many files + setup_cmd = 'mkdir -p /tmp/test_dir && cd /tmp/test_dir && for i in $(seq 1 100); do mkdir -p "folder_$i"; for j in $(seq 1 100); do touch "folder_$i/file_$j.txt"; done; done' + setup_action = CmdRunAction(setup_cmd.strip()) + setup_action.timeout = 60 + obs = runtime.run_action(setup_action) + assert obs.exit_code == 0 + + # List the directory structure recursively + action = CmdRunAction('ls -R /tmp/test_dir') + action.timeout = 60 + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + # Verify output contains expected files + assert 'folder_1' in obs.content + assert 'file_1.txt' in obs.content + assert 'folder_100' in obs.content + assert 'file_100.txt' in obs.content + finally: + _close_test_runtime(runtime) + + +def test_command_backslash(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Create a file with the content "implemented_function" + action = CmdRunAction( + 'mkdir -p /tmp/test_dir && echo "implemented_function" > /tmp/test_dir/file_1.txt' + ) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + + # Reproduce an issue we ran into during evaluation + # find /workspace/sympy__sympy__1.0 -type f -exec grep -l "implemented_function" {} \; + # find: missing argument to `-exec' + # --> This is unexpected output due to incorrect escaping of \; + # This tests for correct escaping of \; + action = CmdRunAction( + 'find /tmp/test_dir -type f -exec grep -l "implemented_function" {} \\;' + ) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + assert '/tmp/test_dir/file_1.txt' in obs.content + finally: + _close_test_runtime(runtime) + + +def test_command_output_continuation(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Start a command that produces output slowly + action = CmdRunAction('for i in {1..5}; do echo $i; sleep 3; done') + action.timeout = 2.5 # Set timeout to 2.5 seconds + obs = runtime.run_action(action) + assert obs.content.strip() == '1' + assert obs.metadata.prefix == '' + assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix + + # Continue watching output + action = CmdRunAction('') + action.timeout = 2.5 + obs = runtime.run_action(action) + assert '[Command output continued from previous command]' in obs.metadata.prefix + assert obs.content.strip() == '2' + assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix + + # Continue until completion + for expected in ['3', '4', '5']: + action = CmdRunAction('') + action.timeout = 2.5 + obs = runtime.run_action(action) + assert ( + '[Command output continued from previous command]' + in obs.metadata.prefix + ) + assert obs.content.strip() == expected + assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix + + # Final empty command to complete + action = CmdRunAction('') + obs = runtime.run_action(action) + assert '[The command completed with exit code 0.]' in obs.metadata.suffix + finally: + _close_test_runtime(runtime) + + +def test_long_running_command_follow_by_execute( + temp_dir, runtime_cls, run_as_openhands +): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Test command that produces output slowly + action = CmdRunAction('for i in {1..3}; do echo $i; sleep 3; done') + action.timeout = 2.5 + action.blocking = False + obs = runtime.run_action(action) + assert '1' in obs.content # First number should appear before timeout + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix + assert obs.metadata.prefix == '' + + # Continue watching output + action = CmdRunAction('') + action.timeout = 2.5 + obs = runtime.run_action(action) + assert '2' in obs.content + assert ( + obs.metadata.prefix == '[Command output continued from previous command]\n' + ) + assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + + # Test command that produces no output + action = CmdRunAction('sleep 15') + action.timeout = 2.5 + obs = runtime.run_action(action) + assert '3' in obs.content + assert ( + obs.metadata.prefix == '[Command output continued from previous command]\n' + ) + assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + finally: + _close_test_runtime(runtime) + + +def test_empty_command_errors(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Test empty command without previous command + obs = runtime.run_action(CmdRunAction('')) + assert isinstance(obs, CmdOutputObservation) + assert 'ERROR: No previous command to continue from' in obs.content + finally: + _close_test_runtime(runtime) + + +def test_python_interactive_input(temp_dir, runtime_cls, run_as_openhands): + runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) + try: + # Test Python program that asks for input - properly escaped for bash + python_script = """name = input('Enter your name: '); age = input('Enter your age: '); print(f'Hello {name}, you are {age} years old')""" + + # Start Python with the interactive script + obs = runtime.run_action(CmdRunAction(f'python3 -c "{python_script}"')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Enter your name:' in obs.content + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + + # Send first input (name) + obs = runtime.run_action(CmdRunAction('Alice')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Enter your age:' in obs.content + assert obs.metadata.exit_code == -1 + + # Send second input (age) + obs = runtime.run_action(CmdRunAction('25')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Hello Alice, you are 25 years old' in obs.content + assert obs.metadata.exit_code == 0 + assert '[The command completed with exit code 0.]' in obs.metadata.suffix + finally: + _close_test_runtime(runtime) diff --git a/tests/runtime/test_ipython.py b/tests/runtime/test_ipython.py index caee30001e81..8b5c89053b52 100644 --- a/tests/runtime/test_ipython.py +++ b/tests/runtime/test_ipython.py @@ -4,7 +4,6 @@ from conftest import ( TEST_IN_CI, _close_test_runtime, - _get_sandbox_folder, _load_runtime, ) @@ -33,8 +32,6 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands): runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) - sandbox_dir = _get_sandbox_folder(runtime) - # Test run command action_cmd = CmdRunAction(command='ls -l') logger.info(action_cmd, extra={'msg_type': 'ACTION'}) @@ -55,7 +52,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands): logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.content.strip() == ( 'Hello, `World`!\n' - f'[Jupyter current working directory: {sandbox_dir}]\n' + '[Jupyter current working directory: /openhands/workspace]\n' '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python]' ) @@ -76,7 +73,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands): assert obs.content == '' # event stream runtime will always use absolute path - assert obs.path == f'{sandbox_dir}/hello.sh' + assert obs.path == '/openhands/workspace/hello.sh' # Test read file (file should exist) action_read = FileReadAction(path='hello.sh') @@ -88,7 +85,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands): logger.info(obs, extra={'msg_type': 'OBSERVATION'}) assert obs.content == 'echo "Hello, World!"\n' - assert obs.path == f'{sandbox_dir}/hello.sh' + assert obs.path == '/openhands/workspace/hello.sh' # clean up action = CmdRunAction(command='rm -rf hello.sh') @@ -178,7 +175,6 @@ def test_ipython_multi_user(temp_dir, runtime_cls, run_as_openhands): def test_ipython_simple(temp_dir, runtime_cls): runtime = _load_runtime(temp_dir, runtime_cls) - sandbox_dir = _get_sandbox_folder(runtime) # Test run ipython # get username @@ -192,7 +188,7 @@ def test_ipython_simple(temp_dir, runtime_cls): obs.content.strip() == ( '1\n' - f'[Jupyter current working directory: {sandbox_dir}]\n' + '[Jupyter current working directory: /openhands/workspace]\n' '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python]' ).strip() ) @@ -203,7 +199,6 @@ def test_ipython_simple(temp_dir, runtime_cls): def test_ipython_package_install(temp_dir, runtime_cls, run_as_openhands): """Make sure that cd in bash also update the current working directory in ipython.""" runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) - sandbox_dir = _get_sandbox_folder(runtime) # It should error out since pymsgbox is not installed action = IPythonRunCellAction(code='import pymsgbox') @@ -229,7 +224,7 @@ def test_ipython_package_install(temp_dir, runtime_cls, run_as_openhands): # import should not error out assert obs.content.strip() == ( '[Code executed successfully with no output]\n' - f'[Jupyter current working directory: {sandbox_dir}]\n' + '[Jupyter current working directory: /openhands/workspace]\n' '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python]' ) @@ -239,7 +234,6 @@ def test_ipython_package_install(temp_dir, runtime_cls, run_as_openhands): def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls): """Test file editor permission behavior when running as different users.""" runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands=True) - sandbox_dir = _get_sandbox_folder(runtime) # Create a file owned by root with restricted permissions action = CmdRunAction( @@ -277,18 +271,18 @@ def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls): assert 'Permission denied' in obs.content # Try to use file editor in openhands sandbox directory - should work - test_code = f""" + test_code = """ # Create file -print(file_editor(command='create', path='{sandbox_dir}/test.txt', file_text='Line 1\\nLine 2\\nLine 3')) +print(file_editor(command='create', path='/openhands/workspace/test.txt', file_text='Line 1\\nLine 2\\nLine 3')) # View file -print(file_editor(command='view', path='{sandbox_dir}/test.txt')) +print(file_editor(command='view', path='/openhands/workspace/test.txt')) # Edit file -print(file_editor(command='str_replace', path='{sandbox_dir}/test.txt', old_str='Line 2', new_str='New Line 2')) +print(file_editor(command='str_replace', path='/openhands/workspace/test.txt', old_str='Line 2', new_str='New Line 2')) # Undo edit -print(file_editor(command='undo_edit', path='{sandbox_dir}/test.txt')) +print(file_editor(command='undo_edit', path='/openhands/workspace/test.txt')) """ action = IPythonRunCellAction(code=test_code) logger.info(action, extra={'msg_type': 'ACTION'}) @@ -303,7 +297,7 @@ def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls): assert 'undone successfully' in obs.content # Clean up - action = CmdRunAction(command=f'rm -f {sandbox_dir}/test.txt') + action = CmdRunAction(command='rm -f /openhands/workspace/test.txt') logger.info(action, extra={'msg_type': 'ACTION'}) obs = runtime.run_action(action) logger.info(obs, extra={'msg_type': 'OBSERVATION'}) @@ -318,9 +312,9 @@ def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls): _close_test_runtime(runtime) -def test_file_read_and_edit_via_oh_aci(temp_dir, runtime_cls, run_as_openhands): - runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands) - sandbox_dir = _get_sandbox_folder(runtime) +def test_file_read_and_edit_via_oh_aci(runtime_cls, run_as_openhands): + runtime = _load_runtime(None, runtime_cls, run_as_openhands) + sandbox_dir = '/openhands/workspace' actions = [ { diff --git a/tests/unit/resolver/test_resolve_issues.py b/tests/unit/resolver/test_resolve_issues.py index 9f5c8dc1f891..fcc12f1d0698 100644 --- a/tests/unit/resolver/test_resolve_issues.py +++ b/tests/unit/resolver/test_resolve_issues.py @@ -6,7 +6,11 @@ from openhands.core.config import LLMConfig from openhands.events.action import CmdRunAction -from openhands.events.observation import CmdOutputObservation, NullObservation +from openhands.events.observation import ( + CmdOutputMetadata, + CmdOutputObservation, + NullObservation, +) from openhands.llm.llm import LLM from openhands.resolver.github_issue import GithubIssue, ReviewThread from openhands.resolver.issue_definitions import IssueHandler, PRHandler @@ -55,23 +59,20 @@ def mock_followup_prompt_template(): return 'Issue context: {{ issues }}\n\nReview comments: {{ review_comments }}\n\nReview threads: {{ review_threads }}\n\nFiles: {{ files }}\n\nThread comments: {{ thread_context }}\n\nPlease fix this issue.' -def create_cmd_output(exit_code: int, content: str, command_id: int, command: str): +def create_cmd_output(exit_code: int, content: str, command: str): return CmdOutputObservation( - exit_code=exit_code, content=content, command_id=command_id, command=command + content=content, + command=command, + metadata=CmdOutputMetadata(exit_code=exit_code), ) def test_initialize_runtime(): mock_runtime = MagicMock() mock_runtime.run_action.side_effect = [ + create_cmd_output(exit_code=0, content='', command='cd /workspace'), create_cmd_output( - exit_code=0, content='', command_id=1, command='cd /workspace' - ), - create_cmd_output( - exit_code=0, - content='', - command_id=2, - command='git config --global core.pager ""', + exit_code=0, content='', command='git config --global core.pager ""' ), ] @@ -291,30 +292,19 @@ def get_mock_response(url, *args, **kwargs): async def test_complete_runtime(): mock_runtime = MagicMock() mock_runtime.run_action.side_effect = [ + create_cmd_output(exit_code=0, content='', command='cd /workspace'), create_cmd_output( - exit_code=0, content='', command_id=1, command='cd /workspace' + exit_code=0, content='', command='git config --global core.pager ""' ), create_cmd_output( exit_code=0, content='', - command_id=2, - command='git config --global core.pager ""', - ), - create_cmd_output( - exit_code=0, - content='', - command_id=3, command='git config --global --add safe.directory /workspace', ), create_cmd_output( - exit_code=0, - content='', - command_id=4, - command='git diff base_commit_hash fix', - ), - create_cmd_output( - exit_code=0, content='git diff content', command_id=5, command='git apply' + exit_code=0, content='', command='git diff base_commit_hash fix' ), + create_cmd_output(exit_code=0, content='git diff content', command='git apply'), ] result = await complete_runtime(mock_runtime, 'base_commit_hash') @@ -614,11 +604,7 @@ def test_guess_success(): title='Test Issue', body='This is a test issue', ) - mock_history = [ - create_cmd_output( - exit_code=0, content='', command_id=1, command='cd /workspace' - ) - ] + mock_history = [create_cmd_output(exit_code=0, content='', command='cd /workspace')] mock_llm_config = LLMConfig(model='test_model', api_key='test_api_key') mock_completion_response = MagicMock() @@ -758,11 +744,7 @@ def test_guess_success_negative_case(): title='Test Issue', body='This is a test issue', ) - mock_history = [ - create_cmd_output( - exit_code=0, content='', command_id=1, command='cd /workspace' - ) - ] + mock_history = [create_cmd_output(exit_code=0, content='', command='cd /workspace')] mock_llm_config = LLMConfig(model='test_model', api_key='test_api_key') mock_completion_response = MagicMock() @@ -795,11 +777,7 @@ def test_guess_success_invalid_output(): title='Test Issue', body='This is a test issue', ) - mock_history = [ - create_cmd_output( - exit_code=0, content='', command_id=1, command='cd /workspace' - ) - ] + mock_history = [create_cmd_output(exit_code=0, content='', command='cd /workspace')] mock_llm_config = LLMConfig(model='test_model', api_key='test_api_key') mock_completion_response = MagicMock() diff --git a/tests/unit/test_action_serialization.py b/tests/unit/test_action_serialization.py index 318dd612a2d7..374d5eee0100 100644 --- a/tests/unit/test_action_serialization.py +++ b/tests/unit/test_action_serialization.py @@ -41,11 +41,10 @@ def serialization_deserialization( serialized_action_memory = event_to_memory(action_instance, max_message_chars) original_memory_dict = original_action_dict.copy() - # we don't send backend properties like id or 'keep_prompt' + # we don't send backend properties like id original_memory_dict.pop('id', None) original_memory_dict.pop('timestamp', None) if 'args' in original_memory_dict: - original_memory_dict['args'].pop('keep_prompt', None) original_memory_dict['args'].pop('blocking', None) original_memory_dict['args'].pop('confirmation_state', None) @@ -99,7 +98,6 @@ def test_cmd_run_action_serialization_deserialization(): 'blocking': False, 'command': 'echo "Hello world"', 'thought': '', - 'keep_prompt': True, 'hidden': False, 'confirmation_state': ActionConfirmationStatus.CONFIRMED, }, @@ -154,3 +152,32 @@ def test_file_write_action_serialization_deserialization(): }, } serialization_deserialization(original_action_dict, FileWriteAction) + + +def test_legacy_serialization(): + original_action_dict = { + 'action': 'run', + 'args': { + 'blocking': False, + 'command': 'echo "Hello world"', + 'thought': '', + 'hidden': False, + 'confirmation_state': ActionConfirmationStatus.CONFIRMED, + 'keep_prompt': False, # will be treated as no-op + }, + } + event = event_from_dict(original_action_dict) + assert isinstance(event, Action) + assert isinstance(event, CmdRunAction) + assert event.command == 'echo "Hello world"' + assert event.hidden is False + assert not hasattr(event, 'keep_prompt') + + event_dict = event_to_dict(event) + assert 'keep_prompt' not in event_dict['args'] + assert ( + event_dict['args']['confirmation_state'] == ActionConfirmationStatus.CONFIRMED + ) + assert event_dict['args']['blocking'] is False + assert event_dict['args']['command'] == 'echo "Hello world"' + assert event_dict['args']['thought'] == '' diff --git a/tests/unit/test_bash_parsing.py b/tests/unit/test_bash_parsing.py index 4ba75028bba3..38ee0f221c42 100644 --- a/tests/unit/test_bash_parsing.py +++ b/tests/unit/test_bash_parsing.py @@ -1,6 +1,6 @@ import pytest -from openhands.runtime.utils.bash import split_bash_commands +from openhands.runtime.utils.bash import escape_bash_special_chars, split_bash_commands def test_split_commands_util(): @@ -257,3 +257,172 @@ def test_split_commands_with_invalid_input(): for input_command in invalid_inputs: # it will fall back to return the original input assert split_bash_commands(input_command) == [input_command] + + +def test_escape_bash_special_chars(): + test_cases = [ + # Basic cases - use raw strings (r'') to avoid Python escape sequence warnings + ('echo test \\; ls', 'echo test \\\\; ls'), + ('grep pattern \\| sort', 'grep pattern \\\\| sort'), + ('cmd1 \\&\\& cmd2', 'cmd1 \\\\&\\\\& cmd2'), + ('cat file \\> output.txt', 'cat file \\\\> output.txt'), + ('cat \\< input.txt', 'cat \\\\< input.txt'), + # Quoted strings should remain unchanged + ('echo "test \\; unchanged"', 'echo "test \\; unchanged"'), + ("echo 'test \\| unchanged'", "echo 'test \\| unchanged'"), + # Mixed quoted and unquoted + ( + 'echo "quoted \\;" \\; "more" \\| grep', + 'echo "quoted \\;" \\\\; "more" \\\\| grep', + ), + # Multiple escapes in sequence + ('cmd1 \\;\\|\\& cmd2', 'cmd1 \\\\;\\\\|\\\\& cmd2'), + # Commands with other backslashes + ('echo test\\ntest', 'echo test\\ntest'), + ('echo "test\\ntest"', 'echo "test\\ntest"'), + # Edge cases + ('', ''), # Empty string + ('\\\\', '\\\\'), # Double backslash + ('\\"', '\\"'), # Escaped quote + ] + + for input_cmd, expected in test_cases: + result = escape_bash_special_chars(input_cmd) + assert ( + result == expected + ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"' + + +def test_escape_bash_special_chars_with_invalid_syntax(): + invalid_inputs = [ + 'echo "unclosed quote', + "echo 'unclosed quote", + 'cat < out.txt', + 'echo "$(pwd)" && cat file \\\\> out.txt', + ), + # Multiple chains + ('cmd1 && cmd2 && cmd3', 'cmd1 && cmd2 && cmd3'), + ( + 'cmd1 \\; ls && cmd2 \\| grep && cmd3', + 'cmd1 \\\\; ls && cmd2 \\\\| grep && cmd3', + ), + ] + + for input_cmd, expected in test_cases: + result = escape_bash_special_chars(input_cmd) + assert ( + result == expected + ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"' diff --git a/tests/unit/test_bash_ps1_metadata.py b/tests/unit/test_bash_ps1_metadata.py new file mode 100644 index 000000000000..023e3c2af101 --- /dev/null +++ b/tests/unit/test_bash_ps1_metadata.py @@ -0,0 +1,339 @@ +import json + +from openhands.events.observation.commands import ( + CMD_OUTPUT_METADATA_PS1_REGEX, + CMD_OUTPUT_PS1_BEGIN, + CMD_OUTPUT_PS1_END, + CmdOutputMetadata, + CmdOutputObservation, +) + + +def test_ps1_metadata_format(): + """Test that PS1 prompt has correct format markers""" + prompt = CmdOutputMetadata.to_ps1_prompt() + print(prompt) + assert prompt.startswith('\n###PS1JSON###\n') + assert prompt.endswith('\n###PS1END###\n') + assert r'\"exit_code\"' in prompt, 'PS1 prompt should contain escaped double quotes' + + +def test_ps1_metadata_json_structure(): + """Test that PS1 prompt contains valid JSON with expected fields""" + prompt = CmdOutputMetadata.to_ps1_prompt() + # Extract JSON content between markers + json_str = prompt.replace('###PS1JSON###\n', '').replace('\n###PS1END###\n', '') + # Remove escaping before parsing + json_str = json_str.replace(r'\"', '"') + # Remove any trailing content after the JSON + json_str = json_str.split('###PS1END###')[0].strip() + data = json.loads(json_str) + + # Check required fields + expected_fields = { + 'pid', + 'exit_code', + 'username', + 'hostname', + 'working_dir', + 'py_interpreter_path', + } + assert set(data.keys()) == expected_fields + + +def test_ps1_metadata_parsing(): + """Test parsing PS1 output into CmdOutputMetadata""" + test_data = { + 'exit_code': 0, + 'username': 'testuser', + 'hostname': 'localhost', + 'working_dir': '/home/testuser', + 'py_interpreter_path': '/usr/bin/python', + } + + ps1_str = f"""###PS1JSON### +{json.dumps(test_data, indent=2)} +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str) + assert len(matches) == 1 + metadata = CmdOutputMetadata.from_ps1_match(matches[0]) + assert metadata.exit_code == test_data['exit_code'] + assert metadata.username == test_data['username'] + assert metadata.hostname == test_data['hostname'] + assert metadata.working_dir == test_data['working_dir'] + assert metadata.py_interpreter_path == test_data['py_interpreter_path'] + + +def test_ps1_metadata_parsing_string(): + """Test parsing PS1 output into CmdOutputMetadata""" + ps1_str = r"""###PS1JSON### +{ + "exit_code": "0", + "username": "myname", + "hostname": "myhostname", + "working_dir": "~/mydir", + "py_interpreter_path": "/my/python/path" +} +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str) + assert len(matches) == 1 + metadata = CmdOutputMetadata.from_ps1_match(matches[0]) + assert metadata.exit_code == 0 + assert metadata.username == 'myname' + assert metadata.hostname == 'myhostname' + assert metadata.working_dir == '~/mydir' + assert metadata.py_interpreter_path == '/my/python/path' + + +def test_ps1_metadata_parsing_string_real_example(): + """Test parsing PS1 output into CmdOutputMetadata""" + ps1_str = r""" +###PS1JSON### +{ + "pid": "", + "exit_code": "0", + "username": "runner", + "hostname": "fv-az1055-610", + "working_dir": "/home/runner/work/OpenHands/OpenHands", + "py_interpreter_path": "/home/runner/.cache/pypoetry/virtualenvs/openhands-ai-ULPBlkAi-py3.12/bin/python" +} +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str) + assert len(matches) == 1 + metadata = CmdOutputMetadata.from_ps1_match(matches[0]) + assert metadata.exit_code == 0 + assert metadata.username == 'runner' + assert metadata.hostname == 'fv-az1055-610' + assert metadata.working_dir == '/home/runner/work/OpenHands/OpenHands' + assert ( + metadata.py_interpreter_path + == '/home/runner/.cache/pypoetry/virtualenvs/openhands-ai-ULPBlkAi-py3.12/bin/python' + ) + + +def test_ps1_metadata_parsing_additional_prefix(): + """Test parsing PS1 output into CmdOutputMetadata""" + test_data = { + 'exit_code': 0, + 'username': 'testuser', + 'hostname': 'localhost', + 'working_dir': '/home/testuser', + 'py_interpreter_path': '/usr/bin/python', + } + + ps1_str = f""" +This is something that not part of the PS1 prompt + +###PS1JSON### +{json.dumps(test_data, indent=2)} +###PS1END### +""" + + matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str) + assert len(matches) == 1 + metadata = CmdOutputMetadata.from_ps1_match(matches[0]) + assert metadata.exit_code == test_data['exit_code'] + assert metadata.username == test_data['username'] + assert metadata.hostname == test_data['hostname'] + assert metadata.working_dir == test_data['working_dir'] + assert metadata.py_interpreter_path == test_data['py_interpreter_path'] + + +def test_ps1_metadata_parsing_invalid(): + """Test parsing invalid PS1 output returns default metadata""" + # Test with invalid JSON + invalid_json = """###PS1JSON### + {invalid json} +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(invalid_json) + assert len(matches) == 0 # No matches should be found for invalid JSON + + # Test with missing markers + invalid_format = """NOT A VALID PS1 PROMPT""" + matches = CmdOutputMetadata.matches_ps1_metadata(invalid_format) + assert len(matches) == 0 + + # Test with empty PS1 metadata + empty_metadata = """###PS1JSON### + +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(empty_metadata) + assert len(matches) == 0 # No matches should be found for empty metadata + + # Test with whitespace in PS1 metadata + whitespace_metadata = """###PS1JSON### + + { + "exit_code": "0", + "pid": "123", + "username": "test", + "hostname": "localhost", + "working_dir": "/home/test", + "py_interpreter_path": "/usr/bin/python" + } + +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(whitespace_metadata) + assert len(matches) == 1 + metadata = CmdOutputMetadata.from_ps1_match(matches[0]) + assert metadata.exit_code == 0 + assert metadata.pid == 123 + + +def test_ps1_metadata_missing_fields(): + """Test handling of missing fields in PS1 metadata""" + # Test with only required fields + minimal_data = {'exit_code': 0, 'pid': 123} + ps1_str = f"""###PS1JSON### +{json.dumps(minimal_data)} +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str) + assert len(matches) == 1 + metadata = CmdOutputMetadata.from_ps1_match(matches[0]) + assert metadata.exit_code == 0 + assert metadata.pid == 123 + assert metadata.username is None + assert metadata.hostname is None + assert metadata.working_dir is None + assert metadata.py_interpreter_path is None + + # Test with missing exit_code but valid pid + no_exit_code = {'pid': 123, 'username': 'test'} + ps1_str = f"""###PS1JSON### +{json.dumps(no_exit_code)} +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str) + assert len(matches) == 1 + metadata = CmdOutputMetadata.from_ps1_match(matches[0]) + assert metadata.exit_code == -1 # default value + assert metadata.pid == 123 + assert metadata.username == 'test' + + +def test_ps1_metadata_multiple_blocks(): + """Test handling multiple PS1 metadata blocks""" + test_data = { + 'exit_code': 0, + 'username': 'testuser', + 'hostname': 'localhost', + 'working_dir': '/home/testuser', + 'py_interpreter_path': '/usr/bin/python', + } + + ps1_str = f"""###PS1JSON### +{json.dumps(test_data, indent=2)} +###PS1END### +Some other content +###PS1JSON### +{json.dumps(test_data, indent=2)} +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str) + assert len(matches) == 2 # Should find both blocks + # Both blocks should parse successfully + metadata1 = CmdOutputMetadata.from_ps1_match(matches[0]) + metadata2 = CmdOutputMetadata.from_ps1_match(matches[1]) + assert metadata1.exit_code == test_data['exit_code'] + assert metadata2.exit_code == test_data['exit_code'] + + +def test_ps1_metadata_regex_pattern(): + """Test the regex pattern used to extract PS1 metadata""" + # Test basic pattern matching + test_str = f'{CMD_OUTPUT_PS1_BEGIN}test\n{CMD_OUTPUT_PS1_END}' + matches = CMD_OUTPUT_METADATA_PS1_REGEX.finditer(test_str) + match = next(matches) + assert match.group(1).strip() == 'test' + + # Test with content before and after + test_str = f'prefix\n{CMD_OUTPUT_PS1_BEGIN}test\n{CMD_OUTPUT_PS1_END}suffix' + matches = CMD_OUTPUT_METADATA_PS1_REGEX.finditer(test_str) + match = next(matches) + assert match.group(1).strip() == 'test' + + # Test with multiline content + test_str = f'{CMD_OUTPUT_PS1_BEGIN}line1\nline2\nline3\n{CMD_OUTPUT_PS1_END}' + matches = CMD_OUTPUT_METADATA_PS1_REGEX.finditer(test_str) + match = next(matches) + assert match.group(1).strip() == 'line1\nline2\nline3' + + +def test_cmd_output_observation_properties(): + """Test CmdOutputObservation class properties""" + # Test with successful command + metadata = CmdOutputMetadata(exit_code=0, pid=123) + obs = CmdOutputObservation(command='ls', content='file1\nfile2', metadata=metadata) + assert obs.command_id == 123 + assert obs.exit_code == 0 + assert not obs.error + assert 'exit code 0' in obs.message + assert 'ls' in obs.message + assert 'file1' in str(obs) + assert 'file2' in str(obs) + assert 'metadata' in str(obs) + + # Test with failed command + metadata = CmdOutputMetadata(exit_code=1, pid=456) + obs = CmdOutputObservation(command='invalid', content='error', metadata=metadata) + assert obs.command_id == 456 + assert obs.exit_code == 1 + assert obs.error + assert 'exit code 1' in obs.message + assert 'invalid' in obs.message + assert 'error' in str(obs) + + +def test_ps1_metadata_empty_fields(): + """Test handling of empty fields in PS1 metadata""" + # Test with empty strings + empty_data = { + 'exit_code': 0, + 'pid': 123, + 'username': '', + 'hostname': '', + 'working_dir': '', + 'py_interpreter_path': '', + } + ps1_str = f"""###PS1JSON### +{json.dumps(empty_data)} +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str) + assert len(matches) == 1 + metadata = CmdOutputMetadata.from_ps1_match(matches[0]) + assert metadata.exit_code == 0 + assert metadata.pid == 123 + assert metadata.username == '' + assert metadata.hostname == '' + assert metadata.working_dir == '' + assert metadata.py_interpreter_path == '' + + # Test with malformed but valid JSON + malformed_json = """###PS1JSON### + { + "exit_code":0, + "pid" : 123, + "username": "test" , + "hostname": "host", + "working_dir" :"dir", + "py_interpreter_path":"path" + } +###PS1END### +""" + matches = CmdOutputMetadata.matches_ps1_metadata(malformed_json) + assert len(matches) == 1 + metadata = CmdOutputMetadata.from_ps1_match(matches[0]) + assert metadata.exit_code == 0 + assert metadata.pid == 123 + assert metadata.username == 'test' + assert metadata.hostname == 'host' + assert metadata.working_dir == 'dir' + assert metadata.py_interpreter_path == 'path' diff --git a/tests/unit/test_bash_session.py b/tests/unit/test_bash_session.py new file mode 100644 index 000000000000..13f32ad27d25 --- /dev/null +++ b/tests/unit/test_bash_session.py @@ -0,0 +1,384 @@ +import os +import tempfile + +from openhands.core.logger import openhands_logger as logger +from openhands.events.action import CmdRunAction +from openhands.runtime.utils.bash import BashCommandStatus, BashSession + + +def test_session_initialization(): + # Test with custom working directory + with tempfile.TemporaryDirectory() as temp_dir: + session = BashSession(work_dir=temp_dir) + session.initialize() + obs = session.execute(CmdRunAction('pwd')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert temp_dir in obs.content + assert '[The command completed with exit code 0.]' in obs.metadata.suffix + session.close() + + # Test with custom username + session = BashSession(work_dir=os.getcwd(), username='nobody') + session.initialize() + assert 'openhands-nobody' in session.session.name + session.close() + + +def test_cwd_property(tmp_path): + session = BashSession(work_dir=tmp_path) + session.initialize() + # Change directory and verify pwd updates + random_dir = tmp_path / 'random' + random_dir.mkdir() + session.execute(CmdRunAction(f'cd {random_dir}')) + assert session.cwd == str(random_dir) + session.close() + + +def test_basic_command(): + session = BashSession(work_dir=os.getcwd()) + session.initialize() + + # Test simple command + obs = session.execute(CmdRunAction("echo 'hello world'")) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'hello world' in obs.content + assert obs.metadata.suffix == '\n[The command completed with exit code 0.]' + assert obs.metadata.prefix == '' + assert obs.metadata.exit_code == 0 + assert session.prev_status == BashCommandStatus.COMPLETED + + # Test command with error + obs = session.execute(CmdRunAction('nonexistent_command')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.metadata.exit_code == 127 + assert 'nonexistent_command: command not found' in obs.content + assert obs.metadata.suffix == '\n[The command completed with exit code 127.]' + assert obs.metadata.prefix == '' + assert session.prev_status == BashCommandStatus.COMPLETED + + # Test multiple commands in sequence + obs = session.execute(CmdRunAction('echo "first" && echo "second" && echo "third"')) + assert 'first' in obs.content + assert 'second' in obs.content + assert 'third' in obs.content + assert obs.metadata.suffix == '\n[The command completed with exit code 0.]' + assert obs.metadata.prefix == '' + assert obs.metadata.exit_code == 0 + assert session.prev_status == BashCommandStatus.COMPLETED + + session.close() + + +def test_long_running_command_follow_by_execute(): + session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2) + session.initialize() + + # Test command that produces output slowly + obs = session.execute( + CmdRunAction('for i in {1..3}; do echo $i; sleep 3; done', blocking=False) + ) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '1' in obs.content # First number should appear before timeout + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + assert obs.metadata.suffix == ( + '\n[The command has no new output after 2 seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' + ) + assert obs.metadata.prefix == '' + + # Continue watching output + obs = session.execute(CmdRunAction('')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '2' in obs.content + assert obs.metadata.prefix == '[Command output continued from previous command]\n' + assert obs.metadata.suffix == ( + '\n[The command has no new output after 2 seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' + ) + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + # Test command that produces no output + obs = session.execute(CmdRunAction('sleep 15')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '3' in obs.content + assert obs.metadata.prefix == '[Command output continued from previous command]\n' + assert obs.metadata.suffix == ( + '\n[The command has no new output after 2 seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' + ) + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + session.close() + + +def test_interactive_command(): + session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=3) + session.initialize() + + # Test interactive command with blocking=True + obs = session.execute( + CmdRunAction( + 'read -p \'Enter name: \' name && echo "Hello $name"', + ) + ) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Enter name:' in obs.content + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + assert obs.metadata.suffix == ( + '\n[The command has no new output after 3 seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' + ) + assert obs.metadata.prefix == '' + + # Send input + obs = session.execute(CmdRunAction('John')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Hello John' in obs.content + assert obs.metadata.exit_code == 0 + assert obs.metadata.suffix == '\n[The command completed with exit code 0.]' + assert obs.metadata.prefix == '' + assert session.prev_status == BashCommandStatus.COMPLETED + + # Test multiline command input + obs = session.execute(CmdRunAction('cat << EOF')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.metadata.exit_code == -1 + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + assert obs.metadata.suffix == ( + '\n[The command has no new output after 3 seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' + ) + assert obs.metadata.prefix == '' + + obs = session.execute(CmdRunAction('line 1')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.metadata.exit_code == -1 + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + assert obs.metadata.suffix == ( + '\n[The command has no new output after 3 seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' + ) + assert obs.metadata.prefix == '[Command output continued from previous command]\n' + + obs = session.execute(CmdRunAction('line 2')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.metadata.exit_code == -1 + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + assert obs.metadata.suffix == ( + '\n[The command has no new output after 3 seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' + ) + assert obs.metadata.prefix == '[Command output continued from previous command]\n' + + obs = session.execute(CmdRunAction('EOF')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'line 1' in obs.content and 'line 2' in obs.content + assert obs.metadata.exit_code == 0 + assert obs.metadata.suffix == '\n[The command completed with exit code 0.]' + assert obs.metadata.prefix == '' + + session.close() + + +def test_ctrl_c(): + session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2) + session.initialize() + + # Start infinite loop + obs = session.execute( + CmdRunAction("while true; do echo 'looping'; sleep 3; done"), + ) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'looping' in obs.content + assert obs.metadata.suffix == ( + '\n[The command has no new output after 2 seconds. ' + "You may wait longer to see additional output by sending empty command '', " + 'send other commands to interact with the current process, ' + 'or send keys to interrupt/kill the command.]' + ) + assert obs.metadata.prefix == '' + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + # Send Ctrl+C + obs = session.execute(CmdRunAction('C-c')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.metadata.exit_code == 130 # Standard exit code for Ctrl+C + assert ( + obs.metadata.suffix + == '\n[The command completed with exit code 130. CTRL+C was sent.]' + ) + assert obs.metadata.prefix == '' + assert session.prev_status == BashCommandStatus.COMPLETED + + session.close() + + +def test_empty_command_errors(): + session = BashSession(work_dir=os.getcwd()) + session.initialize() + + # Test empty command without previous command + obs = session.execute(CmdRunAction('')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert ( + obs.content + == 'ERROR: No previous command to continue from. Previous command has to be timeout to be continued.' + ) + assert obs.metadata.exit_code == -1 + assert obs.metadata.prefix == '' + assert obs.metadata.suffix == '' + assert session.prev_status is None + + session.close() + + +def test_command_output_continuation(): + session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2) + session.initialize() + + # Start a command that produces output slowly + obs = session.execute(CmdRunAction('for i in {1..5}; do echo $i; sleep 3; done')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.content.strip() == '1' + assert obs.metadata.prefix == '' + assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + obs = session.execute(CmdRunAction('')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '[Command output continued from previous command]' in obs.metadata.prefix + assert obs.content.strip() == '2' + assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + obs = session.execute(CmdRunAction('')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '[Command output continued from previous command]' in obs.metadata.prefix + assert obs.content.strip() == '3' + + assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + obs = session.execute(CmdRunAction('')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '[Command output continued from previous command]' in obs.metadata.prefix + assert obs.content.strip() == '4' + assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + obs = session.execute(CmdRunAction('')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '[Command output continued from previous command]' in obs.metadata.prefix + assert obs.content.strip() == '5' + assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + obs = session.execute(CmdRunAction('')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert '[The command completed with exit code 0.]' in obs.metadata.suffix + assert session.prev_status == BashCommandStatus.COMPLETED + + session.close() + + +def test_long_output(): + session = BashSession(work_dir=os.getcwd()) + session.initialize() + + # Generate a long output that may exceed buffer size + obs = session.execute(CmdRunAction('for i in {1..5000}; do echo "Line $i"; done')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Line 1' in obs.content + assert 'Line 5000' in obs.content + assert obs.metadata.exit_code == 0 + assert obs.metadata.prefix == '' + assert obs.metadata.suffix == '\n[The command completed with exit code 0.]' + + session.close() + + +def test_long_output_exceed_history_limit(): + session = BashSession(work_dir=os.getcwd()) + session.initialize() + + # Generate a long output that may exceed buffer size + obs = session.execute(CmdRunAction('for i in {1..50000}; do echo "Line $i"; done')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Previous command outputs are truncated' in obs.metadata.prefix + assert 'Line 40000' in obs.content + assert 'Line 50000' in obs.content + assert obs.metadata.exit_code == 0 + assert obs.metadata.suffix == '\n[The command completed with exit code 0.]' + + session.close() + + +def test_multiline_command(): + session = BashSession(work_dir=os.getcwd()) + session.initialize() + + # Test multiline command with PS2 prompt disabled + obs = session.execute( + CmdRunAction("""if true; then +echo "inside if" +fi""") + ) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'inside if' in obs.content + assert obs.metadata.exit_code == 0 + assert obs.metadata.prefix == '' + assert obs.metadata.suffix == '\n[The command completed with exit code 0.]' + + session.close() + + +def test_python_interactive_input(): + session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2) + session.initialize() + + # Test Python program that asks for input - properly escaped for bash + python_script = """name = input('Enter your name: '); age = input('Enter your age: '); print(f'Hello {name}, you are {age} years old')""" + + # Start Python with the interactive script + obs = session.execute(CmdRunAction(f'python3 -c "{python_script}"')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Enter your name:' in obs.content + assert obs.metadata.exit_code == -1 # -1 indicates command is still running + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + # Send first input (name) + obs = session.execute(CmdRunAction('Alice')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Enter your age:' in obs.content + assert obs.metadata.exit_code == -1 + assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT + + # Send second input (age) + obs = session.execute(CmdRunAction('25')) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'Hello Alice, you are 25 years old' in obs.content + assert obs.metadata.exit_code == 0 + assert obs.metadata.suffix == '\n[The command completed with exit code 0.]' + assert session.prev_status == BashCommandStatus.COMPLETED + + session.close() diff --git a/tests/unit/test_codeact_agent.py b/tests/unit/test_codeact_agent.py index 28696df7d2a7..fa265e750b23 100644 --- a/tests/unit/test_codeact_agent.py +++ b/tests/unit/test_codeact_agent.py @@ -26,6 +26,7 @@ from openhands.events.event import EventSource, FileEditSource, FileReadSource from openhands.events.observation.browse import BrowserOutputObservation from openhands.events.observation.commands import ( + CmdOutputMetadata, CmdOutputObservation, IPythonRunCellObservation, ) @@ -50,7 +51,11 @@ def agent() -> CodeActAgent: def test_cmd_output_observation_message(agent: CodeActAgent): agent.config.function_calling = False obs = CmdOutputObservation( - command='echo hello', content='Command output', command_id=1, exit_code=0 + command='echo hello', + content='Command output', + metadata=CmdOutputMetadata( + exit_code=0, + ), ) results = agent.get_observation_message(obs, tool_call_id_to_message={}) @@ -62,7 +67,7 @@ def test_cmd_output_observation_message(agent: CodeActAgent): assert len(result.content) == 1 assert isinstance(result.content[0], TextContent) assert 'Command output' in result.content[0].text - assert 'Command finished with exit code 0' in result.content[0].text + assert '[Command finished with exit code 0]' in result.content[0].text def test_ipython_run_cell_observation_message(agent: CodeActAgent): diff --git a/tests/unit/test_command_success.py b/tests/unit/test_command_success.py index b52ceb4815c7..298a3bcb4f6c 100644 --- a/tests/unit/test_command_success.py +++ b/tests/unit/test_command_success.py @@ -1,4 +1,5 @@ from openhands.events.observation.commands import ( + CmdOutputMetadata, CmdOutputObservation, IPythonRunCellObservation, ) @@ -7,14 +8,18 @@ def test_cmd_output_success(): # Test successful command obs = CmdOutputObservation( - command_id=1, command='ls', content='file1.txt\nfile2.txt', exit_code=0 + command='ls', + content='file1.txt\nfile2.txt', + metadata=CmdOutputMetadata(exit_code=0), ) assert obs.success is True assert obs.error is False # Test failed command obs = CmdOutputObservation( - command_id=2, command='ls', content='No such file or directory', exit_code=1 + command='ls', + content='No such file or directory', + metadata=CmdOutputMetadata(exit_code=1), ) assert obs.success is False assert obs.error is True diff --git a/tests/unit/test_event_serialization.py b/tests/unit/test_event_serialization.py index d1989a30bb09..52df3ef1d0e1 100644 --- a/tests/unit/test_event_serialization.py +++ b/tests/unit/test_event_serialization.py @@ -1,18 +1,22 @@ -from openhands.events.observation import CmdOutputObservation +from openhands.events.observation import CmdOutputMetadata, CmdOutputObservation from openhands.events.serialization import event_to_dict def test_command_output_success_serialization(): # Test successful command obs = CmdOutputObservation( - command_id=1, command='ls', content='file1.txt\nfile2.txt', exit_code=0 + command='ls', + content='file1.txt\nfile2.txt', + metadata=CmdOutputMetadata(exit_code=0), ) serialized = event_to_dict(obs) assert serialized['success'] is True # Test failed command obs = CmdOutputObservation( - command_id=2, command='ls', content='No such file or directory', exit_code=1 + command='ls', + content='No such file or directory', + metadata=CmdOutputMetadata(exit_code=1), ) serialized = event_to_dict(obs) assert serialized['success'] is False diff --git a/tests/unit/test_is_stuck.py b/tests/unit/test_is_stuck.py index 4b9a4e820ee5..fae6b1038131 100644 --- a/tests/unit/test_is_stuck.py +++ b/tests/unit/test_is_stuck.py @@ -107,7 +107,7 @@ def test_history_too_short(self, stuck_detector: StuckDetector): cmd_action = CmdRunAction(command='ls') state.history.append(cmd_action) cmd_observation = CmdOutputObservation( - command_id=1, command='ls', content='file1.txt\nfile2.txt' + command='ls', content='file1.txt\nfile2.txt' ) # cmd_observation._cause = cmd_action._id state.history.append(cmd_observation) @@ -188,7 +188,7 @@ def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetect cmd_action_1 = CmdRunAction(command='ls') cmd_action_1._id = 1 state.history.append(cmd_action_1) - cmd_observation_1 = CmdOutputObservation(content='', command='ls', command_id=1) + cmd_observation_1 = CmdOutputObservation(content='', command='ls') cmd_observation_1._cause = cmd_action_1._id state.history.append(cmd_observation_1) # 4 events @@ -196,7 +196,7 @@ def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetect cmd_action_2 = CmdRunAction(command='ls') cmd_action_2._id = 2 state.history.append(cmd_action_2) - cmd_observation_2 = CmdOutputObservation(content='', command='ls', command_id=2) + cmd_observation_2 = CmdOutputObservation(content='', command='ls') cmd_observation_2._cause = cmd_action_2._id state.history.append(cmd_observation_2) # 6 events @@ -212,7 +212,7 @@ def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetect cmd_action_3 = CmdRunAction(command='ls') cmd_action_3._id = 3 state.history.append(cmd_action_3) - cmd_observation_3 = CmdOutputObservation(content='', command='ls', command_id=3) + cmd_observation_3 = CmdOutputObservation(content='', command='ls') cmd_observation_3._cause = cmd_action_3._id state.history.append(cmd_observation_3) # 10 events @@ -223,7 +223,7 @@ def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetect cmd_action_4 = CmdRunAction(command='ls') cmd_action_4._id = 4 state.history.append(cmd_action_4) - cmd_observation_4 = CmdOutputObservation(content='', command='ls', command_id=4) + cmd_observation_4 = CmdOutputObservation(content='', command='ls') cmd_observation_4._cause = cmd_action_4._id state.history.append(cmd_observation_4) # 12 events @@ -436,7 +436,7 @@ def test_is_stuck_repeating_action_observation_pattern( cmd_action_1 = CmdRunAction(command='ls') state.history.append(cmd_action_1) cmd_observation_1 = CmdOutputObservation( - command_id=1, command='ls', content='file1.txt\nfile2.txt' + command='ls', content='file1.txt\nfile2.txt' ) # cmd_observation_1._cause = cmd_action_1._id state.history.append(cmd_observation_1) @@ -452,7 +452,7 @@ def test_is_stuck_repeating_action_observation_pattern( cmd_action_2 = CmdRunAction(command='ls') state.history.append(cmd_action_2) cmd_observation_2 = CmdOutputObservation( - command_id=2, command='ls', content='file1.txt\nfile2.txt' + command='ls', content='file1.txt\nfile2.txt' ) # cmd_observation_2._cause = cmd_action_2._id state.history.append(cmd_observation_2) @@ -475,7 +475,7 @@ def test_is_stuck_repeating_action_observation_pattern( cmd_action_3 = CmdRunAction(command='ls') state.history.append(cmd_action_3) cmd_observation_3 = CmdOutputObservation( - command_id=3, command='ls', content='file1.txt\nfile2.txt' + command='ls', content='file1.txt\nfile2.txt' ) # cmd_observation_3._cause = cmd_action_3._id state.history.append(cmd_observation_3) @@ -506,7 +506,7 @@ def test_is_stuck_not_stuck(self, stuck_detector: StuckDetector): cmd_action_1 = CmdRunAction(command='ls') state.history.append(cmd_action_1) cmd_observation_1 = CmdOutputObservation( - command_id=cmd_action_1.id, command='ls', content='file1.txt\nfile2.txt' + command='ls', content='file1.txt\nfile2.txt' ) # cmd_observation_1._cause = cmd_action_1._id state.history.append(cmd_observation_1) @@ -521,9 +521,7 @@ def test_is_stuck_not_stuck(self, stuck_detector: StuckDetector): cmd_action_2 = CmdRunAction(command='pwd') state.history.append(cmd_action_2) - cmd_observation_2 = CmdOutputObservation( - command_id=2, command='pwd', content='/home/user' - ) + cmd_observation_2 = CmdOutputObservation(command='pwd', content='/home/user') # cmd_observation_2._cause = cmd_action_2._id state.history.append(cmd_observation_2) @@ -541,9 +539,7 @@ def test_is_stuck_not_stuck(self, stuck_detector: StuckDetector): cmd_action_3 = CmdRunAction(command='pwd') state.history.append(cmd_action_3) - cmd_observation_3 = CmdOutputObservation( - command_id=cmd_action_3.id, command='pwd', content='/home/user' - ) + cmd_observation_3 = CmdOutputObservation(command='pwd', content='/home/user') # cmd_observation_3._cause = cmd_action_3._id state.history.append(cmd_observation_3) @@ -590,7 +586,6 @@ def test_is_stuck_monologue(self, stuck_detector): # Add an observation event between the repeated message actions cmd_output_observation = CmdOutputObservation( content='OK, I was stuck, but no more.', - command_id=42, command='storybook', exit_code=0, ) diff --git a/tests/unit/test_observation_serialization.py b/tests/unit/test_observation_serialization.py index 67a95449b719..626ea2c14774 100644 --- a/tests/unit/test_observation_serialization.py +++ b/tests/unit/test_observation_serialization.py @@ -1,4 +1,5 @@ from openhands.events.observation import ( + CmdOutputMetadata, CmdOutputObservation, Observation, ) @@ -40,13 +41,65 @@ def serialization_deserialization( # Additional tests for various observation subclasses can be included here +def test_observation_event_props_serialization_deserialization(): + original_observation_dict = { + 'id': 42, + 'source': 'agent', + 'timestamp': '2021-08-01T12:00:00', + 'observation': 'run', + 'message': 'Command `ls -l` executed with exit code 0.', + 'extras': { + 'command': 'ls -l', + 'hidden': False, + 'metadata': { + 'exit_code': 0, + 'hostname': None, + 'pid': -1, + 'prefix': '', + 'py_interpreter_path': None, + 'suffix': '', + 'username': None, + 'working_dir': None, + }, + }, + 'content': 'foo.txt', + 'success': True, + } + serialization_deserialization(original_observation_dict, CmdOutputObservation) + + +def test_command_output_observation_serialization_deserialization(): + original_observation_dict = { + 'observation': 'run', + 'extras': { + 'command': 'ls -l', + 'hidden': False, + 'metadata': { + 'exit_code': 0, + 'hostname': None, + 'pid': -1, + 'prefix': '', + 'py_interpreter_path': None, + 'suffix': '', + 'username': None, + 'working_dir': None, + }, + }, + 'message': 'Command `ls -l` executed with exit code 0.', + 'content': 'foo.txt', + 'success': True, + } + serialization_deserialization(original_observation_dict, CmdOutputObservation) + + def test_success_field_serialization(): # Test success=True obs = CmdOutputObservation( content='Command succeeded', - exit_code=0, command='ls -l', - command_id=3, + metadata=CmdOutputMetadata( + exit_code=0, + ), ) serialized = event_to_dict(obs) assert serialized['success'] is True @@ -54,9 +107,42 @@ def test_success_field_serialization(): # Test success=False obs = CmdOutputObservation( content='No such file or directory', - exit_code=1, command='ls -l', - command_id=3, + metadata=CmdOutputMetadata( + exit_code=1, + ), ) serialized = event_to_dict(obs) assert serialized['success'] is False + + +def test_legacy_serialization(): + original_observation_dict = { + 'id': 42, + 'source': 'agent', + 'timestamp': '2021-08-01T12:00:00', + 'observation': 'run', + 'message': 'Command `ls -l` executed with exit code 0.', + 'extras': { + 'command': 'ls -l', + 'hidden': False, + 'exit_code': 0, + 'command_id': 3, + }, + 'content': 'foo.txt', + 'success': True, + } + event = event_from_dict(original_observation_dict) + assert isinstance(event, Observation) + assert isinstance(event, CmdOutputObservation) + assert event.metadata.exit_code == 0 + assert event.success is True + assert event.command == 'ls -l' + assert event.hidden is False + + event_dict = event_to_dict(event) + assert event_dict['success'] is True + assert event_dict['extras']['metadata']['exit_code'] == 0 + assert event_dict['extras']['metadata']['pid'] == 3 + assert event_dict['extras']['command'] == 'ls -l' + assert event_dict['extras']['hidden'] is False diff --git a/tests/unit/test_security.py b/tests/unit/test_security.py index 71afd04dbe61..0e2b3c8a0d07 100644 --- a/tests/unit/test_security.py +++ b/tests/unit/test_security.py @@ -368,7 +368,6 @@ async def test_unsafe_bash_command(temp_dir: str): 'blocking': False, 'command': 'ls', 'hidden': False, - 'keep_prompt': True, 'confirmation_state': ActionConfirmationStatus.CONFIRMED, }, ), @@ -495,9 +494,7 @@ def test_parse_action(action, expected_trace): ], ), ( - CmdOutputObservation( - content='cmd output content', command_id=1, command='ls' - ), + CmdOutputObservation(content='cmd output content', command='ls'), [ ToolOutput( metadata={},