diff --git a/.github/workflows/dummy-agent-test.yml b/.github/workflows/dummy-agent-test.yml
index bac3fd8bb8da..9772dfda4a88 100644
--- a/.github/workflows/dummy-agent-test.yml
+++ b/.github/workflows/dummy-agent-test.yml
@@ -36,6 +36,8 @@ jobs:
       - name: Set up Docker Buildx
         id: buildx
         uses: docker/setup-buildx-action@v3
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
       - name: Install poetry via pipx
         run: pipx install poetry
       - name: Set up Python
diff --git a/.github/workflows/eval-runner.yml b/.github/workflows/eval-runner.yml
index 68463a47ae06..6efc51acbfa7 100644
--- a/.github/workflows/eval-runner.yml
+++ b/.github/workflows/eval-runner.yml
@@ -29,6 +29,8 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
 
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
       - name: Install poetry via pipx
         run: pipx install poetry
 
diff --git a/.github/workflows/py-unit-tests-mac.yml b/.github/workflows/py-unit-tests-mac.yml
index 93897000c4a9..4d9e8bf5cbd4 100644
--- a/.github/workflows/py-unit-tests-mac.yml
+++ b/.github/workflows/py-unit-tests-mac.yml
@@ -31,6 +31,8 @@ jobs:
           key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
           restore-keys: |
             ${{ runner.os }}-poetry-
+      - name: Install tmux
+        run: brew install tmux
       - name: Install poetry via pipx
         run: pipx install poetry
       - name: Install Python dependencies using Poetry
diff --git a/.github/workflows/py-unit-tests.yml b/.github/workflows/py-unit-tests.yml
index ebf159b8c3f7..1e42ccc2d82c 100644
--- a/.github/workflows/py-unit-tests.yml
+++ b/.github/workflows/py-unit-tests.yml
@@ -30,6 +30,8 @@ jobs:
       - name: Set up Docker Buildx
         id: buildx
         uses: docker/setup-buildx-action@v3
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
       - name: Install poetry via pipx
         run: pipx install poetry
       - name: Set up Python
diff --git a/docs/static/img/backend_architecture.puml b/docs/static/img/backend_architecture.puml
index 9f5564a52a8d..c96651ab469b 100644
--- a/docs/static/img/backend_architecture.puml
+++ b/docs/static/img/backend_architecture.puml
@@ -123,7 +123,6 @@ class openhands.state.State {
   updated_info: List[Tuple[Action, Observation]]
 }
 class openhands.observation.CmdOutputObservation {
-  command_id: int
   command: str
   exit_code: int
   observation: str
diff --git a/evaluation/benchmarks/agent_bench/run_infer.py b/evaluation/benchmarks/agent_bench/run_infer.py
index cf1dc6bba97c..554ddc66488f 100644
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -137,7 +137,6 @@ def complete_runtime(
 
         action = CmdRunAction(
             command=f'chmod +x ./{script_name} && ./{script_name}',
-            keep_prompt=False,
         )
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
@@ -164,8 +163,7 @@ def complete_runtime(
             logger.info(f'Running get ground truth cmd: {script_name}')
 
             action = CmdRunAction(
-                command=f'chmod +x ./{script_name} && ./{script_name}',
-                keep_prompt=False,
+                command=f'chmod +x ./{script_name} && ./{script_name}'
             )
             logger.info(action, extra={'msg_type': 'ACTION'})
             obs = runtime.run_action(action)
diff --git a/evaluation/benchmarks/aider_bench/run_infer.py b/evaluation/benchmarks/aider_bench/run_infer.py
index f0e86f30380e..51179724e239 100644
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@@ -145,10 +145,7 @@ def complete_runtime(
         )
         logger.info(f'Running test file: {script_name}')
 
-    action = CmdRunAction(
-        command=f'python3 -m unittest {script_name}',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command=f'python3 -m unittest {script_name}')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
diff --git a/evaluation/benchmarks/biocoder/run_infer.py b/evaluation/benchmarks/biocoder/run_infer.py
index ba8eb4d17b20..dfc0eaf1f912 100644
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@@ -199,7 +199,7 @@ def complete_runtime(
     if obs.exit_code == 0:
         test_result['metadata']['1_copy_change_success'] = True
 
-        action = CmdRunAction(command=f'cat {generated_path}', keep_prompt=False)
+        action = CmdRunAction(command=f'cat {generated_path}')
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         assert obs.exit_code == 0
@@ -223,9 +223,7 @@ def complete_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
 
-    action = CmdRunAction(
-        command='cat /testing_files/results_biocoder.json', keep_prompt=False
-    )
+    action = CmdRunAction(command='cat /testing_files/results_biocoder.json')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     if obs.exit_code == 0:
diff --git a/evaluation/benchmarks/bird/README.md b/evaluation/benchmarks/bird/README.md
index 41874fe99f59..b13df87f22a3 100644
--- a/evaluation/benchmarks/bird/README.md
+++ b/evaluation/benchmarks/bird/README.md
@@ -127,7 +127,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
         "observation": "run",
         "content": "california_schools/california_schools.sqlite\r\n[(1.0,)]",
         "extras": {
-          "command_id": -1,
           "command": "python3 0.py",
           "exit_code": 0
         }
diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
index 45ddf582dc64..3254e4a2c791 100644
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -268,10 +268,7 @@ def initialize_runtime(
     runtime.copy_to(db_file, '/workspace')
 
     # Check the database is copied
-    action = CmdRunAction(
-        command='cd /workspace && ls -l',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command='cd /workspace && ls -l')
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
@@ -300,10 +297,7 @@ def complete_runtime(
     instance_id = instance.instance_id.replace('/', '__')
     path = os.path.join('/workspace', f'{instance_id}.py')
 
-    action = CmdRunAction(
-        command=f'cat {path}',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command=f'cat {path}')
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
 
diff --git a/evaluation/benchmarks/humanevalfix/README.md b/evaluation/benchmarks/humanevalfix/README.md
index 60dabef1f609..6d355e8093fd 100644
--- a/evaluation/benchmarks/humanevalfix/README.md
+++ b/evaluation/benchmarks/humanevalfix/README.md
@@ -71,7 +71,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
                 "observation": "run",
                 "content": "[File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2:    return number % 1.0 + 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10:    assert truncate_number(3.5) == 0.5\r\n11:    assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12:    assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)",
                 "extras": {
-                    "command_id": -1,
                     "command": "open Python__2.py",
                     "exit_code": 0
                 }
@@ -98,7 +97,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
                 "observation": "run",
                 "content": "> > [File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2:    return number % 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10:    assert truncate_number(3.5) == 0.5\r\n11:    assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12:    assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)\r\nFile updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.",
                 "extras": {
-                    "command_id": -1,
                     "command": "edit 2:2 <<EOF\n    return number % 1.0\nEOF",
                     "exit_code": 0
                 }
@@ -125,7 +123,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
                 "observation": "run",
                 "content": "",
                 "extras": {
-                    "command_id": -1,
                     "command": "python3 Python__2.py",
                     "exit_code": 0
                 }
diff --git a/evaluation/benchmarks/humanevalfix/run_infer.py b/evaluation/benchmarks/humanevalfix/run_infer.py
index 5ab5af818f90..ba802ddf9dfa 100644
--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@@ -171,9 +171,7 @@ def complete_runtime(
     num_workers = LANGUAGE_TO_NUM_WORKERS[language]
     python_imports = '\n'.join(IMPORT_HELPER[language])
 
-    action = CmdRunAction(
-        command=f'cat /workspace/{_get_instance_id(instance)}.py', keep_prompt=False
-    )
+    action = CmdRunAction(command=f'cat /workspace/{_get_instance_id(instance)}.py')
     obs = runtime.run_action(action)
     assert obs.exit_code == 0
 
diff --git a/evaluation/benchmarks/ml_bench/run_infer.py b/evaluation/benchmarks/ml_bench/run_infer.py
index 7f15476423c0..4e396b3c3fe1 100644
--- a/evaluation/benchmarks/ml_bench/run_infer.py
+++ b/evaluation/benchmarks/ml_bench/run_infer.py
@@ -163,7 +163,7 @@ def complete_runtime(
     eval_script = os.path.join(task_path, 'run.sh')
     logger.info(f'Running evaluation script: {eval_script}')
 
-    action = CmdRunAction(command=f'cat {eval_script}', keep_prompt=False)
+    action = CmdRunAction(command=f'cat {eval_script}')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     if obs.exit_code == 0:
diff --git a/evaluation/benchmarks/scienceagentbench/run_infer.py b/evaluation/benchmarks/scienceagentbench/run_infer.py
index ebe1b783cfed..2ca5c2b40323 100644
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@@ -121,10 +121,7 @@ def initialize_runtime(
     runtime.copy_to(dataset_dir, '/workspace/benchmark/datasets', recursive=True)
 
     # Check the dataset exists
-    action = CmdRunAction(
-        command='cd /workspace/benchmark/datasets && ls',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command='cd /workspace/benchmark/datasets && ls')
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
@@ -154,10 +151,7 @@ def complete_runtime(
 
     assert obs.exit_code == 0
 
-    action = CmdRunAction(
-        command=f'cat pred_programs/{instance.pred_program_name}',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command=f'cat pred_programs/{instance.pred_program_name}')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
 
diff --git a/evaluation/benchmarks/swe_bench/eval_infer.py b/evaluation/benchmarks/swe_bench/eval_infer.py
index c5d479dd50d5..7beacf344408 100644
--- a/evaluation/benchmarks/swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/swe_bench/eval_infer.py
@@ -98,6 +98,7 @@ def process_instance(
     metadata: EvalMetadata,
     reset_logger: bool = True,
     log_dir: str | None = None,
+    runtime_failure_count: int = 0,
 ) -> EvalOutput:
     """
     Evaluate agent performance on a SWE-bench problem instance.
@@ -146,6 +147,16 @@ def process_instance(
             metadata=metadata,
         )
 
+    # Increase resource_factor with increasing attempt_id
+    if runtime_failure_count > 0:
+        config.sandbox.remote_runtime_resource_factor = min(
+            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
+            4,  # hardcode maximum resource factor to 4
+        )
+        logger.warning(
+            f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
+        )
+
     runtime = create_runtime(config)
     call_async_from_sync(runtime.connect)
     # Get patch and save it to /tmp/patch.diff
@@ -177,7 +188,7 @@ def process_instance(
         "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
         "echo 'APPLY_PATCH_FAIL')))"
     )
-    action = CmdRunAction(command=exec_command, keep_prompt=False)
+    action = CmdRunAction(command=exec_command)
     action.timeout = 600
     obs = runtime.run_action(action)
     assert isinstance(obs, CmdOutputObservation)
@@ -200,9 +211,7 @@ def process_instance(
 
             # Run eval script in background and save output to log file
             log_file = '/tmp/eval_output.log'
-            action = CmdRunAction(
-                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
-            )
+            action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!')
             action.timeout = 60  # Short timeout just to get the process ID
             obs = runtime.run_action(action)
 
@@ -224,7 +233,7 @@ def process_instance(
                         instance['test_result']['report']['test_timeout'] = True
                         break
                     check_action = CmdRunAction(
-                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
+                        command=f'ps -p {pid} > /dev/null; echo $?'
                     )
                     check_action.timeout = 60
                     check_obs = runtime.run_action(check_action)
@@ -242,7 +251,7 @@ def process_instance(
                     time.sleep(30)  # Wait for 30 seconds before checking again
 
                 # Read the log file
-                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
+                cat_action = CmdRunAction(command=f'cat {log_file}')
                 cat_action.timeout = 300
                 cat_obs = runtime.run_action(cat_action)
 
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 61c045037bbb..4f0dff6508f4 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -282,6 +282,16 @@ def initialize_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
 
+    action = CmdRunAction(command='which python')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0 and 'testbed' in obs.content,
+        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
+    )
+
     logger.info('-' * 30)
     logger.info('END Runtime Initialization Fn')
     logger.info('-' * 30)
@@ -337,8 +347,7 @@ def complete_runtime(
     git_patch = None
     while n_retries < 5:
         action = CmdRunAction(
-            command=f'git diff --no-color --cached {instance["base_commit"]}',
-            keep_prompt=False,
+            command=f'git diff --no-color --cached {instance["base_commit"]}'
         )
         action.timeout = 600 + 100 * n_retries
         logger.info(action, extra={'msg_type': 'ACTION'})
@@ -385,7 +394,7 @@ def process_instance(
     if runtime_failure_count > 0:
         config.sandbox.remote_runtime_resource_factor = min(
             config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
-            2,  # hardcode maximum resource factor to 2
+            8,
         )
         logger.warning(
             f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
@@ -535,4 +544,5 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
         args.eval_num_workers,
         process_instance,
         timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
+        max_retries=5,
     )
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
index cec92e153d84..0dc9552a25ea 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
@@ -104,9 +104,9 @@ def _get_resolved(report):
     # Determine if this repo has a significant diff
     is_significant = diff >= threshold
     repo_color = 'red' if is_significant else 'yellow'
-    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
 
     print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
+    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
     print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
     if x_instances:
         print('  ' + str(x_instances))
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
index 8e9fc407d93b..97698feba3c7 100755
--- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
@@ -20,6 +20,13 @@
 print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')
 
 oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
+
+swebench_eval_file = args.oh_output_file.replace('.jsonl', '.swebench_eval.jsonl')
+if os.path.exists(swebench_eval_file):
+    eval_output_df = pd.read_json(swebench_eval_file, orient='records', lines=True)
+else:
+    eval_output_df = None
+
 # model name is the folder name of oh_output_file
 model_name = os.path.basename(os.path.dirname(args.oh_output_file))
 
@@ -50,7 +57,7 @@ def convert_history_to_str(history):
     return ret
 
 
-def write_row_to_md_file(row):
+def write_row_to_md_file(row, instance_id_to_test_result):
     if 'git_patch' in row:
         model_patch = row['git_patch']
     elif 'test_result' in row and 'git_patch' in row['test_result']:
@@ -58,8 +65,21 @@ def write_row_to_md_file(row):
     else:
         raise ValueError(f'Row {row} does not have a git_patch')
 
-    if 'report' in row:
-        resolved = row['report'].get('resolved', False)
+    test_output = None
+    if row['instance_id'] in instance_id_to_test_result:
+        report = instance_id_to_test_result[row['instance_id']].get('report', {})
+        resolved = report.get('resolved', False)
+        test_output = instance_id_to_test_result[row['instance_id']].get(
+            'test_output', None
+        )
+    elif 'report' in row and row['report'] is not None:
+        if not isinstance(row['report'], dict):
+            resolved = None
+            print(
+                f'ERROR: Report is not a dict, but a {type(row["report"])}. Row: {row}'
+            )
+        else:
+            resolved = row['report'].get('resolved', False)
     else:
         resolved = None
 
@@ -84,5 +104,18 @@ def write_row_to_md_file(row):
         f.write('## Model Patch\n')
         f.write(f'{process_git_patch(model_patch)}\n')
 
+        f.write('## Test Output\n')
+        f.write(str(test_output))
+
+
+instance_id_to_test_result = {}
+if eval_output_df is not None:
+    instance_id_to_test_result = (
+        eval_output_df[['instance_id', 'test_result']]
+        .set_index('instance_id')['test_result']
+        .to_dict()
+    )
 
-oh_format.progress_apply(write_row_to_md_file, axis=1)
+oh_format.progress_apply(
+    write_row_to_md_file, axis=1, instance_id_to_test_result=instance_id_to_test_result
+)
diff --git a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
index 662e640ca752..d9c5c540f24b 100644
--- a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
@@ -111,6 +111,11 @@ def apply_report(row):
         instance_id_to_status[row['instance_id']] = row['test_result']['report']
     df['report'] = df.apply(apply_report, axis=1)
 
+    report_is_dict = df['report'].apply(lambda x: isinstance(x, dict))
+    if not report_is_dict.all():
+        print(df[~report_is_dict])
+        raise ValueError(f'Report is not a dict, but a {type(row["report"])}')
+
     _n_instances = len(df)
     _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
     _n_unresolved = _n_instances - _n_resolved
diff --git a/evaluation/integration_tests/tests/t01_fix_simple_typo.py b/evaluation/integration_tests/tests/t01_fix_simple_typo.py
index 4cfa331df1b5..532d5d5b38fd 100644
--- a/evaluation/integration_tests/tests/t01_fix_simple_typo.py
+++ b/evaluation/integration_tests/tests/t01_fix_simple_typo.py
@@ -24,7 +24,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None:
     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
         # check if the file /workspace/bad.txt has been fixed
-        action = CmdRunAction(command='cat /workspace/bad.txt', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/bad.txt')
         obs = runtime.run_action(action)
         if obs.exit_code != 0:
             return TestResult(
diff --git a/evaluation/integration_tests/tests/t02_add_bash_hello.py b/evaluation/integration_tests/tests/t02_add_bash_hello.py
index ac82e89bac05..88384a87f248 100644
--- a/evaluation/integration_tests/tests/t02_add_bash_hello.py
+++ b/evaluation/integration_tests/tests/t02_add_bash_hello.py
@@ -10,14 +10,14 @@ class Test(BaseIntegrationTest):
 
     @classmethod
     def initialize_runtime(cls, runtime: Runtime) -> None:
-        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /workspace')
         obs = runtime.run_action(action)
         assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
 
     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
         # check if the file /workspace/hello.sh exists
-        action = CmdRunAction(command='cat /workspace/hello.sh', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/hello.sh')
         obs = runtime.run_action(action)
         if obs.exit_code != 0:
             return TestResult(
@@ -26,7 +26,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
             )
 
         # execute the script
-        action = CmdRunAction(command='bash /workspace/hello.sh', keep_prompt=False)
+        action = CmdRunAction(command='bash /workspace/hello.sh')
         obs = runtime.run_action(action)
         if obs.exit_code != 0:
             return TestResult(
diff --git a/evaluation/integration_tests/tests/t03_jupyter_write_file.py b/evaluation/integration_tests/tests/t03_jupyter_write_file.py
index e1ed6c27c4a6..2f88e1228b58 100644
--- a/evaluation/integration_tests/tests/t03_jupyter_write_file.py
+++ b/evaluation/integration_tests/tests/t03_jupyter_write_file.py
@@ -10,14 +10,14 @@ class Test(BaseIntegrationTest):
 
     @classmethod
     def initialize_runtime(cls, runtime: Runtime) -> None:
-        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /workspace')
         obs = runtime.run_action(action)
         assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
 
     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
         # check if the file /workspace/hello.sh exists
-        action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/test.txt')
         obs = runtime.run_action(action)
         if obs.exit_code != 0:
             return TestResult(
@@ -26,7 +26,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
             )
 
         # execute the script
-        action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/test.txt')
         obs = runtime.run_action(action)
 
         if obs.exit_code != 0:
diff --git a/evaluation/integration_tests/tests/t04_git_staging.py b/evaluation/integration_tests/tests/t04_git_staging.py
index aadb861203e7..5d9459ec4a40 100644
--- a/evaluation/integration_tests/tests/t04_git_staging.py
+++ b/evaluation/integration_tests/tests/t04_git_staging.py
@@ -10,31 +10,29 @@ class Test(BaseIntegrationTest):
 
     @classmethod
     def initialize_runtime(cls, runtime: Runtime) -> None:
-        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /workspace')
         obs = runtime.run_action(action)
         assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
 
         # git init
-        action = CmdRunAction(command='git init', keep_prompt=False)
+        action = CmdRunAction(command='git init')
         obs = runtime.run_action(action)
         assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
 
         # create README.md
-        action = CmdRunAction(
-            command='echo \'print("hello world")\' > hello.py', keep_prompt=False
-        )
+        action = CmdRunAction(command='echo \'print("hello world")\' > hello.py')
         obs = runtime.run_action(action)
         assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
 
         # git add README.md
-        action = CmdRunAction(command='git add hello.py', keep_prompt=False)
+        action = CmdRunAction(command='git add hello.py')
         obs = runtime.run_action(action)
         assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
 
     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
         # check if the file /workspace/hello.py exists
-        action = CmdRunAction(command='cat /workspace/hello.py', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/hello.py')
         obs = runtime.run_action(action)
         if obs.exit_code != 0:
             return TestResult(
@@ -43,7 +41,7 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
             )
 
         # check if the staging area is empty
-        action = CmdRunAction(command='git status', keep_prompt=False)
+        action = CmdRunAction(command='git status')
         obs = runtime.run_action(action)
         if obs.exit_code != 0:
             return TestResult(
diff --git a/evaluation/integration_tests/tests/t05_simple_browsing.py b/evaluation/integration_tests/tests/t05_simple_browsing.py
index 96bb47875aec..8542e50d80a3 100644
--- a/evaluation/integration_tests/tests/t05_simple_browsing.py
+++ b/evaluation/integration_tests/tests/t05_simple_browsing.py
@@ -83,11 +83,11 @@ class Test(BaseIntegrationTest):
 
     @classmethod
     def initialize_runtime(cls, runtime: Runtime) -> None:
-        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /workspace')
         obs = runtime.run_action(action)
         assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
 
-        action = CmdRunAction(command='mkdir -p /tmp/server', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /tmp/server')
         obs = runtime.run_action(action)
         assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')
 
@@ -101,8 +101,7 @@ def initialize_runtime(cls, runtime: Runtime) -> None:
 
         # create README.md
         action = CmdRunAction(
-            command='cd /tmp/server && nohup python3 -m http.server 8000 &',
-            keep_prompt=False,
+            command='cd /tmp/server && nohup python3 -m http.server 8000 &'
         )
         obs = runtime.run_action(action)
 
diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts
index e235218df5af..b19fc92383b7 100644
--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@@ -16,6 +16,7 @@ import {
   StatusMessage,
 } from "#/types/message";
 import { handleObservationMessage } from "./observations";
+import { appendInput } from "#/state/command-slice";
 
 const messageActions = {
   [ActionType.BROWSE]: (message: ActionMessage) => {
@@ -62,6 +63,10 @@ export function handleActionMessage(message: ActionMessage) {
     return;
   }
 
+  if (message.action === ActionType.RUN) {
+    store.dispatch(appendInput(message.args.command));
+  }
+
   if ("args" in message && "security_risk" in message.args) {
     store.dispatch(appendSecurityAnalyzerInput(message));
   }
diff --git a/frontend/src/services/observations.ts b/frontend/src/services/observations.ts
index f2f1de60c45a..1ec6a784d910 100644
--- a/frontend/src/services/observations.ts
+++ b/frontend/src/services/observations.ts
@@ -80,8 +80,7 @@ export function handleObservationMessage(message: ObservationMessage) {
             observation: "run" as const,
             extras: {
               command: String(message.extras.command || ""),
-              command_id: Number(message.extras.command_id || 0),
-              exit_code: Number(message.extras.exit_code || 0),
+              metadata: message.extras.metadata,
               hidden: Boolean(message.extras.hidden),
             },
           }),
diff --git a/frontend/src/state/chat-slice.ts b/frontend/src/state/chat-slice.ts
index aefcaf6972ea..87ce8e52dee4 100644
--- a/frontend/src/state/chat-slice.ts
+++ b/frontend/src/state/chat-slice.ts
@@ -93,7 +93,7 @@ export const chatSlice = createSlice({
       const translationID = `ACTION_MESSAGE$${actionID.toUpperCase()}`;
       let text = "";
       if (actionID === "run") {
-        text = `\`${action.payload.args.command}\``;
+        text = `Command:\n\`${action.payload.args.command}\``;
       } else if (actionID === "run_ipython") {
         text = `\`\`\`\n${action.payload.args.code}\n\`\`\``;
       } else if (actionID === "write") {
@@ -144,7 +144,7 @@ export const chatSlice = createSlice({
       // Set success property based on observation type
       if (observationID === "run") {
         const commandObs = observation.payload as CommandObservation;
-        causeMessage.success = commandObs.extras.exit_code === 0;
+        causeMessage.success = commandObs.extras.metadata.exit_code === 0;
       } else if (observationID === "run_ipython") {
         // For IPython, we consider it successful if there's no error message
         const ipythonObs = observation.payload as IPythonObservation;
@@ -158,7 +158,9 @@ export const chatSlice = createSlice({
         if (content.length > MAX_CONTENT_LENGTH) {
           content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
         }
-        content = `\`\`\`\n${content}\n\`\`\``;
+        content = `${
+          causeMessage.content
+        }\n\nOutput:\n\`\`\`\n${content.trim() || "[Command finished execution with no output]"}\n\`\`\``;
         causeMessage.content = content; // Observation content includes the action
       } else if (observationID === "read" || observationID === "edit") {
         const { content } = observation.payload;
diff --git a/frontend/src/types/core/observations.ts b/frontend/src/types/core/observations.ts
index 9a020d93654e..8f815b61f5a4 100644
--- a/frontend/src/types/core/observations.ts
+++ b/frontend/src/types/core/observations.ts
@@ -13,9 +13,8 @@ export interface CommandObservation extends OpenHandsObservationEvent<"run"> {
   source: "agent";
   extras: {
     command: string;
-    command_id: number;
-    exit_code: number;
     hidden?: boolean;
+    metadata: Record<string, unknown>;
   };
 }
 
diff --git a/frontend/src/types/message.tsx b/frontend/src/types/message.tsx
index f7ef68e3fa13..eb1e1bfcf9a5 100644
--- a/frontend/src/types/message.tsx
+++ b/frontend/src/types/message.tsx
@@ -27,8 +27,11 @@ export interface ObservationMessage {
   // The observed data
   content: string;
 
-  // Additional structured data
-  extras: Record<string, string>;
+  extras: {
+    metadata: Record<string, unknown>;
+    error_id: string;
+    [key: string]: string | Record<string, unknown>;
+  };
 
   // A friendly message that can be put in the chat log
   message: string;
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 904af0450dfd..f811723dfbd5 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -277,7 +277,9 @@ def get_observation_message(
                 )
             else:
                 text = truncate_content(
-                    obs.content + obs.interpreter_details, max_message_chars
+                    obs.content
+                    + f'\n[Python Interpreter: {obs.metadata.py_interpreter_path}]',
+                    max_message_chars,
                 )
             text += f'\n[Command finished with exit code {obs.exit_code}]'
             message = Message(role='user', content=[TextContent(text=text)])
diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
index 5038f95934ea..d06cf01cd3e4 100644
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -31,8 +31,7 @@
 
 _BASH_DESCRIPTION = """Execute a bash command in the terminal.
 * Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
-* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process.
-* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background.
+* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command like `C-c` (Ctrl+C) to interrupt the process.
 """
 
 CmdRunTool = ChatCompletionToolParam(
@@ -45,7 +44,7 @@
             'properties': {
                 'command': {
                     'type': 'string',
-                    'description': 'The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.',
+                    'description': 'The bash command to execute. Can be empty string to view additional logs when previous exit code is `-1`. Can be `C-c` (Ctrl+C) to interrupt the currently running process.',
                 },
             },
             'required': ['command'],
diff --git a/openhands/agenthub/dummy_agent/agent.py b/openhands/agenthub/dummy_agent/agent.py
index 06abacab3eb7..f7a654bf75b4 100644
--- a/openhands/agenthub/dummy_agent/agent.py
+++ b/openhands/agenthub/dummy_agent/agent.py
@@ -18,6 +18,7 @@
 from openhands.events.observation import (
     AgentStateChangedObservation,
     BrowserOutputObservation,
+    CmdOutputMetadata,
     CmdOutputObservation,
     FileReadObservation,
     FileWriteObservation,
@@ -54,11 +55,7 @@ def __init__(self, llm: LLM, config: AgentConfig):
             },
             {
                 'action': CmdRunAction(command='echo "foo"'),
-                'observations': [
-                    CmdOutputObservation(
-                        'foo', command_id=-1, command='echo "foo"', exit_code=0
-                    )
-                ],
+                'observations': [CmdOutputObservation('foo', command='echo "foo"')],
             },
             {
                 'action': FileWriteAction(
@@ -81,9 +78,8 @@ def __init__(self, llm: LLM, config: AgentConfig):
                 'observations': [
                     CmdOutputObservation(
                         'bash: hello.sh: No such file or directory',
-                        command_id=-1,
                         command='bash workspace/hello.sh',
-                        exit_code=127,
+                        metadata=CmdOutputMetadata(exit_code=127),
                     )
                 ],
             },
@@ -152,8 +148,6 @@ def step(self, state: State) -> Action:
                         obs.pop('timestamp', None)
                         obs.pop('cause', None)
                         obs.pop('source', None)
-                        if 'extras' in obs:
-                            obs['extras'].pop('command_id', None)
 
                     if hist_obs != expected_obs:
                         print(
diff --git a/openhands/controller/stuck.py b/openhands/controller/stuck.py
index dbb18a9b4f2c..1bf7e410eb23 100644
--- a/openhands/controller/stuck.py
+++ b/openhands/controller/stuck.py
@@ -5,7 +5,7 @@
 from openhands.events.action.empty import NullAction
 from openhands.events.action.message import MessageAction
 from openhands.events.event import Event, EventSource
-from openhands.events.observation.commands import (
+from openhands.events.observation import (
     CmdOutputObservation,
     IPythonRunCellObservation,
 )
diff --git a/openhands/core/utils/json.py b/openhands/core/utils/json.py
index c0b22740bec4..e75d028f037e 100644
--- a/openhands/core/utils/json.py
+++ b/openhands/core/utils/json.py
@@ -6,6 +6,7 @@
 
 from openhands.core.exceptions import LLMResponseError
 from openhands.events.event import Event
+from openhands.events.observation import CmdOutputMetadata
 from openhands.events.serialization import event_to_dict
 from openhands.llm.metrics import Metrics
 
@@ -20,6 +21,8 @@ def my_default_encoder(obj):
         return obj.get()
     if isinstance(obj, ModelResponse):
         return obj.model_dump()
+    if isinstance(obj, CmdOutputMetadata):
+        return obj.model_dump()
     return json.JSONEncoder().default(obj)
 
 
diff --git a/openhands/events/action/commands.py b/openhands/events/action/commands.py
index 83dd19f9d161..67b586f86ad9 100644
--- a/openhands/events/action/commands.py
+++ b/openhands/events/action/commands.py
@@ -12,19 +12,11 @@
 @dataclass
 class CmdRunAction(Action):
     command: str
+    # When `command` is empty, it will be used to print the current tmux window
     thought: str = ''
     blocking: bool = False
-    # If False, the command will be run in a non-blocking / interactive way
-    # The partial command outputs will be returned as output observation.
-    # If True, the command will be run for max .timeout seconds.
-    keep_prompt: bool = True
-    # if True, the command prompt will be kept in the command output observation
-    # Example of command output:
-    # root@sandbox:~# ls
-    # file1.txt
-    # file2.txt
-    # root@sandbox:~# <-- this is the command prompt
-
+    # If blocking is True, the command will be run in a blocking manner.
+    # e.g., it will NOT return early due to soft timeout.
     hidden: bool = False
     action: str = ActionType.RUN
     runnable: ClassVar[bool] = True
diff --git a/openhands/events/action/files.py b/openhands/events/action/files.py
index c903e4f00f7d..de2db691f146 100644
--- a/openhands/events/action/files.py
+++ b/openhands/events/action/files.py
@@ -48,6 +48,15 @@ class FileWriteAction(Action):
     def message(self) -> str:
         return f'Writing file: {self.path}'
 
+    def __repr__(self) -> str:
+        return (
+            f'**FileWriteAction**\n'
+            f'Path: {self.path}\n'
+            f'Range: [L{self.start}:L{self.end}]\n'
+            f'Thought: {self.thought}\n'
+            f'Content:\n```\n{self.content}\n```\n'
+        )
+
 
 @dataclass
 class FileEditAction(Action):
diff --git a/openhands/events/observation/__init__.py b/openhands/events/observation/__init__.py
index 28525b09aabb..69a1853b268c 100644
--- a/openhands/events/observation/__init__.py
+++ b/openhands/events/observation/__init__.py
@@ -1,6 +1,7 @@
 from openhands.events.observation.agent import AgentStateChangedObservation
 from openhands.events.observation.browse import BrowserOutputObservation
 from openhands.events.observation.commands import (
+    CmdOutputMetadata,
     CmdOutputObservation,
     IPythonRunCellObservation,
 )
@@ -20,6 +21,7 @@
     'Observation',
     'NullObservation',
     'CmdOutputObservation',
+    'CmdOutputMetadata',
     'IPythonRunCellObservation',
     'BrowserOutputObservation',
     'FileReadObservation',
diff --git a/openhands/events/observation/commands.py b/openhands/events/observation/commands.py
index b522b5c47283..31b4472f7ee7 100644
--- a/openhands/events/observation/commands.py
+++ b/openhands/events/observation/commands.py
@@ -1,19 +1,136 @@
-from dataclasses import dataclass
+import json
+import re
+import traceback
+from dataclasses import dataclass, field
+from typing import Self
 
+from pydantic import BaseModel
+
+from openhands.core.logger import openhands_logger as logger
 from openhands.core.schema import ObservationType
 from openhands.events.observation.observation import Observation
 
+CMD_OUTPUT_PS1_BEGIN = '\n###PS1JSON###\n'
+CMD_OUTPUT_PS1_END = '\n###PS1END###'
+CMD_OUTPUT_METADATA_PS1_REGEX = re.compile(
+    f'^{CMD_OUTPUT_PS1_BEGIN.strip()}(.*?){CMD_OUTPUT_PS1_END.strip()}',
+    re.DOTALL | re.MULTILINE,
+)
+
+
+class CmdOutputMetadata(BaseModel):
+    """Additional metadata captured from PS1"""
+
+    exit_code: int = -1
+    pid: int = -1
+    username: str | None = None
+    hostname: str | None = None
+    working_dir: str | None = None
+    py_interpreter_path: str | None = None
+    prefix: str = ''  # Prefix to add to command output
+    suffix: str = ''  # Suffix to add to command output
+
+    @classmethod
+    def to_ps1_prompt(cls) -> str:
+        """Convert the required metadata into a PS1 prompt."""
+        prompt = CMD_OUTPUT_PS1_BEGIN
+        json_str = json.dumps(
+            {
+                'pid': '$!',
+                'exit_code': '$?',
+                'username': r'\u',
+                'hostname': r'\h',
+                'working_dir': r'$(pwd)',
+                'py_interpreter_path': r'$(which python 2>/dev/null || echo "")',
+            },
+            indent=2,
+        )
+        # Make sure we escape double quotes in the JSON string
+        # So that PS1 will keep them as part of the output
+        prompt += json_str.replace('"', r'\"')
+        prompt += CMD_OUTPUT_PS1_END + '\n'  # Ensure there's a newline at the end
+        return prompt
+
+    @classmethod
+    def matches_ps1_metadata(cls, string: str) -> list[re.Match[str]]:
+        matches = []
+        for match in CMD_OUTPUT_METADATA_PS1_REGEX.finditer(string):
+            try:
+                json.loads(match.group(1).strip())  # Try to parse as JSON
+                matches.append(match)
+            except json.JSONDecodeError:
+                logger.warning(
+                    f'Failed to parse PS1 metadata: {match.group(1)}. Skipping.'
+                    + traceback.format_exc()
+                )
+                continue  # Skip if not valid JSON
+        return matches
+
+    @classmethod
+    def from_ps1_match(cls, match: re.Match[str]) -> Self:
+        """Extract the required metadata from a PS1 prompt."""
+        metadata = json.loads(match.group(1))
+        # Create a copy of metadata to avoid modifying the original
+        processed = metadata.copy()
+        # Convert numeric fields
+        if 'pid' in metadata:
+            try:
+                processed['pid'] = int(float(str(metadata['pid'])))
+            except (ValueError, TypeError):
+                processed['pid'] = -1
+        if 'exit_code' in metadata:
+            try:
+                processed['exit_code'] = int(float(str(metadata['exit_code'])))
+            except (ValueError, TypeError):
+                logger.warning(
+                    f'Failed to parse exit code: {metadata["exit_code"]}. Setting to -1.'
+                )
+                processed['exit_code'] = -1
+        return cls(**processed)
+
 
 @dataclass
 class CmdOutputObservation(Observation):
     """This data class represents the output of a command."""
 
-    command_id: int
     command: str
-    exit_code: int = 0
-    hidden: bool = False
     observation: str = ObservationType.RUN
-    interpreter_details: str = ''
+    # Additional metadata captured from PS1
+    metadata: CmdOutputMetadata = field(default_factory=CmdOutputMetadata)
+    # Whether the command output should be hidden from the user
+    hidden: bool = False
+
+    def __init__(
+        self,
+        content: str,
+        command: str,
+        observation: str = ObservationType.RUN,
+        metadata: dict | CmdOutputMetadata | None = None,
+        hidden: bool = False,
+        **kwargs,
+    ):
+        super().__init__(content)
+        self.command = command
+        self.observation = observation
+        self.hidden = hidden
+        if isinstance(metadata, dict):
+            self.metadata = CmdOutputMetadata(**metadata)
+        else:
+            self.metadata = metadata or CmdOutputMetadata()
+
+        # Handle legacy attribute
+        if 'exit_code' in kwargs:
+            self.metadata.exit_code = kwargs['exit_code']
+        if 'command_id' in kwargs:
+            self.metadata.pid = kwargs['command_id']
+
+    @property
+    def command_id(self) -> int:
+        return self.metadata.pid
+
+    @property
+    def exit_code(self) -> int:
+        return self.metadata.exit_code
 
     @property
     def error(self) -> bool:
@@ -28,7 +145,21 @@ def success(self) -> bool:
         return not self.error
 
     def __str__(self) -> str:
-        return f'**CmdOutputObservation (source={self.source}, exit code={self.exit_code})**\n{self.content}'
+        return (
+            f'**CmdOutputObservation (source={self.source}, exit code={self.exit_code}, '
+            f'metadata={json.dumps(self.metadata.model_dump(), indent=2)})**\n'
+            '--BEGIN AGENT OBSERVATION--\n'
+            f'{self._to_agent_observation()}\n'
+            '--END AGENT OBSERVATION--'
+        )
+
+    def _to_agent_observation(self) -> str:
+        ret = f'{self.metadata.prefix}{self.content}{self.metadata.suffix}'
+        if self.metadata.working_dir:
+            ret += f'\n[Current working directory: {self.metadata.working_dir}]'
+        if self.metadata.py_interpreter_path:
+            ret += f'\n[Python interpreter: {self.metadata.py_interpreter_path}]'
+        return ret
 
 
 @dataclass
diff --git a/openhands/events/observation/files.py b/openhands/events/observation/files.py
index b16804607d94..b939988fcebb 100644
--- a/openhands/events/observation/files.py
+++ b/openhands/events/observation/files.py
@@ -18,6 +18,9 @@ class FileReadObservation(Observation):
     def message(self) -> str:
         return f'I read the file {self.path}.'
 
+    def __str__(self) -> str:
+        return f'[Read from {self.path} is successful.]\n' f'{self.content}'
+
 
 @dataclass
 class FileWriteObservation(Observation):
@@ -30,6 +33,9 @@ class FileWriteObservation(Observation):
     def message(self) -> str:
         return f'I wrote to the file {self.path}.'
 
+    def __str__(self) -> str:
+        return f'[Write to {self.path} is successful.]\n' f'{self.content}'
+
 
 @dataclass
 class FileEditObservation(Observation):
diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py
index 208400fe9213..90945c1d4dfd 100644
--- a/openhands/events/serialization/action.py
+++ b/openhands/events/serialization/action.py
@@ -67,6 +67,10 @@ def action_from_dict(action: dict) -> Action:
     if 'images_urls' in args:
         args['image_urls'] = args.pop('images_urls')
 
+    # keep_prompt has been deprecated in https://github.com/All-Hands-AI/OpenHands/pull/4881
+    if 'keep_prompt' in args:
+        args.pop('keep_prompt')
+
     try:
         decoded_action = action_class(**args)
         if 'timeout' in action:
diff --git a/openhands/events/serialization/event.py b/openhands/events/serialization/event.py
index 6ee82a1cc81c..cf2637ab667d 100644
--- a/openhands/events/serialization/event.py
+++ b/openhands/events/serialization/event.py
@@ -1,6 +1,8 @@
 from dataclasses import asdict
 from datetime import datetime
 
+from pydantic import BaseModel
+
 from openhands.events import Event, EventSource
 from openhands.events.observation.observation import Observation
 from openhands.events.serialization.action import action_from_dict
@@ -56,6 +58,12 @@ def event_from_dict(data) -> 'Event':
     return evt
 
 
+def _convert_pydantic_to_dict(obj: BaseModel | dict) -> dict:
+    if isinstance(obj, BaseModel):
+        return obj.model_dump()
+    return obj
+
+
 def event_to_dict(event: 'Event') -> dict:
     props = asdict(event)
     d = {}
@@ -82,7 +90,11 @@ def event_to_dict(event: 'Event') -> dict:
             d['timeout'] = event.timeout
     elif 'observation' in d:
         d['content'] = props.pop('content', '')
-        d['extras'] = props
+
+        # props is a dict whose values can include a complex object like an instance of a BaseModel subclass
+        # such as CmdOutputMetadata
+        # we serialize it along with the rest
+        d['extras'] = {k: _convert_pydantic_to_dict(v) for k, v in props.items()}
         # Include success field for CmdOutputObservation
         if hasattr(event, 'success'):
             d['success'] = event.success
@@ -109,7 +121,6 @@ def event_to_memory(event: 'Event', max_message_chars: int) -> dict:
     # runnable actions have some extra fields used in the BE/FE, which should not be sent to the LLM
     if 'args' in d:
         d['args'].pop('blocking', None)
-        d['args'].pop('keep_prompt', None)
         d['args'].pop('confirmation_state', None)
 
     if 'extras' in d:
diff --git a/openhands/events/serialization/observation.py b/openhands/events/serialization/observation.py
index d9d8dc51adaf..961c155e8916 100644
--- a/openhands/events/serialization/observation.py
+++ b/openhands/events/serialization/observation.py
@@ -1,6 +1,9 @@
+import copy
+
 from openhands.events.observation.agent import AgentStateChangedObservation
 from openhands.events.observation.browse import BrowserOutputObservation
 from openhands.events.observation.commands import (
+    CmdOutputMetadata,
     CmdOutputObservation,
     IPythonRunCellObservation,
 )
@@ -37,6 +40,26 @@
 }
 
 
+def _update_cmd_output_metadata(
+    metadata: dict | CmdOutputMetadata | None, **kwargs
+) -> dict | CmdOutputMetadata:
+    """Update the metadata of a CmdOutputObservation.
+
+    If metadata is None, create a new CmdOutputMetadata instance.
+    If metadata is a dict, update the dict.
+    If metadata is a CmdOutputMetadata instance, update the instance.
+    """
+    if metadata is None:
+        return CmdOutputMetadata(**kwargs)
+
+    if isinstance(metadata, dict):
+        metadata.update(**kwargs)
+    elif isinstance(metadata, CmdOutputMetadata):
+        for key, value in kwargs.items():
+            setattr(metadata, key, value)
+    return metadata
+
+
 def observation_from_dict(observation: dict) -> Observation:
     observation = observation.copy()
     if 'observation' not in observation:
@@ -49,6 +72,24 @@ def observation_from_dict(observation: dict) -> Observation:
     observation.pop('observation')
     observation.pop('message', None)
     content = observation.pop('content', '')
-    extras = observation.pop('extras', {})
+    extras = copy.deepcopy(observation.pop('extras', {}))
+
+    # Handle legacy attributes for CmdOutputObservation
+    if 'exit_code' in extras:
+        extras['metadata'] = _update_cmd_output_metadata(
+            extras.get('metadata', None), exit_code=extras.pop('exit_code')
+        )
+    if 'command_id' in extras:
+        extras['metadata'] = _update_cmd_output_metadata(
+            extras.get('metadata', None), pid=extras.pop('command_id')
+        )
+    # convert metadata to CmdOutputMetadata if it is a dict
+    if observation_class is CmdOutputObservation:
+        if 'metadata' in extras and isinstance(extras['metadata'], dict):
+            extras['metadata'] = CmdOutputMetadata(**extras['metadata'])
+        elif 'metadata' in extras and isinstance(extras['metadata'], CmdOutputMetadata):
+            pass
+        else:
+            extras['metadata'] = CmdOutputMetadata()
 
     return observation_class(content=content, **extras)
diff --git a/openhands/events/utils.py b/openhands/events/utils.py
index 6c8cc415f675..bf710edcd7bd 100644
--- a/openhands/events/utils.py
+++ b/openhands/events/utils.py
@@ -2,9 +2,11 @@
 from openhands.events.action.action import Action
 from openhands.events.action.empty import NullAction
 from openhands.events.event import Event
-from openhands.events.observation.commands import CmdOutputObservation
-from openhands.events.observation.empty import NullObservation
-from openhands.events.observation.observation import Observation
+from openhands.events.observation import (
+    CmdOutputObservation,
+    NullObservation,
+    Observation,
+)
 
 
 def get_pairs_from_events(events: list[Event]) -> list[tuple[Action, Observation]]:
diff --git a/openhands/resolver/resolve_issue.py b/openhands/resolver/resolve_issue.py
index ed3bbf11178e..7617410ef429 100644
--- a/openhands/resolver/resolve_issue.py
+++ b/openhands/resolver/resolve_issue.py
@@ -122,10 +122,7 @@ async def complete_runtime(
     n_retries = 0
     git_patch = None
     while n_retries < 5:
-        action = CmdRunAction(
-            command=f'git diff --no-color --cached {base_commit}',
-            keep_prompt=False,
-        )
+        action = CmdRunAction(command=f'git diff --no-color --cached {base_commit}')
         action.timeout = 600 + 100 * n_retries
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
index 4f7cf3ee38a3..7c83a2f0f411 100644
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -95,31 +95,34 @@ def __init__(
         browsergym_eval_env: str | None,
     ) -> None:
         self.plugins_to_load = plugins_to_load
-        self._initial_pwd = work_dir
+        self._initial_cwd = work_dir
         self.username = username
         self.user_id = user_id
         _updated_user_id = init_user_and_working_directory(
-            username=username, user_id=self.user_id, initial_pwd=work_dir
+            username=username, user_id=self.user_id, initial_cwd=work_dir
         )
         if _updated_user_id is not None:
             self.user_id = _updated_user_id
 
-        self.bash_session = BashSession(
-            work_dir=work_dir,
-            username=username,
-        )
-
+        self.bash_session: BashSession | None = None
         self.lock = asyncio.Lock()
         self.plugins: dict[str, Plugin] = {}
         self.browser = BrowserEnv(browsergym_eval_env)
         self.start_time = time.time()
         self.last_execution_time = self.start_time
+        self._initialized = False
 
     @property
-    def initial_pwd(self):
-        return self._initial_pwd
+    def initial_cwd(self):
+        return self._initial_cwd
 
     async def ainit(self):
+        # bash needs to be initialized first
+        self.bash_session = BashSession(
+            work_dir=self._initial_cwd,
+            username=self.username,
+        )
+        self.bash_session.initialize()
         await wait_all(
             (self._init_plugin(plugin) for plugin in self.plugins_to_load),
             timeout=30,
@@ -138,8 +141,14 @@ async def ainit(self):
 
         await self._init_bash_commands()
         logger.debug('Runtime client initialized.')
+        self._initialized = True
+
+    @property
+    def initialized(self) -> bool:
+        return self._initialized
 
     async def _init_plugin(self, plugin: Plugin):
+        assert self.bash_session is not None
         await plugin.initialize(self.username)
         self.plugins[plugin.name] = plugin
         logger.debug(f'Initializing plugin: {plugin.name}')
@@ -147,7 +156,7 @@ async def _init_plugin(self, plugin: Plugin):
         if isinstance(plugin, JupyterPlugin):
             await self.run_ipython(
                 IPythonRunCellAction(
-                    code=f'import os; os.chdir("{self.bash_session.pwd}")'
+                    code=f'import os; os.chdir("{self.bash_session.cwd}")'
                 )
             )
 
@@ -177,30 +186,32 @@ async def run_action(self, action) -> Observation:
     async def run(
         self, action: CmdRunAction
     ) -> CmdOutputObservation | ErrorObservation:
-        obs = await call_sync_from_async(self.bash_session.run, action)
+        assert self.bash_session is not None
+        obs = await call_sync_from_async(self.bash_session.execute, action)
         return obs
 
     async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
+        assert self.bash_session is not None
         if 'jupyter' in self.plugins:
             _jupyter_plugin: JupyterPlugin = self.plugins['jupyter']  # type: ignore
             # This is used to make AgentSkills in Jupyter aware of the
             # current working directory in Bash
-            jupyter_pwd = getattr(self, '_jupyter_pwd', None)
-            if self.bash_session.pwd != jupyter_pwd:
+            jupyter_cwd = getattr(self, '_jupyter_cwd', None)
+            if self.bash_session.cwd != jupyter_cwd:
                 logger.debug(
-                    f'{self.bash_session.pwd} != {jupyter_pwd} -> reset Jupyter PWD'
+                    f'{self.bash_session.cwd} != {jupyter_cwd} -> reset Jupyter PWD'
                 )
-                reset_jupyter_pwd_code = (
-                    f'import os; os.chdir("{self.bash_session.pwd}")'
+                reset_jupyter_cwd_code = (
+                    f'import os; os.chdir("{self.bash_session.cwd}")'
                 )
-                _aux_action = IPythonRunCellAction(code=reset_jupyter_pwd_code)
+                _aux_action = IPythonRunCellAction(code=reset_jupyter_cwd_code)
                 _reset_obs: IPythonRunCellObservation = await _jupyter_plugin.run(
                     _aux_action
                 )
                 logger.debug(
-                    f'Changed working directory in IPython to: {self.bash_session.pwd}. Output: {_reset_obs}'
+                    f'Changed working directory in IPython to: {self.bash_session.cwd}. Output: {_reset_obs}'
                 )
-                self._jupyter_pwd = self.bash_session.pwd
+                self._jupyter_cwd = self.bash_session.cwd
 
             obs: IPythonRunCellObservation = await _jupyter_plugin.run(action)
             obs.content = obs.content.rstrip()
@@ -266,7 +277,7 @@ async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
 
             if action.include_extra:
                 obs.content += (
-                    f'\n[Jupyter current working directory: {self.bash_session.pwd}]'
+                    f'\n[Jupyter current working directory: {self.bash_session.cwd}]'
                 )
                 obs.content += f'\n[Jupyter Python interpreter: {_jupyter_plugin.python_interpreter_path}]'
             return obs
@@ -282,6 +293,7 @@ def _resolve_path(self, path: str, working_dir: str) -> str:
         return str(filepath)
 
     async def read(self, action: FileReadAction) -> Observation:
+        assert self.bash_session is not None
         if action.impl_source == FileReadSource.OH_ACI:
             return await self.run_ipython(
                 IPythonRunCellAction(
@@ -292,7 +304,7 @@ async def read(self, action: FileReadAction) -> Observation:
 
         # NOTE: the client code is running inside the sandbox,
         # so there's no need to check permission
-        working_dir = self.bash_session.workdir
+        working_dir = self.bash_session.cwd
         filepath = self._resolve_path(action.path, working_dir)
         try:
             if filepath.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
@@ -339,7 +351,8 @@ async def read(self, action: FileReadAction) -> Observation:
         return FileReadObservation(path=filepath, content=code_view)
 
     async def write(self, action: FileWriteAction) -> Observation:
-        working_dir = self.bash_session.workdir
+        assert self.bash_session is not None
+        working_dir = self.bash_session.cwd
         filepath = self._resolve_path(action.path, working_dir)
 
         insert = action.content.split('\n')
@@ -400,7 +413,8 @@ async def browse_interactive(self, action: BrowseInteractiveAction) -> Observati
         return await browse(action, self.browser)
 
     def close(self):
-        self.bash_session.close()
+        if self.bash_session is not None:
+            self.bash_session.close()
         self.browser.close()
 
 
@@ -609,6 +623,8 @@ async def download_file(path: str):
 
     @app.get('/alive')
     async def alive():
+        if client is None or not client.initialized:
+            return {'status': 'not initialized'}
         return {'status': 'ok'}
 
     # ================================
@@ -658,11 +674,11 @@ async def list_files(request: Request):
 
         # Get the full path of the requested directory
         if path is None:
-            full_path = client.initial_pwd
+            full_path = client.initial_cwd
         elif os.path.isabs(path):
             full_path = path
         else:
-            full_path = os.path.join(client.initial_pwd, path)
+            full_path = os.path.join(client.initial_cwd, path)
 
         if not os.path.exists(full_path):
             # if user just removed a folder, prevent server error 500 in UI
diff --git a/openhands/runtime/impl/action_execution/action_execution_client.py b/openhands/runtime/impl/action_execution/action_execution_client.py
index 00c847a23015..80f71ffe8ab1 100644
--- a/openhands/runtime/impl/action_execution/action_execution_client.py
+++ b/openhands/runtime/impl/action_execution/action_execution_client.py
@@ -264,7 +264,6 @@ def send_action_for_execution(self, action: Action) -> Observation:
                 raise AgentRuntimeTimeoutError(
                     f'Runtime failed to return execute_action before the requested timeout of {action.timeout}s'
                 )
-
             return obs
 
     def run(self, action: CmdRunAction) -> Observation:
diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py
index d146e71d198a..010ec8b0e938 100644
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -219,7 +219,9 @@ def _start_runtime(self):
             'image': self.container_image,
             'command': command,
             'working_dir': '/openhands/code/',
-            'environment': {'DEBUG': 'true'} if self.config.debug else {},
+            'environment': {'DEBUG': 'true'}
+            if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true'
+            else {},
             'session_id': self.sid,
             'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
         }
@@ -364,7 +366,10 @@ def _send_action_server_request(self, method, url, **kwargs):
         try:
             return super()._send_action_server_request(method, url, **kwargs)
         except requests.Timeout:
-            self.log('error', 'No response received within the timeout period.')
+            self.log(
+                'error',
+                f'No response received within the timeout period for url: {url}',
+            )
             raise
         except requests.HTTPError as e:
             if e.response.status_code == 404:
diff --git a/openhands/runtime/utils/bash.py b/openhands/runtime/utils/bash.py
index a5019315a038..84c5d0265957 100644
--- a/openhands/runtime/utils/bash.py
+++ b/openhands/runtime/utils/bash.py
@@ -1,18 +1,21 @@
 import os
 import re
+import time
+import uuid
+from enum import Enum
 
 import bashlex
-import pexpect
+import libtmux
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import CmdRunAction
-from openhands.events.event import EventSource
-from openhands.events.observation import (
+from openhands.events.observation import ErrorObservation
+from openhands.events.observation.commands import (
+    CMD_OUTPUT_PS1_END,
+    CmdOutputMetadata,
     CmdOutputObservation,
-    ErrorObservation,
 )
-
-SOFT_TIMEOUT_SECONDS = 5
+from openhands.utils.shutdown_listener import should_continue
 
 
 def split_bash_commands(commands):
@@ -66,269 +69,493 @@ def split_bash_commands(commands):
     return result
 
 
+def escape_bash_special_chars(command: str) -> str:
+    r"""
+    Escapes characters that have different interpretations in bash vs python.
+    Specifically handles escape sequences like \;, \|, \&, etc.
+    """
+    if command.strip() == '':
+        return ''
+
+    try:
+        parts = []
+        last_pos = 0
+
+        def visit_node(node):
+            nonlocal last_pos
+            if (
+                node.kind == 'redirect'
+                and hasattr(node, 'heredoc')
+                and node.heredoc is not None
+            ):
+                # We're entering a heredoc - preserve everything as-is until we see EOF
+                # Store the heredoc end marker (usually 'EOF' but could be different)
+                between = command[last_pos : node.pos[0]]
+                parts.append(between)
+                # Add the heredoc start marker
+                parts.append(command[node.pos[0] : node.heredoc.pos[0]])
+                # Add the heredoc content as-is
+                parts.append(command[node.heredoc.pos[0] : node.heredoc.pos[1]])
+                last_pos = node.pos[1]
+                return
+
+            if node.kind == 'word':
+                # Get the raw text between the last position and current word
+                between = command[last_pos : node.pos[0]]
+                word_text = command[node.pos[0] : node.pos[1]]
+
+                # Add the between text, escaping special characters
+                between = re.sub(r'\\([;&|><])', r'\\\\\1', between)
+                parts.append(between)
+
+                # Check if word_text is a quoted string or command substitution
+                if (
+                    (word_text.startswith('"') and word_text.endswith('"'))
+                    or (word_text.startswith("'") and word_text.endswith("'"))
+                    or (word_text.startswith('$(') and word_text.endswith(')'))
+                    or (word_text.startswith('`') and word_text.endswith('`'))
+                ):
+                    # Preserve quoted strings, command substitutions, and heredoc content as-is
+                    parts.append(word_text)
+                else:
+                    # Escape special chars in unquoted text
+                    word_text = re.sub(r'\\([;&|><])', r'\\\\\1', word_text)
+                    parts.append(word_text)
+
+                last_pos = node.pos[1]
+                return
+
+            # Visit child nodes
+            if hasattr(node, 'parts'):
+                for part in node.parts:
+                    visit_node(part)
+
+        # Process all nodes in the AST
+        nodes = list(bashlex.parse(command))
+        for node in nodes:
+            between = command[last_pos : node.pos[0]]
+            between = re.sub(r'\\([;&|><])', r'\\\\\1', between)
+            parts.append(between)
+            last_pos = node.pos[0]
+            visit_node(node)
+
+        # Handle any remaining text after the last word
+        remaining = command[last_pos:]
+        parts.append(remaining)
+        return ''.join(parts)
+    except bashlex.errors.ParsingError:
+        # Fallback if parsing fails
+        logger.warning(f'Failed to parse command: {command}')
+        return command
+
+
+class BashCommandStatus(Enum):
+    CONTINUE = 'continue'
+    COMPLETED = 'completed'
+    NO_CHANGE_TIMEOUT = 'no_change_timeout'
+    HARD_TIMEOUT = 'hard_timeout'
+
+
+def _remove_command_prefix(command_output: str, command: str) -> str:
+    return command_output.lstrip().removeprefix(command.lstrip()).lstrip()
+
+
 class BashSession:
-    """A class that maintains a pexpect process and provides a simple interface for running commands and interacting with the shell."""
+    POLL_INTERVAL = 0.5
+    HISTORY_LIMIT = 10_000
+    PS1 = CmdOutputMetadata.to_ps1_prompt()
+
+    def __init__(
+        self,
+        work_dir: str,
+        username: str | None = None,
+        no_change_timeout_seconds: float = 30.0,
+    ):
+        self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds
+        self.work_dir = work_dir
+        self.username = username
+        self._initialized = False
+
+    def initialize(self):
+        self.server = libtmux.Server()
+        window_command = '/bin/bash'
+        if self.username:
+            # This starts a non-login (new) shell for the given user
+            window_command = f'su {self.username} -'
+
+        session_name = f'openhands-{self.username}-{uuid.uuid4()}'
+        self.session = self.server.new_session(
+            session_name=session_name,
+            window_name='bash',
+            window_command=window_command,
+            start_directory=self.work_dir,
+            kill_session=True,
+            x=1000,
+            y=1000,
+        )
 
-    def __init__(self, work_dir: str, username: str):
-        self._pwd = work_dir
+        # Set history limit to a large number to avoid losing history
+        # https://unix.stackexchange.com/questions/43414/unlimited-history-in-tmux
+        self.session.set_option('history-limit', str(self.HISTORY_LIMIT), _global=True)
+        self.session.history_limit = self.HISTORY_LIMIT
+        # We need to create a new pane because the initial pane's history limit is (default) 2000
+        _initial_window = self.session.attached_window
+        self.window = self.session.new_window(
+            window_shell=window_command,
+            start_directory=self.work_dir,
+        )
+        self.pane = self.window.attached_pane
+        logger.debug(f'pane: {self.pane}; history_limit: {self.session.history_limit}')
+        _initial_window.kill_window()
 
-        self.shell = pexpect.spawn(
-            f'su {username}',
-            encoding='utf-8',
-            codec_errors='replace',
-            echo=False,
+        # Configure bash to use simple PS1 and disable PS2
+        self.pane.send_keys(
+            f'export PROMPT_COMMAND=\'export PS1="{self.PS1}"\'; export PS2=""'
+        )
+        time.sleep(0.1)  # Wait for command to take effect
+        self._clear_screen()
+
+        # Store the last command for interactive input handling
+        self.prev_status: BashCommandStatus | None = None
+        self.prev_output: str = ''
+        self._closed: bool = False
+        logger.debug(f'Bash session initialized with work dir: {self.work_dir}')
+
+        # Maintain the current working directory
+        self._cwd = os.path.abspath(self.work_dir)
+        self._initialized = True
+
+    def __del__(self):
+        """Ensure the session is closed when the object is destroyed."""
+        self.close()
+
+    def _get_pane_content(self) -> str:
+        """Capture the current pane content and update the buffer."""
+        content = '\n'.join(
+            map(
+                # avoid double newlines
+                lambda line: line.rstrip(),
+                self.pane.cmd('capture-pane', '-J', '-pS', '-').stdout,
+            )
         )
-        self._init_bash_shell(work_dir)
+        return content
 
     def close(self):
-        self.shell.close()
-
-    @property
-    def pwd(self):
-        return self._pwd
+        """Clean up the session."""
+        if self._closed:
+            return
+        self.session.kill_session()
+        self._closed = True
 
     @property
-    def workdir(self):
-        return self._get_working_directory()
-
-    def _get_working_directory(self):
-        # NOTE: this is part of initialization, so we hard code the timeout
-        result, exit_code = self._execute_bash('pwd', timeout=60, keep_prompt=False)
-        if exit_code != 0:
-            raise RuntimeError(
-                f'Failed to get working directory (exit code: {exit_code}): {result}'
-            )
-        return result.strip()
-
-    def _init_bash_shell(self, work_dir: str):
-        self.__bash_PS1 = (
-            r'[PEXPECT_BEGIN]\n'
-            r'$(which python >/dev/null 2>&1 && echo "[Python Interpreter: $(which python)]\n")'
-            r'\u@\h:\w\n'
-            r'[PEXPECT_END]'
+    def cwd(self):
+        return self._cwd
+
+    def _is_special_key(self, command: str) -> bool:
+        """Check if the command is a special key."""
+        # Special keys are of the form C-<key>
+        _command = command.strip()
+        return _command.startswith('C-') and len(_command) == 3
+
+    def _clear_screen(self):
+        """Clear the tmux pane screen and history."""
+        self.pane.send_keys('C-l', enter=False)
+        time.sleep(0.1)
+        self.pane.cmd('clear-history')
+
+    def _get_command_output(
+        self,
+        command: str,
+        raw_command_output: str,
+        metadata: CmdOutputMetadata,
+        continue_prefix: str = '',
+    ) -> str:
+        """Get the command output with the previous command output removed.
+
+        Args:
+            command: The command that was executed.
+            raw_command_output: The raw output from the command.
+            metadata: The metadata object to store prefix/suffix in.
+            continue_prefix: The prefix to add to the command output if it's a continuation of the previous command.
+        """
+        # remove the previous command output from the new output if any
+        if self.prev_output:
+            command_output = raw_command_output.removeprefix(self.prev_output)
+            metadata.prefix = continue_prefix
+        else:
+            command_output = raw_command_output
+        self.prev_output = raw_command_output  # update current command output anyway
+        command_output = _remove_command_prefix(command_output, command)
+        return command_output.rstrip()
+
+    def _handle_completed_command(
+        self, command: str, pane_content: str, ps1_matches: list[re.Match]
+    ) -> CmdOutputObservation:
+        is_special_key = self._is_special_key(command)
+        assert len(ps1_matches) >= 1, (
+            f'Expected at least one PS1 metadata block, but got {len(ps1_matches)}.\n'
+            f'---FULL OUTPUT---\n{pane_content!r}\n---END OF OUTPUT---'
+        )
+        metadata = CmdOutputMetadata.from_ps1_match(ps1_matches[-1])
+
+        # Special case where the previous command output is truncated due to history limit
+        # We should get the content BEFORE the last PS1 prompt
+        get_content_before_last_match = bool(len(ps1_matches) == 1)
+
+        # Update the current working directory if it has changed
+        if metadata.working_dir != self._cwd and metadata.working_dir:
+            self._cwd = metadata.working_dir
+
+        logger.debug(f'COMMAND OUTPUT: {pane_content}')
+        # Extract the command output between the two PS1 prompts
+        raw_command_output = self._combine_outputs_between_matches(
+            pane_content,
+            ps1_matches,
+            get_content_before_last_match=get_content_before_last_match,
         )
 
-        # This should NOT match "PS1=\u@\h:\w [PEXPECT]$" when `env` is executed
-        self.__bash_expect_regex = r'\[PEXPECT_BEGIN\]\s*(.*?)\s*([a-z0-9_-]*)@([a-zA-Z0-9.-]*):(.+)\s*\[PEXPECT_END\]'
-        # Set umask to allow group write permissions
-        self.shell.sendline(f'umask 002; export PS1="{self.__bash_PS1}"; export PS2=""')
-        self.shell.expect(self.__bash_expect_regex)
+        if get_content_before_last_match:
+            # Count the number of lines in the truncated output
+            num_lines = len(raw_command_output.splitlines())
+            metadata.prefix = f'[Previous command outputs are truncated. Showing the last {num_lines} lines of the output below.]\n'
 
-        self.shell.sendline(
-            f'if [ ! -d "{work_dir}" ]; then mkdir -p "{work_dir}"; fi && cd "{work_dir}"'
+        metadata.suffix = (
+            f'\n[The command completed with exit code {metadata.exit_code}.]'
+            if not is_special_key
+            else f'\n[The command completed with exit code {metadata.exit_code}. CTRL+{command[-1].upper()} was sent.]'
         )
-        self.shell.expect(self.__bash_expect_regex)
-        logger.debug(
-            f'Bash initialized. Working directory: {work_dir}. Output: [{self.shell.before}]'
+        command_output = self._get_command_output(
+            command,
+            raw_command_output,
+            metadata,
+        )
+        self.prev_status = BashCommandStatus.COMPLETED
+        self.prev_output = ''  # Reset previous command output
+        self._ready_for_next_command()
+        return CmdOutputObservation(
+            content=command_output,
+            command=command,
+            metadata=metadata,
         )
-        # Ensure the group has write permissions on the working directory
-        self.shell.sendline(f'chmod g+rw "{work_dir}"')
-        self.shell.expect(self.__bash_expect_regex)
-
-    def _get_bash_prompt_and_update_pwd(self):
-        ps1 = self.shell.after
-        if ps1 == pexpect.EOF:
-            logger.error(f'Bash shell EOF! {self.shell.after=}, {self.shell.before=}')
-            raise RuntimeError('Bash shell EOF')
-        if ps1 == pexpect.TIMEOUT:
-            logger.warning('Bash shell timeout')
-            return ''
-
-        # begin at the last occurrence of '[PEXPECT_BEGIN]'.
-        # In multi-line bash commands, the prompt will be repeated
-        # and the matched regex captures all of them
-        # - we only want the last one (newest prompt)
-        _begin_pos = ps1.rfind('[PEXPECT_BEGIN]')
-        if _begin_pos != -1:
-            ps1 = ps1[_begin_pos:]
-
-        # parse the ps1 to get username, hostname, and working directory
-        matched = re.match(self.__bash_expect_regex, ps1)
-        assert (
-            matched is not None
-        ), f'Failed to parse bash prompt: {ps1}. This should not happen.'
-        other_info, username, hostname, working_dir = matched.groups()
-        working_dir = working_dir.rstrip()
-        self._pwd = os.path.expanduser(working_dir)
-
-        # re-assemble the prompt
-        # ignore the hostname AND use 'openhands-workspace'
-        prompt = f'{other_info.strip()}\n{username}@openhands-workspace:{working_dir} '
-        if username == 'root':
-            prompt += '#'
-        else:
-            prompt += '$'
-        return prompt + ' '
 
-    def _execute_bash(
+    def _handle_nochange_timeout_command(
         self,
         command: str,
-        timeout: int,
-        keep_prompt: bool = True,
-        kill_on_timeout: bool = True,
-    ) -> tuple[str, int]:
-        logger.debug(f'Executing command: {command}')
-        self.shell.sendline(command)
-        return self._continue_bash(
-            timeout=timeout, keep_prompt=keep_prompt, kill_on_timeout=kill_on_timeout
+        pane_content: str,
+        ps1_matches: list[re.Match],
+    ) -> CmdOutputObservation:
+        self.prev_status = BashCommandStatus.NO_CHANGE_TIMEOUT
+        if len(ps1_matches) != 1:
+            logger.warning(
+                'Expected exactly one PS1 metadata block BEFORE the execution of a command, '
+                f'but got {len(ps1_matches)} PS1 metadata blocks:\n---\n{pane_content!r}\n---'
+            )
+        raw_command_output = self._combine_outputs_between_matches(
+            pane_content, ps1_matches
+        )
+        metadata = CmdOutputMetadata()  # No metadata available
+        metadata.suffix = (
+            f'\n[The command has no new output after {self.NO_CHANGE_TIMEOUT_SECONDS} seconds. '
+            "You may wait longer to see additional output by sending empty command '', "
+            'send other commands to interact with the current process, '
+            'or send keys to interrupt/kill the command.]'
+        )
+        command_output = self._get_command_output(
+            command,
+            raw_command_output,
+            metadata,
+            continue_prefix='[Command output continued from previous command]\n',
+        )
+        return CmdOutputObservation(
+            content=command_output,
+            command=command,
+            metadata=metadata,
         )
 
-    def _interrupt_bash(
+    def _handle_hard_timeout_command(
         self,
-        action_timeout: int | None,
-        interrupt_timeout: int | None = None,
-        max_retries: int = 2,
-    ) -> tuple[str, int]:
-        interrupt_timeout = interrupt_timeout or 1  # default timeout for SIGINT
-        # try to interrupt the bash shell use SIGINT
-        while max_retries > 0:
-            self.shell.sendintr()  # send SIGINT to the shell
-            logger.debug('Sent SIGINT to bash. Waiting for output...')
-            try:
-                self.shell.expect(self.__bash_expect_regex, timeout=interrupt_timeout)
-                output = self.shell.before
-                logger.debug(f'Received output after SIGINT: {output}')
-                exit_code = 130  # SIGINT
-
-                _additional_msg = ''
-                if action_timeout is not None:
-                    _additional_msg = (
-                        f'Command timed out after {action_timeout} seconds. '
-                    )
-                output += (
-                    '\r\n\r\n'
-                    + f'[{_additional_msg}SIGINT was sent to interrupt the command.]'
-                )
-                return output, exit_code
-            except pexpect.TIMEOUT as e:
-                logger.warning(f'Bash pexpect.TIMEOUT while waiting for SIGINT: {e}')
-                max_retries -= 1
-
-        # fall back to send control-z
-        logger.error(
-            'Failed to get output after SIGINT. Max retries reached. Sending control-z...'
+        command: str,
+        pane_content: str,
+        ps1_matches: list[re.Match],
+        timeout: float,
+    ) -> CmdOutputObservation:
+        self.prev_status = BashCommandStatus.HARD_TIMEOUT
+        if len(ps1_matches) != 1:
+            logger.warning(
+                'Expected exactly one PS1 metadata block BEFORE the execution of a command, '
+                f'but got {len(ps1_matches)} PS1 metadata blocks:\n---\n{pane_content!r}\n---'
+            )
+        raw_command_output = self._combine_outputs_between_matches(
+            pane_content, ps1_matches
+        )
+        metadata = CmdOutputMetadata()  # No metadata available
+        metadata.suffix = (
+            f'\n[The command timed out after {timeout} seconds. '
+            "You may wait longer to see additional output by sending empty command '', "
+            'send other commands to interact with the current process, '
+            'or send keys to interrupt/kill the command.]'
         )
-        self.shell.sendcontrol('z')
-        self.shell.expect(self.__bash_expect_regex)
-        output = self.shell.before
-        logger.debug(f'Received output after control-z: {output}')
-        # Try to kill the job
-        self.shell.sendline('kill -9 %1')
-        self.shell.expect(self.__bash_expect_regex)
-        logger.debug(f'Received output after killing job %1: {self.shell.before}')
-        output += self.shell.before
-
-        _additional_msg = ''
-        if action_timeout is not None:
-            _additional_msg = f'Command timed out after {action_timeout} seconds. '
-        output += (
-            '\r\n\r\n'
-            + f'[{_additional_msg}SIGINT was sent to interrupt the command, but failed. The command was killed.]'
+        command_output = self._get_command_output(
+            command,
+            raw_command_output,
+            metadata,
+            continue_prefix='[Command output continued from previous command]\n',
         )
 
-        # Try to get the exit code again
-        self.shell.sendline('echo $?')
-        self.shell.expect(self.__bash_expect_regex)
-        _exit_code_output = self.shell.before
-        exit_code = self._parse_exit_code(_exit_code_output)
-
-        return output, exit_code
-
-    def _parse_exit_code(self, output: str) -> int:
-        try:
-            exit_code = int(output.strip().split()[0])
-        except Exception:
-            logger.error('Error getting exit code from bash script')
-            # If we try to run an invalid shell script the output sometimes includes error text
-            # rather than the error code - we assume this is an error
-            exit_code = 2
-        return exit_code
-
-    def _continue_bash(
+        return CmdOutputObservation(
+            command=command,
+            content=command_output,
+            metadata=metadata,
+        )
+
+    def _ready_for_next_command(self):
+        """Reset the content buffer for a new command."""
+        # Clear the current content
+        self._clear_screen()
+
+    def _combine_outputs_between_matches(
         self,
-        timeout: int,
-        keep_prompt: bool = True,
-        kill_on_timeout: bool = True,
-    ) -> tuple[str, int]:
-        logger.debug(f'Continuing bash with timeout={timeout}')
-        try:
-            self.shell.expect(self.__bash_expect_regex, timeout=timeout)
-
-            output = self.shell.before
-
-            # Get exit code
-            self.shell.sendline('echo $?')
-            logger.debug('Requesting exit code...')
-            self.shell.expect(self.__bash_expect_regex, timeout=timeout)
-            _exit_code_output = self.shell.before
-            exit_code = self._parse_exit_code(_exit_code_output)
-        except pexpect.TIMEOUT as e:
-            logger.warning(f'Bash pexpect.TIMEOUT while executing bash command: {e}')
-            if kill_on_timeout:
-                output, exit_code = self._interrupt_bash(action_timeout=timeout)
+        pane_content: str,
+        ps1_matches: list[re.Match],
+        get_content_before_last_match: bool = False,
+    ) -> str:
+        """Combine all outputs between PS1 matches.
+
+        Args:
+            pane_content: The full pane content containing PS1 prompts and command outputs
+            ps1_matches: List of regex matches for PS1 prompts
+            get_content_before_last_match: when there's only one PS1 match, whether to get
+                the content before the last PS1 prompt (True) or after the last PS1 prompt (False)
+        Returns:
+            Combined string of all outputs between matches
+        """
+        if len(ps1_matches) == 1:
+            if get_content_before_last_match:
+                # The command output is the content before the last PS1 prompt
+                return pane_content[: ps1_matches[0].start()]
             else:
-                output = self.shell.before or ''
-                exit_code = -1
-        finally:
-            bash_prompt = self._get_bash_prompt_and_update_pwd()
-            if keep_prompt:
-                output += '\r\n' + bash_prompt
-        return output, exit_code
-
-    def run(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservation:
-        try:
-            assert (
-                action.timeout is not None
-            ), f'Timeout argument is required for CmdRunAction: {action}'
-            commands = split_bash_commands(action.command)
-            all_output = ''
-            python_interpreter = ''
-            for command in commands:
-                if command == '':
-                    output, exit_code = self._continue_bash(
-                        timeout=SOFT_TIMEOUT_SECONDS,
-                        keep_prompt=action.keep_prompt,
-                        kill_on_timeout=False,
-                    )
-                elif command.lower() == 'ctrl+c':
-                    output, exit_code = self._interrupt_bash(
-                        action_timeout=None,  # intentionally None
-                    )
-                else:
-                    output, exit_code = self._execute_bash(
-                        command,
-                        timeout=SOFT_TIMEOUT_SECONDS
-                        if not action.blocking
-                        else action.timeout,
-                        keep_prompt=action.keep_prompt,
-                        kill_on_timeout=False if not action.blocking else True,
-                    )
-                    # Get rid of the python interpreter string from each line of the output.
-                    # We need it only once at the end.
-                    parts = output.rsplit('[Python Interpreter: ', 1)
-                    output = parts[0]
-                    if len(parts) == 2:
-                        python_interpreter = '[Python Interpreter: ' + parts[1]
-                if all_output:
-                    # previous output already exists so we add a newline
-                    all_output += '\r\n'
-
-                # If the command originated with the agent, append the command that was run...
-                if action.source == EventSource.AGENT:
-                    all_output += command + '\r\n'
-
-                all_output += str(output)
-                if exit_code != 0:
-                    break
+                # The command output is the content after the last PS1 prompt
+                return pane_content[ps1_matches[0].end() + 1 :]
+        combined_output = ''
+        for i in range(len(ps1_matches) - 1):
+            # Extract content between current and next PS1 prompt
+            output_segment = pane_content[
+                ps1_matches[i].end() + 1 : ps1_matches[i + 1].start()
+            ]
+            combined_output += output_segment + '\n'
+        logger.debug(f'COMBINED OUTPUT: {combined_output}')
+        return combined_output
+
+    def execute(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservation:
+        """Execute a command in the bash session."""
+        if not self._initialized:
+            raise RuntimeError('Bash session is not initialized')
+
+        # Strip the command of any leading/trailing whitespace
+        logger.debug(f'RECEIVED ACTION: {action}')
+        command = action.command.strip()
+
+        if command == '' and self.prev_status not in {
+            BashCommandStatus.CONTINUE,
+            BashCommandStatus.NO_CHANGE_TIMEOUT,
+            BashCommandStatus.HARD_TIMEOUT,
+        }:
             return CmdOutputObservation(
-                command_id=-1,
-                content=all_output.rstrip('\r\n'),
-                command=action.command,
-                hidden=action.hidden,
-                exit_code=exit_code,
-                interpreter_details=python_interpreter,
+                content='ERROR: No previous command to continue from. '
+                + 'Previous command has to be timeout to be continued.',
+                command='',
+                metadata=CmdOutputMetadata(),
             )
-        except UnicodeDecodeError as e:
+
+        splited_commands = split_bash_commands(command)
+        if len(splited_commands) > 1:
             return ErrorObservation(
-                f'Runtime bash execution failed: Command output could not be decoded as utf-8. {str(e)}',
+                content=(
+                    f'ERROR: Cannot execute multiple commands at once.\n'
+                    f'Please run each command separately OR chain them into a single command via && or ;\n'
+                    f'Provided commands:\n{"\n".join(f"({i+1}) {cmd}" for i, cmd in enumerate(splited_commands))}'
+                )
             )
+
+        start_time = time.time()
+        last_change_time = start_time
+        last_pane_output = self._get_pane_content()
+
+        _ps1_matches = CmdOutputMetadata.matches_ps1_metadata(last_pane_output)
+        assert len(_ps1_matches) >= 1, (
+            'Expected at least one PS1 metadata block BEFORE the execution of a command, '
+            f'but got {len(_ps1_matches)} PS1 metadata blocks:\n---\n{last_pane_output!r}\n---'
+        )
+        if len(_ps1_matches) > 1:
+            logger.warning(
+                'Found multiple PS1 metadata blocks BEFORE the execution of a command. '
+                'Only the last one will be used.'
+            )
+            _ps1_matches = [_ps1_matches[-1]]
+
+        if command != '':
+            # convert command to raw string
+            command = escape_bash_special_chars(command)
+            logger.debug(f'SENDING COMMAND: {command!r}')
+            self.pane.send_keys(
+                command,
+                enter=not self._is_special_key(command),
+            )
+
+        # Loop until the command completes or times out
+        while should_continue():
+            _start_time = time.time()
+            logger.debug(f'GETTING PANE CONTENT at {_start_time}')
+            cur_pane_output = self._get_pane_content()
+            logger.debug(
+                f'PANE CONTENT GOT after {time.time() - _start_time:.2f} seconds'
+            )
+            logger.debug(f'BEGIN OF PANE CONTENT: {cur_pane_output.split("\n")[:10]}')
+            logger.debug(f'END OF PANE CONTENT: {cur_pane_output.split("\n")[-10:]}')
+            ps1_matches = CmdOutputMetadata.matches_ps1_metadata(cur_pane_output)
+            if cur_pane_output != last_pane_output:
+                last_pane_output = cur_pane_output
+                last_change_time = time.time()
+                logger.debug(f'CONTENT UPDATED DETECTED at {last_change_time}')
+
+            # 1) Execution completed
+            # if the last command output contains the end marker
+            if cur_pane_output.rstrip().endswith(CMD_OUTPUT_PS1_END.rstrip()):
+                return self._handle_completed_command(
+                    command,
+                    pane_content=cur_pane_output,
+                    ps1_matches=ps1_matches,
+                )
+
+            # 2) Execution timed out since there's no change in output
+            # for a while (self.NO_CHANGE_TIMEOUT_SECONDS)
+            # We ignore this if the command is *blocking
+            time_since_last_change = time.time() - last_change_time
+            logger.debug(
+                f'CHECKING NO CHANGE TIMEOUT ({self.NO_CHANGE_TIMEOUT_SECONDS}s): elapsed {time_since_last_change}'
+            )
+            if (
+                not action.blocking
+                and time_since_last_change >= self.NO_CHANGE_TIMEOUT_SECONDS
+            ):
+                return self._handle_nochange_timeout_command(
+                    command,
+                    pane_content=cur_pane_output,
+                    ps1_matches=ps1_matches,
+                )
+
+            # 3) Execution timed out due to hard timeout
+            logger.debug(
+                f'CHECKING HARD TIMEOUT ({action.timeout}s): elapsed {time.time() - start_time}'
+            )
+            if action.timeout and time.time() - start_time >= action.timeout:
+                return self._handle_hard_timeout_command(
+                    command,
+                    pane_content=cur_pane_output,
+                    ps1_matches=ps1_matches,
+                    timeout=action.timeout,
+                )
+
+            logger.debug(f'SLEEPING for {self.POLL_INTERVAL} seconds for next poll')
+            time.sleep(self.POLL_INTERVAL)
+        raise RuntimeError('Bash session was likely interrupted...')
diff --git a/openhands/runtime/utils/runtime_init.py b/openhands/runtime/utils/runtime_init.py
index 9ebba67fcd31..b38ab7ca3495 100644
--- a/openhands/runtime/utils/runtime_init.py
+++ b/openhands/runtime/utils/runtime_init.py
@@ -4,7 +4,7 @@
 
 
 def init_user_and_working_directory(
-    username: str, user_id: int, initial_pwd: str
+    username: str, user_id: int, initial_cwd: str
 ) -> int | None:
     """Create working directory and user if not exists.
     It performs the following steps effectively:
@@ -26,23 +26,23 @@ def init_user_and_working_directory(
     Args:
         username (str): The username to create.
         user_id (int): The user ID to assign to the user.
-        initial_pwd (str): The initial working directory to create.
+        initial_cwd (str): The initial working directory to create.
 
     Returns:
         int | None: The user ID if it was updated, None otherwise.
     """
 
     # First create the working directory, independent of the user
-    logger.debug(f'Client working directory: {initial_pwd}')
-    command = f'umask 002; mkdir -p {initial_pwd}'
+    logger.debug(f'Client working directory: {initial_cwd}')
+    command = f'umask 002; mkdir -p {initial_cwd}'
     output = subprocess.run(command, shell=True, capture_output=True)
     out_str = output.stdout.decode()
 
-    command = f'chown -R {username}:root {initial_pwd}'
+    command = f'chown -R {username}:root {initial_cwd}'
     output = subprocess.run(command, shell=True, capture_output=True)
     out_str += output.stdout.decode()
 
-    command = f'chmod g+rw {initial_pwd}'
+    command = f'chmod g+rw {initial_cwd}'
     output = subprocess.run(command, shell=True, capture_output=True)
     out_str += output.stdout.decode()
     logger.debug(f'Created working directory. Output: [{out_str}]')
diff --git a/openhands/runtime/utils/runtime_templates/Dockerfile.j2 b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
index 7423421252ab..8a97792d9de3 100644
--- a/openhands/runtime/utils/runtime_templates/Dockerfile.j2
+++ b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
@@ -15,7 +15,7 @@ ENV POETRY_VIRTUALENVS_PATH=/openhands/poetry \
 # Install base system dependencies
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
-        wget curl sudo apt-utils git jq \
+        wget curl sudo apt-utils git jq tmux \
         {% if 'ubuntu' in base_image and (base_image.endswith(':latest') or base_image.endswith(':24.04')) %}
         libgl1 \
         {% else %}
diff --git a/poetry.lock b/poetry.lock
index b1263ea1f9ca..54f3fd9fd419 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -3711,6 +3711,17 @@ websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0"
 [package.extras]
 adal = ["adal (>=1.0.2)"]
 
+[[package]]
+name = "libtmux"
+version = "0.37.0"
+description = "Typed library that provides an ORM wrapper for tmux, a terminal multiplexer."
+optional = false
+python-versions = "<4.0,>=3.8"
+files = [
+    {file = "libtmux-0.37.0-py3-none-any.whl", hash = "sha256:7e8cbab30b033d132b6fca5dddb575bb7f6a1fd802328e7174f9b49023556376"},
+    {file = "libtmux-0.37.0.tar.gz", hash = "sha256:21955c5dce6332db41abad5e26ae8c4062ef2b9a89099bd57a36f52be1d5270f"},
+]
+
 [[package]]
 name = "libwebarena"
 version = "0.0.3"
@@ -8298,6 +8309,25 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"]
 pymysql = ["pymysql"]
 sqlcipher = ["sqlcipher3_binary"]
 
+[[package]]
+name = "sse-starlette"
+version = "2.1.3"
+description = "SSE plugin for Starlette"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "sse_starlette-2.1.3-py3-none-any.whl", hash = "sha256:8ec846438b4665b9e8c560fcdea6bc8081a3abf7942faa95e5a744999d219772"},
+    {file = "sse_starlette-2.1.3.tar.gz", hash = "sha256:9cd27eb35319e1414e3d2558ee7414487f9529ce3b3cf9b21434fd110e017169"},
+]
+
+[package.dependencies]
+anyio = "*"
+starlette = "*"
+uvicorn = "*"
+
+[package.extras]
+examples = ["fastapi"]
+
 [[package]]
 name = "stack-data"
 version = "0.6.3"
@@ -10054,4 +10084,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "691bdd0f64e3476858eb34ce6ed6d0b0e7d97458cfd69fd366cd9c1c4f4ec897"
+content-hash = "db887f071f7dbb712cfba5d9b4de8938afbedee22fd166b4527f4aec40e37cfd"
diff --git a/pyproject.toml b/pyproject.toml
index be7cb9a3a7ce..037025e70ea4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "openhands-ai"
-version = "0.18.0"
+version = "0.18.0a0"
 description = "OpenHands: Code Less, Make More"
 authors = ["OpenHands"]
 license = "MIT"
@@ -62,11 +62,13 @@ opentelemetry-api = "1.25.0"
 opentelemetry-exporter-otlp-proto-grpc = "1.25.0"
 modal = ">=0.66.26,<0.71.0"
 runloop-api-client = "0.11.0"
+libtmux = "^0.37.0"
 pygithub = "^2.5.0"
 joblib = "*"
 openhands-aci = "0.1.5"
 python-socketio = "^5.11.4"
 redis = "^5.2.0"
+sse-starlette = "^2.1.3"
 
 [tool.poetry.group.llama-index.dependencies]
 llama-index = "*"
@@ -100,6 +102,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]
 
+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -129,6 +132,7 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"
 
+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
diff --git a/tests/runtime/conftest.py b/tests/runtime/conftest.py
index 062fc2ed9fa1..83038fa28666 100644
--- a/tests/runtime/conftest.py
+++ b/tests/runtime/conftest.py
@@ -69,7 +69,7 @@ def _close_test_runtime(runtime: Runtime) -> None:
     time.sleep(1)
 
 
-def _reset_pwd() -> None:
+def _reset_cwd() -> None:
     global project_dir
     # Try to change back to project directory
     try:
@@ -152,16 +152,16 @@ def get_run_as_openhands() -> list[bool]:
 
 @pytest.fixture(scope='module')  # for xdist
 def runtime_setup_module():
-    _reset_pwd()
+    _reset_cwd()
     yield
-    _reset_pwd()
+    _reset_cwd()
 
 
 @pytest.fixture(scope='session')  # not for xdist
 def runtime_setup_session():
-    _reset_pwd()
+    _reset_cwd()
     yield
-    _reset_pwd()
+    _reset_cwd()
 
 
 # This assures that all tests run together per runtime, not alternating between them,
@@ -230,14 +230,14 @@ def _load_runtime(
     global test_mount_path
     if use_workspace:
         test_mount_path = os.path.join(config.workspace_base, 'rt')
+    elif temp_dir is not None:
+        test_mount_path = os.path.join(temp_dir, sid)
     else:
-        test_mount_path = os.path.join(
-            temp_dir, sid
-        )  # need a subfolder to avoid conflicts
+        test_mount_path = None
     config.workspace_mount_path = test_mount_path
 
     # Mounting folder specific for this test inside the sandbox
-    config.workspace_mount_path_in_sandbox = f'{sandbox_test_folder}/{sid}'
+    config.workspace_mount_path_in_sandbox = f'{sandbox_test_folder}'
     print('\nPaths used:')
     print(f'use_host_network: {config.sandbox.use_host_network}')
     print(f'workspace_base: {config.workspace_base}')
diff --git a/tests/runtime/test_bash.py b/tests/runtime/test_bash.py
index f8ff95d9a6b2..b97cdbc9266b 100644
--- a/tests/runtime/test_bash.py
+++ b/tests/runtime/test_bash.py
@@ -5,7 +5,6 @@
 
 import pytest
 from conftest import (
-    TEST_IN_CI,
     _close_test_runtime,
     _get_sandbox_folder,
     _load_runtime,
@@ -13,7 +12,7 @@
 
 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import CmdRunAction
-from openhands.events.observation import CmdOutputObservation
+from openhands.events.observation import CmdOutputObservation, ErrorObservation
 from openhands.runtime.base import Runtime
 
 # ============================================================================================================================
@@ -21,36 +20,19 @@
 # ============================================================================================================================
 
 
-def _run_cmd_action(runtime, custom_command: str, keep_prompt=True):
-    action = CmdRunAction(command=custom_command, keep_prompt=keep_prompt)
+def _run_cmd_action(runtime, custom_command: str):
+    action = CmdRunAction(command=custom_command)
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
-    assert isinstance(obs, CmdOutputObservation)
+    assert isinstance(obs, (CmdOutputObservation, ErrorObservation))
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     return obs
 
 
-def test_bash_command_pexcept(temp_dir, runtime_cls, run_as_openhands):
+def test_bash_command_env(temp_dir, runtime_cls, run_as_openhands):
     runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
     try:
-        # We set env var PS1="\u@\h:\w $"
-        # and construct the PEXCEPT prompt base on it.
-        # When run `env`, bad implementation of CmdRunAction will be pexcepted by this
-        # and failed to pexcept the right content, causing it fail to get error code.
         obs = runtime.run_action(CmdRunAction(command='env'))
-
-        # For example:
-        # 02:16:13 - openhands:DEBUG: client.py:78 - Executing command: env
-        # 02:16:13 - openhands:DEBUG: client.py:82 - Command output: PYTHONUNBUFFERED=1
-        # CONDA_EXE=/openhands/miniforge3/bin/conda
-        # [...]
-        # LC_CTYPE=C.UTF-8
-        # PS1=\u@\h:\w $
-        # 02:16:13 - openhands:DEBUG: client.py:89 - Executing command for exit code: env
-        # 02:16:13 - openhands:DEBUG: client.py:92 - Exit code Output:
-        # CONDA_DEFAULT_ENV=base
-
-        # As long as the exit code is 0, the test will pass.
         assert isinstance(
             obs, CmdOutputObservation
         ), 'The observation should be a CmdOutputObservation.'
@@ -59,62 +41,29 @@ def test_bash_command_pexcept(temp_dir, runtime_cls, run_as_openhands):
         _close_test_runtime(runtime)
 
 
-def test_bash_timeout_and_keyboard_interrupt(temp_dir, runtime_cls, run_as_openhands):
+def test_bash_server(temp_dir, runtime_cls, run_as_openhands):
     runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
     try:
-        action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
+        action = CmdRunAction(command='python3 -m http.server 8080')
         action.timeout = 1
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == -1
+        assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
         assert (
-            '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
-            in obs.content
+            "[The command timed out after 1 seconds. You may wait longer to see additional output by sending empty command '', send other commands to interact with the current process, or send keys to interrupt/kill the command.]"
+            in obs.metadata.suffix
         )
-        assert 'KeyboardInterrupt' in obs.content
 
-        # follow up command should not be affected
-        action = CmdRunAction(command='ls')
-        action.timeout = 1
+        action = CmdRunAction(command='C-c')
+        action.timeout = 30
         obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-        # run it again!
-        action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
-        action.timeout = 1
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert (
-            '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
-            in obs.content
-        )
-        assert 'KeyboardInterrupt' in obs.content
-
-        # things should still work
-        action = CmdRunAction(command='ls')
-        action.timeout = 1
-        obs = runtime.run_action(action)
         assert isinstance(obs, CmdOutputObservation)
         assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
-
-    finally:
-        _close_test_runtime(runtime)
-
-
-def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands):
-    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    try:
-        action = CmdRunAction(command='python3 -m http.server 8080')
-        action.timeout = 1
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 130  # script was killed by SIGINT
-        assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
         assert 'Keyboard interrupt received, exiting.' in obs.content
+        assert '/workspace' in obs.metadata.working_dir
 
         action = CmdRunAction(command='ls')
         action.timeout = 1
@@ -122,7 +71,8 @@ def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands):
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert isinstance(obs, CmdOutputObservation)
         assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
+        assert 'Keyboard interrupt received, exiting.' not in obs.content
+        assert '/workspace' in obs.metadata.working_dir
 
         # run it again!
         action = CmdRunAction(command='python3 -m http.server 8080')
@@ -130,122 +80,8 @@ def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands):
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 130  # script was killed by SIGINT
+        assert obs.exit_code == -1
         assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
-        assert 'Keyboard interrupt received, exiting.' in obs.content
-
-        # things should still work
-        action = CmdRunAction(command='ls')
-        action.timeout = 1
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
-    finally:
-        _close_test_runtime(runtime)
-
-
-def test_process_resistant_to_one_sigint(temp_dir, runtime_cls, run_as_openhands):
-    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    try:
-        # Create a bash script that ignores SIGINT up to 1 times
-        script_content = """
-#!/bin/bash
-trap_count=0
-trap 'echo "Caught SIGINT ($((++trap_count))/1), ignoring..."; [ $trap_count -ge 1 ] && trap - INT && exit' INT
-while true; do
-    echo "Still running..."
-    sleep 1
-done
-        """.strip()
-
-        with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
-            f.write(script_content)
-        os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
-
-        runtime.copy_to(
-            os.path.join(temp_dir, 'resistant_script.sh'),
-            runtime.config.workspace_mount_path_in_sandbox,
-        )
-
-        # Run the resistant script
-        action = CmdRunAction(command='sudo bash ./resistant_script.sh')
-        action.timeout = 5
-        action.blocking = True
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 130  # script was killed by SIGINT
-        assert 'Still running...' in obs.content
-        assert 'Caught SIGINT (1/1), ignoring...' in obs.content
-        assert 'Stopped' not in obs.content
-        assert (
-            '[Command timed out after 5 seconds. SIGINT was sent to interrupt the command.]'
-            in obs.content
-        )
-
-        # Normal command should still work
-        action = CmdRunAction(command='ls')
-        action.timeout = 10
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
-        assert 'resistant_script.sh' in obs.content
-
-    finally:
-        _close_test_runtime(runtime)
-
-
-def test_process_resistant_to_multiple_sigint(temp_dir, runtime_cls, run_as_openhands):
-    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    try:
-        # Create a bash script that ignores SIGINT up to 2 times
-        script_content = """
-#!/bin/bash
-trap_count=0
-trap 'echo "Caught SIGINT ($((++trap_count))/3), ignoring..."; [ $trap_count -ge 3 ] && trap - INT && exit' INT
-while true; do
-    echo "Still running..."
-    sleep 1
-done
-        """.strip()
-
-        with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
-            f.write(script_content)
-        os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
-
-        runtime.copy_to(
-            os.path.join(temp_dir, 'resistant_script.sh'),
-            runtime.config.workspace_mount_path_in_sandbox,
-        )
-
-        # Run the resistant script
-        action = CmdRunAction(command='sudo bash ./resistant_script.sh')
-        action.timeout = 2
-        action.blocking = True
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        assert 'Still running...' in obs.content
-        assert 'Caught SIGINT (1/3), ignoring...' in obs.content
-        assert '[1]+' and 'Stopped' in obs.content
-        assert (
-            '[Command timed out after 2 seconds. SIGINT was sent to interrupt the command, but failed. The command was killed.]'
-            in obs.content
-        )
-
-        # Normal command should still work
-        action = CmdRunAction(command='ls')
-        action.timeout = 10
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
-        assert 'resistant_script.sh' in obs.content
 
     finally:
         _close_test_runtime(runtime)
@@ -262,12 +98,12 @@ def test_multiline_commands(temp_dir, runtime_cls):
         # test multiline echo
         obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"')
         assert obs.exit_code == 0, 'The exit code should be 0.'
-        assert 'hello\r\nworld' in obs.content
+        assert 'hello\nworld' in obs.content
 
         # test whitespace
         obs = _run_cmd_action(runtime, 'echo -e "a\\n\\n\\nz"')
         assert obs.exit_code == 0, 'The exit code should be 0.'
-        assert '\r\n\r\n\r\n' in obs.content
+        assert '\n\n\n' in obs.content
     finally:
         _close_test_runtime(runtime)
 
@@ -276,43 +112,43 @@ def test_multiple_multiline_commands(temp_dir, runtime_cls, run_as_openhands):
     cmds = [
         'ls -l',
         'echo -e "hello\nworld"',
-        """
-echo -e "hello it\\'s me"
-""".strip(),
-        """
-echo \\
+        """echo -e "hello it's me\"""",
+        """echo \\
     -e 'hello' \\
-    -v
-""".strip(),
-        """
-echo -e 'hello\\nworld\\nare\\nyou\\nthere?'
-""".strip(),
-        """
-echo -e 'hello
-world
-are
-you\\n
-there?'
-""".strip(),
-        """
-echo -e 'hello
-world "
-'
-""".strip(),
+    -v""",
+        """echo -e 'hello\\nworld\\nare\\nyou\\nthere?'""",
+        """echo -e 'hello\nworld\nare\nyou\n\nthere?'""",
+        """echo -e 'hello\nworld "'""",
     ]
     joined_cmds = '\n'.join(cmds)
 
     runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
     try:
+        # First test that running multiple commands at once fails
         obs = _run_cmd_action(runtime, joined_cmds)
-        assert obs.exit_code == 0, 'The exit code should be 0.'
-
-        assert 'total 0' in obs.content
-        assert 'hello\r\nworld' in obs.content
-        assert "hello it\\'s me" in obs.content
-        assert 'hello -v' in obs.content
-        assert 'hello\r\nworld\r\nare\r\nyou\r\nthere?' in obs.content
-        assert 'hello\r\nworld\r\nare\r\nyou\r\n\r\nthere?' in obs.content
+        assert isinstance(obs, ErrorObservation)
+        assert 'Cannot execute multiple commands at once' in obs.content
+
+        # Now run each command individually and verify they work
+        results = []
+        for cmd in cmds:
+            obs = _run_cmd_action(runtime, cmd)
+            assert isinstance(obs, CmdOutputObservation)
+            assert obs.exit_code == 0
+            results.append(obs.content)
+
+        # Verify all expected outputs are present
+        assert 'total 0' in results[0]  # ls -l
+        assert 'hello\nworld' in results[1]  # echo -e "hello\nworld"
+        assert "hello it's me" in results[2]  # echo -e "hello it\'s me"
+        assert 'hello -v' in results[3]  # echo -e 'hello' -v
+        assert (
+            'hello\nworld\nare\nyou\nthere?' in results[4]
+        )  # echo -e 'hello\nworld\nare\nyou\nthere?'
+        assert (
+            'hello\nworld\nare\nyou\n\nthere?' in results[5]
+        )  # echo -e with literal newlines
+        assert 'hello\nworld "' in results[6]  # echo -e with quote
     finally:
         _close_test_runtime(runtime)
 
@@ -324,7 +160,7 @@ def test_no_ps2_in_output(temp_dir, runtime_cls, run_as_openhands):
         obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"')
         assert obs.exit_code == 0, 'The exit code should be 0.'
 
-        assert 'hello\r\nworld' in obs.content
+        assert 'hello\nworld' in obs.content
         assert '>' not in obs.content
     finally:
         _close_test_runtime(runtime)
@@ -332,21 +168,17 @@ def test_no_ps2_in_output(temp_dir, runtime_cls, run_as_openhands):
 
 def test_multiline_command_loop(temp_dir, runtime_cls):
     # https://github.com/All-Hands-AI/OpenHands/issues/3143
-    init_cmd = """
-mkdir -p _modules && \
+    init_cmd = """mkdir -p _modules && \
 for month in {01..04}; do
     for day in {01..05}; do
         touch "_modules/2024-${month}-${day}-sample.md"
     done
-done
-echo "created files"
+done && echo "created files"
 """
-    follow_up_cmd = """
-for file in _modules/*.md; do
+    follow_up_cmd = """for file in _modules/*.md; do
     new_date=$(echo $file | sed -E 's/2024-(01|02|03|04)-/2024-/;s/2024-01/2024-08/;s/2024-02/2024-09/;s/2024-03/2024-10/;s/2024-04/2024-11/')
     mv "$file" "$new_date"
-done
-echo "success"
+done && echo "success"
 """
     runtime = _load_runtime(temp_dir, runtime_cls)
     try:
@@ -424,7 +256,6 @@ def test_multi_cmd_run_in_single_line(temp_dir, runtime_cls):
 
 def test_stateful_cmd(temp_dir, runtime_cls):
     runtime = _load_runtime(temp_dir, runtime_cls)
-    sandbox_dir = _get_sandbox_folder(runtime)
     try:
         obs = _run_cmd_action(runtime, 'mkdir -p test')
         assert obs.exit_code == 0, 'The exit code should be 0.'
@@ -434,7 +265,7 @@ def test_stateful_cmd(temp_dir, runtime_cls):
 
         obs = _run_cmd_action(runtime, 'pwd')
         assert obs.exit_code == 0, 'The exit code should be 0.'
-        assert f'{sandbox_dir}/test' in obs.content
+        assert '/workspace/test' in obs.content
     finally:
         _close_test_runtime(runtime)
 
@@ -532,7 +363,7 @@ def test_copy_to_non_existent_directory(temp_dir, runtime_cls):
 def test_overwrite_existing_file(temp_dir, runtime_cls):
     runtime = _load_runtime(temp_dir, runtime_cls)
     try:
-        sandbox_dir = _get_sandbox_folder(runtime)
+        sandbox_dir = '/openhands/workspace'
 
         obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
         assert obs.exit_code == 0
@@ -595,38 +426,13 @@ def test_copy_from_directory(temp_dir, runtime_cls):
         _close_test_runtime(runtime)
 
 
-def test_keep_prompt(runtime_cls, temp_dir):
-    runtime = _load_runtime(
-        temp_dir,
-        runtime_cls=runtime_cls,
-        run_as_openhands=False,
-    )
-    try:
-        sandbox_dir = _get_sandbox_folder(runtime)
-
-        obs = _run_cmd_action(runtime, f'touch {sandbox_dir}/test_file.txt')
-        assert obs.exit_code == 0
-        assert 'root@' in obs.interpreter_details
-
-        obs = _run_cmd_action(
-            runtime, f'cat {sandbox_dir}/test_file.txt', keep_prompt=False
-        )
-        assert obs.exit_code == 0
-        assert 'root@' not in obs.interpreter_details
-    finally:
-        _close_test_runtime(runtime)
-
-
-@pytest.mark.skipif(
-    TEST_IN_CI != 'True',
-    reason='This test is not working in WSL (file ownership)',
-)
 def test_git_operation(runtime_cls):
     # do not mount workspace, since workspace mount by tests will be owned by root
     # while the user_id we get via os.getuid() is different from root
     # which causes permission issues
     runtime = _load_runtime(
         temp_dir=None,
+        use_workspace=False,
         runtime_cls=runtime_cls,
         # Need to use non-root user to expose issues
         run_as_openhands=True,
@@ -634,12 +440,15 @@ def test_git_operation(runtime_cls):
     # this will happen if permission of runtime is not properly configured
     # fatal: detected dubious ownership in repository at '/workspace'
     try:
+        obs = _run_cmd_action(runtime, 'sudo chown -R openhands:root .')
+        assert obs.exit_code == 0
+
         # check the ownership of the current directory
         obs = _run_cmd_action(runtime, 'ls -alh .')
         assert obs.exit_code == 0
         # drwx--S--- 2 openhands root   64 Aug  7 23:32 .
         # drwxr-xr-x 1 root      root 4.0K Aug  7 23:33 ..
-        for line in obs.content.split('\r\n'):
+        for line in obs.content.split('\n'):
             if ' ..' in line:
                 # parent directory should be owned by root
                 assert 'root' in line
@@ -663,8 +472,10 @@ def test_git_operation(runtime_cls):
         assert obs.exit_code == 0
 
         # git diff
-        obs = _run_cmd_action(runtime, 'git diff')
+        obs = _run_cmd_action(runtime, 'git diff --no-color --cached')
         assert obs.exit_code == 0
+        assert 'b/test_file.txt' in obs.content
+        assert '+hello' in obs.content
 
         # git commit
         obs = _run_cmd_action(runtime, 'git commit -m "test commit"')
@@ -685,3 +496,276 @@ def test_python_version(temp_dir, runtime_cls, run_as_openhands):
         assert 'Python 3' in obs.content, 'The output should contain "Python 3".'
     finally:
         _close_test_runtime(runtime)
+
+
+def test_pwd_property(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Create a subdirectory and verify pwd updates
+        obs = _run_cmd_action(runtime, 'mkdir -p random_dir')
+        assert obs.exit_code == 0
+
+        obs = _run_cmd_action(runtime, 'cd random_dir && pwd')
+        assert obs.exit_code == 0
+        assert 'random_dir' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_basic_command(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test simple command
+        obs = _run_cmd_action(runtime, "echo 'hello world'")
+        assert 'hello world' in obs.content
+        assert obs.exit_code == 0
+
+        # Test command with error
+        obs = _run_cmd_action(runtime, 'nonexistent_command')
+        assert obs.exit_code == 127
+        assert 'nonexistent_command: command not found' in obs.content
+
+        # Test command with special characters
+        obs = _run_cmd_action(runtime, "echo 'hello   world    with\nspecial  chars'")
+        assert 'hello   world    with\nspecial  chars' in obs.content
+        assert obs.exit_code == 0
+
+        # Test multiple commands in sequence
+        obs = _run_cmd_action(runtime, 'echo "first" && echo "second" && echo "third"')
+        assert 'first' in obs.content
+        assert 'second' in obs.content
+        assert 'third' in obs.content
+        assert obs.exit_code == 0
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_interactive_command(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test interactive command
+        action = CmdRunAction('read -p "Enter name: " name && echo "Hello $name"')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        # assert 'Enter name:' in obs.content # FIXME: this is not working
+        assert '[The command timed out after 1 seconds.' in obs.metadata.suffix
+
+        action = CmdRunAction('John')
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Hello John' in obs.content
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+
+        # Test multiline command input with here document
+        action = CmdRunAction("""cat << EOF
+line 1
+line 2
+EOF""")
+        obs = runtime.run_action(action)
+        assert 'line 1\nline 2' in obs.content
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+        assert obs.exit_code == 0
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_long_output(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Generate a long output
+        action = CmdRunAction('for i in $(seq 1 5000); do echo "Line $i"; done')
+        action.timeout = 10
+        obs = runtime.run_action(action)
+        assert obs.exit_code == 0
+        assert 'Line 1' in obs.content
+        assert 'Line 5000' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_long_output_exceed_history_limit(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Generate a long output
+        action = CmdRunAction('for i in $(seq 1 50000); do echo "Line $i"; done')
+        action.timeout = 30
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+        assert 'Previous command outputs are truncated' in obs.metadata.prefix
+        assert 'Line 40000' in obs.content
+        assert 'Line 50000' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_long_output_from_nested_directories(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Create nested directories with many files
+        setup_cmd = 'mkdir -p /tmp/test_dir && cd /tmp/test_dir && for i in $(seq 1 100); do mkdir -p "folder_$i"; for j in $(seq 1 100); do touch "folder_$i/file_$j.txt"; done; done'
+        setup_action = CmdRunAction(setup_cmd.strip())
+        setup_action.timeout = 60
+        obs = runtime.run_action(setup_action)
+        assert obs.exit_code == 0
+
+        # List the directory structure recursively
+        action = CmdRunAction('ls -R /tmp/test_dir')
+        action.timeout = 60
+        obs = runtime.run_action(action)
+        assert obs.exit_code == 0
+
+        # Verify output contains expected files
+        assert 'folder_1' in obs.content
+        assert 'file_1.txt' in obs.content
+        assert 'folder_100' in obs.content
+        assert 'file_100.txt' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_command_backslash(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Create a file with the content "implemented_function"
+        action = CmdRunAction(
+            'mkdir -p /tmp/test_dir && echo "implemented_function" > /tmp/test_dir/file_1.txt'
+        )
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+
+        # Reproduce an issue we ran into during evaluation
+        # find /workspace/sympy__sympy__1.0 -type f -exec grep -l "implemented_function" {} \;
+        # find: missing argument to `-exec'
+        # --> This is unexpected output due to incorrect escaping of \;
+        # This tests for correct escaping of \;
+        action = CmdRunAction(
+            'find /tmp/test_dir -type f -exec grep -l "implemented_function" {} \\;'
+        )
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+        assert '/tmp/test_dir/file_1.txt' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_command_output_continuation(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Start a command that produces output slowly
+        action = CmdRunAction('for i in {1..5}; do echo $i; sleep 3; done')
+        action.timeout = 2.5  # Set timeout to 2.5 seconds
+        obs = runtime.run_action(action)
+        assert obs.content.strip() == '1'
+        assert obs.metadata.prefix == ''
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+
+        # Continue watching output
+        action = CmdRunAction('')
+        action.timeout = 2.5
+        obs = runtime.run_action(action)
+        assert '[Command output continued from previous command]' in obs.metadata.prefix
+        assert obs.content.strip() == '2'
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+
+        # Continue until completion
+        for expected in ['3', '4', '5']:
+            action = CmdRunAction('')
+            action.timeout = 2.5
+            obs = runtime.run_action(action)
+            assert (
+                '[Command output continued from previous command]'
+                in obs.metadata.prefix
+            )
+            assert obs.content.strip() == expected
+            assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+
+        # Final empty command to complete
+        action = CmdRunAction('')
+        obs = runtime.run_action(action)
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_long_running_command_follow_by_execute(
+    temp_dir, runtime_cls, run_as_openhands
+):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test command that produces output slowly
+        action = CmdRunAction('for i in {1..3}; do echo $i; sleep 3; done')
+        action.timeout = 2.5
+        action.blocking = False
+        obs = runtime.run_action(action)
+        assert '1' in obs.content  # First number should appear before timeout
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+        assert obs.metadata.prefix == ''
+
+        # Continue watching output
+        action = CmdRunAction('')
+        action.timeout = 2.5
+        obs = runtime.run_action(action)
+        assert '2' in obs.content
+        assert (
+            obs.metadata.prefix == '[Command output continued from previous command]\n'
+        )
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+
+        # Test command that produces no output
+        action = CmdRunAction('sleep 15')
+        action.timeout = 2.5
+        obs = runtime.run_action(action)
+        assert '3' in obs.content
+        assert (
+            obs.metadata.prefix == '[Command output continued from previous command]\n'
+        )
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_empty_command_errors(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test empty command without previous command
+        obs = runtime.run_action(CmdRunAction(''))
+        assert isinstance(obs, CmdOutputObservation)
+        assert 'ERROR: No previous command to continue from' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_python_interactive_input(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test Python program that asks for input - properly escaped for bash
+        python_script = """name = input('Enter your name: '); age = input('Enter your age: '); print(f'Hello {name}, you are {age} years old')"""
+
+        # Start Python with the interactive script
+        obs = runtime.run_action(CmdRunAction(f'python3 -c "{python_script}"'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Enter your name:' in obs.content
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+
+        # Send first input (name)
+        obs = runtime.run_action(CmdRunAction('Alice'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Enter your age:' in obs.content
+        assert obs.metadata.exit_code == -1
+
+        # Send second input (age)
+        obs = runtime.run_action(CmdRunAction('25'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Hello Alice, you are 25 years old' in obs.content
+        assert obs.metadata.exit_code == 0
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+    finally:
+        _close_test_runtime(runtime)
diff --git a/tests/runtime/test_ipython.py b/tests/runtime/test_ipython.py
index caee30001e81..8b5c89053b52 100644
--- a/tests/runtime/test_ipython.py
+++ b/tests/runtime/test_ipython.py
@@ -4,7 +4,6 @@
 from conftest import (
     TEST_IN_CI,
     _close_test_runtime,
-    _get_sandbox_folder,
     _load_runtime,
 )
 
@@ -33,8 +32,6 @@
 def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):
     runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
 
-    sandbox_dir = _get_sandbox_folder(runtime)
-
     # Test run command
     action_cmd = CmdRunAction(command='ls -l')
     logger.info(action_cmd, extra={'msg_type': 'ACTION'})
@@ -55,7 +52,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.content.strip() == (
         'Hello, `World`!\n'
-        f'[Jupyter current working directory: {sandbox_dir}]\n'
+        '[Jupyter current working directory: /openhands/workspace]\n'
         '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python]'
     )
 
@@ -76,7 +73,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):
 
     assert obs.content == ''
     # event stream runtime will always use absolute path
-    assert obs.path == f'{sandbox_dir}/hello.sh'
+    assert obs.path == '/openhands/workspace/hello.sh'
 
     # Test read file (file should exist)
     action_read = FileReadAction(path='hello.sh')
@@ -88,7 +85,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
 
     assert obs.content == 'echo "Hello, World!"\n'
-    assert obs.path == f'{sandbox_dir}/hello.sh'
+    assert obs.path == '/openhands/workspace/hello.sh'
 
     # clean up
     action = CmdRunAction(command='rm -rf hello.sh')
@@ -178,7 +175,6 @@ def test_ipython_multi_user(temp_dir, runtime_cls, run_as_openhands):
 
 def test_ipython_simple(temp_dir, runtime_cls):
     runtime = _load_runtime(temp_dir, runtime_cls)
-    sandbox_dir = _get_sandbox_folder(runtime)
 
     # Test run ipython
     # get username
@@ -192,7 +188,7 @@ def test_ipython_simple(temp_dir, runtime_cls):
         obs.content.strip()
         == (
             '1\n'
-            f'[Jupyter current working directory: {sandbox_dir}]\n'
+            '[Jupyter current working directory: /openhands/workspace]\n'
             '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python]'
         ).strip()
     )
@@ -203,7 +199,6 @@ def test_ipython_simple(temp_dir, runtime_cls):
 def test_ipython_package_install(temp_dir, runtime_cls, run_as_openhands):
     """Make sure that cd in bash also update the current working directory in ipython."""
     runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    sandbox_dir = _get_sandbox_folder(runtime)
 
     # It should error out since pymsgbox is not installed
     action = IPythonRunCellAction(code='import pymsgbox')
@@ -229,7 +224,7 @@ def test_ipython_package_install(temp_dir, runtime_cls, run_as_openhands):
     # import should not error out
     assert obs.content.strip() == (
         '[Code executed successfully with no output]\n'
-        f'[Jupyter current working directory: {sandbox_dir}]\n'
+        '[Jupyter current working directory: /openhands/workspace]\n'
         '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python]'
     )
 
@@ -239,7 +234,6 @@ def test_ipython_package_install(temp_dir, runtime_cls, run_as_openhands):
 def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls):
     """Test file editor permission behavior when running as different users."""
     runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands=True)
-    sandbox_dir = _get_sandbox_folder(runtime)
 
     # Create a file owned by root with restricted permissions
     action = CmdRunAction(
@@ -277,18 +271,18 @@ def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls):
     assert 'Permission denied' in obs.content
 
     # Try to use file editor in openhands sandbox directory - should work
-    test_code = f"""
+    test_code = """
 # Create file
-print(file_editor(command='create', path='{sandbox_dir}/test.txt', file_text='Line 1\\nLine 2\\nLine 3'))
+print(file_editor(command='create', path='/openhands/workspace/test.txt', file_text='Line 1\\nLine 2\\nLine 3'))
 
 # View file
-print(file_editor(command='view', path='{sandbox_dir}/test.txt'))
+print(file_editor(command='view', path='/openhands/workspace/test.txt'))
 
 # Edit file
-print(file_editor(command='str_replace', path='{sandbox_dir}/test.txt', old_str='Line 2', new_str='New Line 2'))
+print(file_editor(command='str_replace', path='/openhands/workspace/test.txt', old_str='Line 2', new_str='New Line 2'))
 
 # Undo edit
-print(file_editor(command='undo_edit', path='{sandbox_dir}/test.txt'))
+print(file_editor(command='undo_edit', path='/openhands/workspace/test.txt'))
 """
     action = IPythonRunCellAction(code=test_code)
     logger.info(action, extra={'msg_type': 'ACTION'})
@@ -303,7 +297,7 @@ def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls):
     assert 'undone successfully' in obs.content
 
     # Clean up
-    action = CmdRunAction(command=f'rm -f {sandbox_dir}/test.txt')
+    action = CmdRunAction(command='rm -f /openhands/workspace/test.txt')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -318,9 +312,9 @@ def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls):
     _close_test_runtime(runtime)
 
 
-def test_file_read_and_edit_via_oh_aci(temp_dir, runtime_cls, run_as_openhands):
-    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    sandbox_dir = _get_sandbox_folder(runtime)
+def test_file_read_and_edit_via_oh_aci(runtime_cls, run_as_openhands):
+    runtime = _load_runtime(None, runtime_cls, run_as_openhands)
+    sandbox_dir = '/openhands/workspace'
 
     actions = [
         {
diff --git a/tests/unit/resolver/test_resolve_issues.py b/tests/unit/resolver/test_resolve_issues.py
index 9f5c8dc1f891..fcc12f1d0698 100644
--- a/tests/unit/resolver/test_resolve_issues.py
+++ b/tests/unit/resolver/test_resolve_issues.py
@@ -6,7 +6,11 @@
 
 from openhands.core.config import LLMConfig
 from openhands.events.action import CmdRunAction
-from openhands.events.observation import CmdOutputObservation, NullObservation
+from openhands.events.observation import (
+    CmdOutputMetadata,
+    CmdOutputObservation,
+    NullObservation,
+)
 from openhands.llm.llm import LLM
 from openhands.resolver.github_issue import GithubIssue, ReviewThread
 from openhands.resolver.issue_definitions import IssueHandler, PRHandler
@@ -55,23 +59,20 @@ def mock_followup_prompt_template():
     return 'Issue context: {{ issues }}\n\nReview comments: {{ review_comments }}\n\nReview threads: {{ review_threads }}\n\nFiles: {{ files }}\n\nThread comments: {{ thread_context }}\n\nPlease fix this issue.'
 
 
-def create_cmd_output(exit_code: int, content: str, command_id: int, command: str):
+def create_cmd_output(exit_code: int, content: str, command: str):
     return CmdOutputObservation(
-        exit_code=exit_code, content=content, command_id=command_id, command=command
+        content=content,
+        command=command,
+        metadata=CmdOutputMetadata(exit_code=exit_code),
     )
 
 
 def test_initialize_runtime():
     mock_runtime = MagicMock()
     mock_runtime.run_action.side_effect = [
+        create_cmd_output(exit_code=0, content='', command='cd /workspace'),
         create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
-        ),
-        create_cmd_output(
-            exit_code=0,
-            content='',
-            command_id=2,
-            command='git config --global core.pager ""',
+            exit_code=0, content='', command='git config --global core.pager ""'
         ),
     ]
 
@@ -291,30 +292,19 @@ def get_mock_response(url, *args, **kwargs):
 async def test_complete_runtime():
     mock_runtime = MagicMock()
     mock_runtime.run_action.side_effect = [
+        create_cmd_output(exit_code=0, content='', command='cd /workspace'),
         create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
+            exit_code=0, content='', command='git config --global core.pager ""'
         ),
         create_cmd_output(
             exit_code=0,
             content='',
-            command_id=2,
-            command='git config --global core.pager ""',
-        ),
-        create_cmd_output(
-            exit_code=0,
-            content='',
-            command_id=3,
             command='git config --global --add safe.directory /workspace',
         ),
         create_cmd_output(
-            exit_code=0,
-            content='',
-            command_id=4,
-            command='git diff base_commit_hash fix',
-        ),
-        create_cmd_output(
-            exit_code=0, content='git diff content', command_id=5, command='git apply'
+            exit_code=0, content='', command='git diff base_commit_hash fix'
         ),
+        create_cmd_output(exit_code=0, content='git diff content', command='git apply'),
     ]
 
     result = await complete_runtime(mock_runtime, 'base_commit_hash')
@@ -614,11 +604,7 @@ def test_guess_success():
         title='Test Issue',
         body='This is a test issue',
     )
-    mock_history = [
-        create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
-        )
-    ]
+    mock_history = [create_cmd_output(exit_code=0, content='', command='cd /workspace')]
     mock_llm_config = LLMConfig(model='test_model', api_key='test_api_key')
 
     mock_completion_response = MagicMock()
@@ -758,11 +744,7 @@ def test_guess_success_negative_case():
         title='Test Issue',
         body='This is a test issue',
     )
-    mock_history = [
-        create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
-        )
-    ]
+    mock_history = [create_cmd_output(exit_code=0, content='', command='cd /workspace')]
     mock_llm_config = LLMConfig(model='test_model', api_key='test_api_key')
 
     mock_completion_response = MagicMock()
@@ -795,11 +777,7 @@ def test_guess_success_invalid_output():
         title='Test Issue',
         body='This is a test issue',
     )
-    mock_history = [
-        create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
-        )
-    ]
+    mock_history = [create_cmd_output(exit_code=0, content='', command='cd /workspace')]
     mock_llm_config = LLMConfig(model='test_model', api_key='test_api_key')
 
     mock_completion_response = MagicMock()
diff --git a/tests/unit/test_action_serialization.py b/tests/unit/test_action_serialization.py
index 318dd612a2d7..374d5eee0100 100644
--- a/tests/unit/test_action_serialization.py
+++ b/tests/unit/test_action_serialization.py
@@ -41,11 +41,10 @@ def serialization_deserialization(
     serialized_action_memory = event_to_memory(action_instance, max_message_chars)
     original_memory_dict = original_action_dict.copy()
 
-    # we don't send backend properties like id or 'keep_prompt'
+    # we don't send backend properties like id
     original_memory_dict.pop('id', None)
     original_memory_dict.pop('timestamp', None)
     if 'args' in original_memory_dict:
-        original_memory_dict['args'].pop('keep_prompt', None)
         original_memory_dict['args'].pop('blocking', None)
         original_memory_dict['args'].pop('confirmation_state', None)
 
@@ -99,7 +98,6 @@ def test_cmd_run_action_serialization_deserialization():
             'blocking': False,
             'command': 'echo "Hello world"',
             'thought': '',
-            'keep_prompt': True,
             'hidden': False,
             'confirmation_state': ActionConfirmationStatus.CONFIRMED,
         },
@@ -154,3 +152,32 @@ def test_file_write_action_serialization_deserialization():
         },
     }
     serialization_deserialization(original_action_dict, FileWriteAction)
+
+
+def test_legacy_serialization():
+    original_action_dict = {
+        'action': 'run',
+        'args': {
+            'blocking': False,
+            'command': 'echo "Hello world"',
+            'thought': '',
+            'hidden': False,
+            'confirmation_state': ActionConfirmationStatus.CONFIRMED,
+            'keep_prompt': False,  # will be treated as no-op
+        },
+    }
+    event = event_from_dict(original_action_dict)
+    assert isinstance(event, Action)
+    assert isinstance(event, CmdRunAction)
+    assert event.command == 'echo "Hello world"'
+    assert event.hidden is False
+    assert not hasattr(event, 'keep_prompt')
+
+    event_dict = event_to_dict(event)
+    assert 'keep_prompt' not in event_dict['args']
+    assert (
+        event_dict['args']['confirmation_state'] == ActionConfirmationStatus.CONFIRMED
+    )
+    assert event_dict['args']['blocking'] is False
+    assert event_dict['args']['command'] == 'echo "Hello world"'
+    assert event_dict['args']['thought'] == ''
diff --git a/tests/unit/test_bash_parsing.py b/tests/unit/test_bash_parsing.py
index 4ba75028bba3..38ee0f221c42 100644
--- a/tests/unit/test_bash_parsing.py
+++ b/tests/unit/test_bash_parsing.py
@@ -1,6 +1,6 @@
 import pytest
 
-from openhands.runtime.utils.bash import split_bash_commands
+from openhands.runtime.utils.bash import escape_bash_special_chars, split_bash_commands
 
 
 def test_split_commands_util():
@@ -257,3 +257,172 @@ def test_split_commands_with_invalid_input():
     for input_command in invalid_inputs:
         # it will fall back to return the original input
         assert split_bash_commands(input_command) == [input_command]
+
+
+def test_escape_bash_special_chars():
+    test_cases = [
+        # Basic cases - use raw strings (r'') to avoid Python escape sequence warnings
+        ('echo test \\; ls', 'echo test \\\\; ls'),
+        ('grep pattern \\| sort', 'grep pattern \\\\| sort'),
+        ('cmd1 \\&\\& cmd2', 'cmd1 \\\\&\\\\& cmd2'),
+        ('cat file \\> output.txt', 'cat file \\\\> output.txt'),
+        ('cat \\< input.txt', 'cat \\\\< input.txt'),
+        # Quoted strings should remain unchanged
+        ('echo "test \\; unchanged"', 'echo "test \\; unchanged"'),
+        ("echo 'test \\| unchanged'", "echo 'test \\| unchanged'"),
+        # Mixed quoted and unquoted
+        (
+            'echo "quoted \\;" \\; "more" \\| grep',
+            'echo "quoted \\;" \\\\; "more" \\\\| grep',
+        ),
+        # Multiple escapes in sequence
+        ('cmd1 \\;\\|\\& cmd2', 'cmd1 \\\\;\\\\|\\\\& cmd2'),
+        # Commands with other backslashes
+        ('echo test\\ntest', 'echo test\\ntest'),
+        ('echo "test\\ntest"', 'echo "test\\ntest"'),
+        # Edge cases
+        ('', ''),  # Empty string
+        ('\\\\', '\\\\'),  # Double backslash
+        ('\\"', '\\"'),  # Escaped quote
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
+
+
+def test_escape_bash_special_chars_with_invalid_syntax():
+    invalid_inputs = [
+        'echo "unclosed quote',
+        "echo 'unclosed quote",
+        'cat <<EOF\nunclosed heredoc',
+    ]
+    for input_cmd in invalid_inputs:
+        # Should return original input when parsing fails
+        result = escape_bash_special_chars(input_cmd)
+        assert result == input_cmd, f'Failed to handle invalid input: {input_cmd}'
+
+
+def test_escape_bash_special_chars_with_heredoc():
+    input_cmd = r"""cat <<EOF
+line1 \; not escaped
+line2 \| not escaped
+EOF"""
+    # Heredoc content should not be escaped
+    expected = input_cmd
+    result = escape_bash_special_chars(input_cmd)
+    assert (
+        result == expected
+    ), f'Failed to handle heredoc correctly\nExpected: {expected}\nGot: {result}'
+
+
+def test_escape_bash_special_chars_with_parameter_expansion():
+    test_cases = [
+        # Parameter expansion should be preserved
+        ('echo $HOME', 'echo $HOME'),
+        ('echo ${HOME}', 'echo ${HOME}'),
+        ('echo ${HOME:-default}', 'echo ${HOME:-default}'),
+        # Mixed with special chars
+        ('echo $HOME \\; ls', 'echo $HOME \\\\; ls'),
+        ('echo ${PATH} \\| grep bin', 'echo ${PATH} \\\\| grep bin'),
+        # Quoted parameter expansion
+        ('echo "$HOME"', 'echo "$HOME"'),
+        ('echo "${HOME}"', 'echo "${HOME}"'),
+        # Complex parameter expansions
+        ('echo ${var:=default} \\; ls', 'echo ${var:=default} \\\\; ls'),
+        ('echo ${!prefix*} \\| sort', 'echo ${!prefix*} \\\\| sort'),
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
+
+
+def test_escape_bash_special_chars_with_command_substitution():
+    test_cases = [
+        # Basic command substitution
+        ('echo $(pwd)', 'echo $(pwd)'),
+        ('echo `pwd`', 'echo `pwd`'),
+        # Mixed with special chars
+        ('echo $(pwd) \\; ls', 'echo $(pwd) \\\\; ls'),
+        ('echo `pwd` \\| grep home', 'echo `pwd` \\\\| grep home'),
+        # Nested command substitution
+        ('echo $(echo `pwd`)', 'echo $(echo `pwd`)'),
+        # Complex command substitution
+        ('echo $(find . -name "*.txt" \\; ls)', 'echo $(find . -name "*.txt" \\; ls)'),
+        # Mixed with quotes
+        ('echo "$(pwd)"', 'echo "$(pwd)"'),
+        ('echo "`pwd`"', 'echo "`pwd`"'),
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
+
+
+def test_escape_bash_special_chars_mixed_nodes():
+    test_cases = [
+        # Mix of parameter expansion and command substitution
+        ('echo $HOME/$(pwd)', 'echo $HOME/$(pwd)'),
+        # Mix with special chars
+        ('echo $HOME/$(pwd) \\; ls', 'echo $HOME/$(pwd) \\\\; ls'),
+        # Complex mixed cases
+        (
+            'echo "${HOME}/$(basename `pwd`) \\; next"',
+            'echo "${HOME}/$(basename `pwd`) \\; next"',
+        ),
+        (
+            'VAR=${HOME} \\; echo $(pwd)',
+            'VAR=${HOME} \\\\; echo $(pwd)',
+        ),
+        # Real-world examples
+        (
+            'find . -name "*.txt" -exec grep "${PATTERN:-default}" {} \\;',
+            'find . -name "*.txt" -exec grep "${PATTERN:-default}" {} \\\\;',
+        ),
+        (
+            'echo "Current path: ${PWD}/$(basename `pwd`)" \\| grep home',
+            'echo "Current path: ${PWD}/$(basename `pwd`)" \\\\| grep home',
+        ),
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
+
+
+def test_escape_bash_special_chars_with_chained_commands():
+    test_cases = [
+        # Basic chained commands
+        ('ls && pwd', 'ls && pwd'),
+        ('echo "hello" && ls', 'echo "hello" && ls'),
+        # Chained commands with special chars
+        ('ls \\; pwd && echo test', 'ls \\\\; pwd && echo test'),
+        ('echo test && grep pattern \\| sort', 'echo test && grep pattern \\\\| sort'),
+        # Complex chained cases
+        ('echo ${HOME} && ls \\; pwd', 'echo ${HOME} && ls \\\\; pwd'),
+        (
+            'echo "$(pwd)" && cat file \\> out.txt',
+            'echo "$(pwd)" && cat file \\\\> out.txt',
+        ),
+        # Multiple chains
+        ('cmd1 && cmd2 && cmd3', 'cmd1 && cmd2 && cmd3'),
+        (
+            'cmd1 \\; ls && cmd2 \\| grep && cmd3',
+            'cmd1 \\\\; ls && cmd2 \\\\| grep && cmd3',
+        ),
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
diff --git a/tests/unit/test_bash_ps1_metadata.py b/tests/unit/test_bash_ps1_metadata.py
new file mode 100644
index 000000000000..023e3c2af101
--- /dev/null
+++ b/tests/unit/test_bash_ps1_metadata.py
@@ -0,0 +1,339 @@
+import json
+
+from openhands.events.observation.commands import (
+    CMD_OUTPUT_METADATA_PS1_REGEX,
+    CMD_OUTPUT_PS1_BEGIN,
+    CMD_OUTPUT_PS1_END,
+    CmdOutputMetadata,
+    CmdOutputObservation,
+)
+
+
+def test_ps1_metadata_format():
+    """Test that PS1 prompt has correct format markers"""
+    prompt = CmdOutputMetadata.to_ps1_prompt()
+    print(prompt)
+    assert prompt.startswith('\n###PS1JSON###\n')
+    assert prompt.endswith('\n###PS1END###\n')
+    assert r'\"exit_code\"' in prompt, 'PS1 prompt should contain escaped double quotes'
+
+
+def test_ps1_metadata_json_structure():
+    """Test that PS1 prompt contains valid JSON with expected fields"""
+    prompt = CmdOutputMetadata.to_ps1_prompt()
+    # Extract JSON content between markers
+    json_str = prompt.replace('###PS1JSON###\n', '').replace('\n###PS1END###\n', '')
+    # Remove escaping before parsing
+    json_str = json_str.replace(r'\"', '"')
+    # Remove any trailing content after the JSON
+    json_str = json_str.split('###PS1END###')[0].strip()
+    data = json.loads(json_str)
+
+    # Check required fields
+    expected_fields = {
+        'pid',
+        'exit_code',
+        'username',
+        'hostname',
+        'working_dir',
+        'py_interpreter_path',
+    }
+    assert set(data.keys()) == expected_fields
+
+
+def test_ps1_metadata_parsing():
+    """Test parsing PS1 output into CmdOutputMetadata"""
+    test_data = {
+        'exit_code': 0,
+        'username': 'testuser',
+        'hostname': 'localhost',
+        'working_dir': '/home/testuser',
+        'py_interpreter_path': '/usr/bin/python',
+    }
+
+    ps1_str = f"""###PS1JSON###
+{json.dumps(test_data, indent=2)}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == test_data['exit_code']
+    assert metadata.username == test_data['username']
+    assert metadata.hostname == test_data['hostname']
+    assert metadata.working_dir == test_data['working_dir']
+    assert metadata.py_interpreter_path == test_data['py_interpreter_path']
+
+
+def test_ps1_metadata_parsing_string():
+    """Test parsing PS1 output into CmdOutputMetadata"""
+    ps1_str = r"""###PS1JSON###
+{
+  "exit_code": "0",
+  "username": "myname",
+  "hostname": "myhostname",
+  "working_dir": "~/mydir",
+  "py_interpreter_path": "/my/python/path"
+}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.username == 'myname'
+    assert metadata.hostname == 'myhostname'
+    assert metadata.working_dir == '~/mydir'
+    assert metadata.py_interpreter_path == '/my/python/path'
+
+
+def test_ps1_metadata_parsing_string_real_example():
+    """Test parsing PS1 output into CmdOutputMetadata"""
+    ps1_str = r"""
+###PS1JSON###
+{
+  "pid": "",
+  "exit_code": "0",
+  "username": "runner",
+  "hostname": "fv-az1055-610",
+  "working_dir": "/home/runner/work/OpenHands/OpenHands",
+  "py_interpreter_path": "/home/runner/.cache/pypoetry/virtualenvs/openhands-ai-ULPBlkAi-py3.12/bin/python"
+}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.username == 'runner'
+    assert metadata.hostname == 'fv-az1055-610'
+    assert metadata.working_dir == '/home/runner/work/OpenHands/OpenHands'
+    assert (
+        metadata.py_interpreter_path
+        == '/home/runner/.cache/pypoetry/virtualenvs/openhands-ai-ULPBlkAi-py3.12/bin/python'
+    )
+
+
+def test_ps1_metadata_parsing_additional_prefix():
+    """Test parsing PS1 output into CmdOutputMetadata"""
+    test_data = {
+        'exit_code': 0,
+        'username': 'testuser',
+        'hostname': 'localhost',
+        'working_dir': '/home/testuser',
+        'py_interpreter_path': '/usr/bin/python',
+    }
+
+    ps1_str = f"""
+This is something that not part of the PS1 prompt
+
+###PS1JSON###
+{json.dumps(test_data, indent=2)}
+###PS1END###
+"""
+
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == test_data['exit_code']
+    assert metadata.username == test_data['username']
+    assert metadata.hostname == test_data['hostname']
+    assert metadata.working_dir == test_data['working_dir']
+    assert metadata.py_interpreter_path == test_data['py_interpreter_path']
+
+
+def test_ps1_metadata_parsing_invalid():
+    """Test parsing invalid PS1 output returns default metadata"""
+    # Test with invalid JSON
+    invalid_json = """###PS1JSON###
+    {invalid json}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(invalid_json)
+    assert len(matches) == 0  # No matches should be found for invalid JSON
+
+    # Test with missing markers
+    invalid_format = """NOT A VALID PS1 PROMPT"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(invalid_format)
+    assert len(matches) == 0
+
+    # Test with empty PS1 metadata
+    empty_metadata = """###PS1JSON###
+
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(empty_metadata)
+    assert len(matches) == 0  # No matches should be found for empty metadata
+
+    # Test with whitespace in PS1 metadata
+    whitespace_metadata = """###PS1JSON###
+
+    {
+        "exit_code": "0",
+        "pid": "123",
+        "username": "test",
+        "hostname": "localhost",
+        "working_dir": "/home/test",
+        "py_interpreter_path": "/usr/bin/python"
+    }
+
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(whitespace_metadata)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.pid == 123
+
+
+def test_ps1_metadata_missing_fields():
+    """Test handling of missing fields in PS1 metadata"""
+    # Test with only required fields
+    minimal_data = {'exit_code': 0, 'pid': 123}
+    ps1_str = f"""###PS1JSON###
+{json.dumps(minimal_data)}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.pid == 123
+    assert metadata.username is None
+    assert metadata.hostname is None
+    assert metadata.working_dir is None
+    assert metadata.py_interpreter_path is None
+
+    # Test with missing exit_code but valid pid
+    no_exit_code = {'pid': 123, 'username': 'test'}
+    ps1_str = f"""###PS1JSON###
+{json.dumps(no_exit_code)}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == -1  # default value
+    assert metadata.pid == 123
+    assert metadata.username == 'test'
+
+
+def test_ps1_metadata_multiple_blocks():
+    """Test handling multiple PS1 metadata blocks"""
+    test_data = {
+        'exit_code': 0,
+        'username': 'testuser',
+        'hostname': 'localhost',
+        'working_dir': '/home/testuser',
+        'py_interpreter_path': '/usr/bin/python',
+    }
+
+    ps1_str = f"""###PS1JSON###
+{json.dumps(test_data, indent=2)}
+###PS1END###
+Some other content
+###PS1JSON###
+{json.dumps(test_data, indent=2)}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 2  # Should find both blocks
+    # Both blocks should parse successfully
+    metadata1 = CmdOutputMetadata.from_ps1_match(matches[0])
+    metadata2 = CmdOutputMetadata.from_ps1_match(matches[1])
+    assert metadata1.exit_code == test_data['exit_code']
+    assert metadata2.exit_code == test_data['exit_code']
+
+
+def test_ps1_metadata_regex_pattern():
+    """Test the regex pattern used to extract PS1 metadata"""
+    # Test basic pattern matching
+    test_str = f'{CMD_OUTPUT_PS1_BEGIN}test\n{CMD_OUTPUT_PS1_END}'
+    matches = CMD_OUTPUT_METADATA_PS1_REGEX.finditer(test_str)
+    match = next(matches)
+    assert match.group(1).strip() == 'test'
+
+    # Test with content before and after
+    test_str = f'prefix\n{CMD_OUTPUT_PS1_BEGIN}test\n{CMD_OUTPUT_PS1_END}suffix'
+    matches = CMD_OUTPUT_METADATA_PS1_REGEX.finditer(test_str)
+    match = next(matches)
+    assert match.group(1).strip() == 'test'
+
+    # Test with multiline content
+    test_str = f'{CMD_OUTPUT_PS1_BEGIN}line1\nline2\nline3\n{CMD_OUTPUT_PS1_END}'
+    matches = CMD_OUTPUT_METADATA_PS1_REGEX.finditer(test_str)
+    match = next(matches)
+    assert match.group(1).strip() == 'line1\nline2\nline3'
+
+
+def test_cmd_output_observation_properties():
+    """Test CmdOutputObservation class properties"""
+    # Test with successful command
+    metadata = CmdOutputMetadata(exit_code=0, pid=123)
+    obs = CmdOutputObservation(command='ls', content='file1\nfile2', metadata=metadata)
+    assert obs.command_id == 123
+    assert obs.exit_code == 0
+    assert not obs.error
+    assert 'exit code 0' in obs.message
+    assert 'ls' in obs.message
+    assert 'file1' in str(obs)
+    assert 'file2' in str(obs)
+    assert 'metadata' in str(obs)
+
+    # Test with failed command
+    metadata = CmdOutputMetadata(exit_code=1, pid=456)
+    obs = CmdOutputObservation(command='invalid', content='error', metadata=metadata)
+    assert obs.command_id == 456
+    assert obs.exit_code == 1
+    assert obs.error
+    assert 'exit code 1' in obs.message
+    assert 'invalid' in obs.message
+    assert 'error' in str(obs)
+
+
+def test_ps1_metadata_empty_fields():
+    """Test handling of empty fields in PS1 metadata"""
+    # Test with empty strings
+    empty_data = {
+        'exit_code': 0,
+        'pid': 123,
+        'username': '',
+        'hostname': '',
+        'working_dir': '',
+        'py_interpreter_path': '',
+    }
+    ps1_str = f"""###PS1JSON###
+{json.dumps(empty_data)}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.pid == 123
+    assert metadata.username == ''
+    assert metadata.hostname == ''
+    assert metadata.working_dir == ''
+    assert metadata.py_interpreter_path == ''
+
+    # Test with malformed but valid JSON
+    malformed_json = """###PS1JSON###
+    {
+        "exit_code":0,
+        "pid"  :  123,
+        "username":    "test"  ,
+        "hostname": "host",
+        "working_dir"    :"dir",
+        "py_interpreter_path":"path"
+    }
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(malformed_json)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.pid == 123
+    assert metadata.username == 'test'
+    assert metadata.hostname == 'host'
+    assert metadata.working_dir == 'dir'
+    assert metadata.py_interpreter_path == 'path'
diff --git a/tests/unit/test_bash_session.py b/tests/unit/test_bash_session.py
new file mode 100644
index 000000000000..13f32ad27d25
--- /dev/null
+++ b/tests/unit/test_bash_session.py
@@ -0,0 +1,384 @@
+import os
+import tempfile
+
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action import CmdRunAction
+from openhands.runtime.utils.bash import BashCommandStatus, BashSession
+
+
+def test_session_initialization():
+    # Test with custom working directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        session = BashSession(work_dir=temp_dir)
+        session.initialize()
+        obs = session.execute(CmdRunAction('pwd'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert temp_dir in obs.content
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+        session.close()
+
+    # Test with custom username
+    session = BashSession(work_dir=os.getcwd(), username='nobody')
+    session.initialize()
+    assert 'openhands-nobody' in session.session.name
+    session.close()
+
+
+def test_cwd_property(tmp_path):
+    session = BashSession(work_dir=tmp_path)
+    session.initialize()
+    # Change directory and verify pwd updates
+    random_dir = tmp_path / 'random'
+    random_dir.mkdir()
+    session.execute(CmdRunAction(f'cd {random_dir}'))
+    assert session.cwd == str(random_dir)
+    session.close()
+
+
+def test_basic_command():
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+
+    # Test simple command
+    obs = session.execute(CmdRunAction("echo 'hello world'"))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'hello world' in obs.content
+    assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+    assert obs.metadata.prefix == ''
+    assert obs.metadata.exit_code == 0
+    assert session.prev_status == BashCommandStatus.COMPLETED
+
+    # Test command with error
+    obs = session.execute(CmdRunAction('nonexistent_command'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.metadata.exit_code == 127
+    assert 'nonexistent_command: command not found' in obs.content
+    assert obs.metadata.suffix == '\n[The command completed with exit code 127.]'
+    assert obs.metadata.prefix == ''
+    assert session.prev_status == BashCommandStatus.COMPLETED
+
+    # Test multiple commands in sequence
+    obs = session.execute(CmdRunAction('echo "first" && echo "second" && echo "third"'))
+    assert 'first' in obs.content
+    assert 'second' in obs.content
+    assert 'third' in obs.content
+    assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+    assert obs.metadata.prefix == ''
+    assert obs.metadata.exit_code == 0
+    assert session.prev_status == BashCommandStatus.COMPLETED
+
+    session.close()
+
+
+def test_long_running_command_follow_by_execute():
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2)
+    session.initialize()
+
+    # Test command that produces output slowly
+    obs = session.execute(
+        CmdRunAction('for i in {1..3}; do echo $i; sleep 3; done', blocking=False)
+    )
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert '1' in obs.content  # First number should appear before timeout
+    assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+    assert obs.metadata.suffix == (
+        '\n[The command has no new output after 2 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    assert obs.metadata.prefix == ''
+
+    # Continue watching output
+    obs = session.execute(CmdRunAction(''))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert '2' in obs.content
+    assert obs.metadata.prefix == '[Command output continued from previous command]\n'
+    assert obs.metadata.suffix == (
+        '\n[The command has no new output after 2 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    # Test command that produces no output
+    obs = session.execute(CmdRunAction('sleep 15'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert '3' in obs.content
+    assert obs.metadata.prefix == '[Command output continued from previous command]\n'
+    assert obs.metadata.suffix == (
+        '\n[The command has no new output after 2 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    session.close()
+
+
+def test_interactive_command():
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=3)
+    session.initialize()
+
+    # Test interactive command with blocking=True
+    obs = session.execute(
+        CmdRunAction(
+            'read -p \'Enter name: \' name && echo "Hello $name"',
+        )
+    )
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'Enter name:' in obs.content
+    assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+    assert obs.metadata.suffix == (
+        '\n[The command has no new output after 3 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    assert obs.metadata.prefix == ''
+
+    # Send input
+    obs = session.execute(CmdRunAction('John'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'Hello John' in obs.content
+    assert obs.metadata.exit_code == 0
+    assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+    assert obs.metadata.prefix == ''
+    assert session.prev_status == BashCommandStatus.COMPLETED
+
+    # Test multiline command input
+    obs = session.execute(CmdRunAction('cat << EOF'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.metadata.exit_code == -1
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+    assert obs.metadata.suffix == (
+        '\n[The command has no new output after 3 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    assert obs.metadata.prefix == ''
+
+    obs = session.execute(CmdRunAction('line 1'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.metadata.exit_code == -1
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+    assert obs.metadata.suffix == (
+        '\n[The command has no new output after 3 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    assert obs.metadata.prefix == '[Command output continued from previous command]\n'
+
+    obs = session.execute(CmdRunAction('line 2'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.metadata.exit_code == -1
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+    assert obs.metadata.suffix == (
+        '\n[The command has no new output after 3 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    assert obs.metadata.prefix == '[Command output continued from previous command]\n'
+
+    obs = session.execute(CmdRunAction('EOF'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'line 1' in obs.content and 'line 2' in obs.content
+    assert obs.metadata.exit_code == 0
+    assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+    assert obs.metadata.prefix == ''
+
+    session.close()
+
+
+def test_ctrl_c():
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2)
+    session.initialize()
+
+    # Start infinite loop
+    obs = session.execute(
+        CmdRunAction("while true; do echo 'looping'; sleep 3; done"),
+    )
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'looping' in obs.content
+    assert obs.metadata.suffix == (
+        '\n[The command has no new output after 2 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    assert obs.metadata.prefix == ''
+    assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    # Send Ctrl+C
+    obs = session.execute(CmdRunAction('C-c'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.metadata.exit_code == 130  # Standard exit code for Ctrl+C
+    assert (
+        obs.metadata.suffix
+        == '\n[The command completed with exit code 130. CTRL+C was sent.]'
+    )
+    assert obs.metadata.prefix == ''
+    assert session.prev_status == BashCommandStatus.COMPLETED
+
+    session.close()
+
+
+def test_empty_command_errors():
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+
+    # Test empty command without previous command
+    obs = session.execute(CmdRunAction(''))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert (
+        obs.content
+        == 'ERROR: No previous command to continue from. Previous command has to be timeout to be continued.'
+    )
+    assert obs.metadata.exit_code == -1
+    assert obs.metadata.prefix == ''
+    assert obs.metadata.suffix == ''
+    assert session.prev_status is None
+
+    session.close()
+
+
+def test_command_output_continuation():
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2)
+    session.initialize()
+
+    # Start a command that produces output slowly
+    obs = session.execute(CmdRunAction('for i in {1..5}; do echo $i; sleep 3; done'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.content.strip() == '1'
+    assert obs.metadata.prefix == ''
+    assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    obs = session.execute(CmdRunAction(''))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert '[Command output continued from previous command]' in obs.metadata.prefix
+    assert obs.content.strip() == '2'
+    assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    obs = session.execute(CmdRunAction(''))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert '[Command output continued from previous command]' in obs.metadata.prefix
+    assert obs.content.strip() == '3'
+
+    assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    obs = session.execute(CmdRunAction(''))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert '[Command output continued from previous command]' in obs.metadata.prefix
+    assert obs.content.strip() == '4'
+    assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    obs = session.execute(CmdRunAction(''))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert '[Command output continued from previous command]' in obs.metadata.prefix
+    assert obs.content.strip() == '5'
+    assert '[The command has no new output after 2 seconds.' in obs.metadata.suffix
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    obs = session.execute(CmdRunAction(''))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+    assert session.prev_status == BashCommandStatus.COMPLETED
+
+    session.close()
+
+
+def test_long_output():
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+
+    # Generate a long output that may exceed buffer size
+    obs = session.execute(CmdRunAction('for i in {1..5000}; do echo "Line $i"; done'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'Line 1' in obs.content
+    assert 'Line 5000' in obs.content
+    assert obs.metadata.exit_code == 0
+    assert obs.metadata.prefix == ''
+    assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+
+    session.close()
+
+
+def test_long_output_exceed_history_limit():
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+
+    # Generate a long output that may exceed buffer size
+    obs = session.execute(CmdRunAction('for i in {1..50000}; do echo "Line $i"; done'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'Previous command outputs are truncated' in obs.metadata.prefix
+    assert 'Line 40000' in obs.content
+    assert 'Line 50000' in obs.content
+    assert obs.metadata.exit_code == 0
+    assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+
+    session.close()
+
+
+def test_multiline_command():
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+
+    # Test multiline command with PS2 prompt disabled
+    obs = session.execute(
+        CmdRunAction("""if true; then
+echo "inside if"
+fi""")
+    )
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'inside if' in obs.content
+    assert obs.metadata.exit_code == 0
+    assert obs.metadata.prefix == ''
+    assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+
+    session.close()
+
+
+def test_python_interactive_input():
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2)
+    session.initialize()
+
+    # Test Python program that asks for input - properly escaped for bash
+    python_script = """name = input('Enter your name: '); age = input('Enter your age: '); print(f'Hello {name}, you are {age} years old')"""
+
+    # Start Python with the interactive script
+    obs = session.execute(CmdRunAction(f'python3 -c "{python_script}"'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'Enter your name:' in obs.content
+    assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    # Send first input (name)
+    obs = session.execute(CmdRunAction('Alice'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'Enter your age:' in obs.content
+    assert obs.metadata.exit_code == -1
+    assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+    # Send second input (age)
+    obs = session.execute(CmdRunAction('25'))
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert 'Hello Alice, you are 25 years old' in obs.content
+    assert obs.metadata.exit_code == 0
+    assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+    assert session.prev_status == BashCommandStatus.COMPLETED
+
+    session.close()
diff --git a/tests/unit/test_codeact_agent.py b/tests/unit/test_codeact_agent.py
index 28696df7d2a7..fa265e750b23 100644
--- a/tests/unit/test_codeact_agent.py
+++ b/tests/unit/test_codeact_agent.py
@@ -26,6 +26,7 @@
 from openhands.events.event import EventSource, FileEditSource, FileReadSource
 from openhands.events.observation.browse import BrowserOutputObservation
 from openhands.events.observation.commands import (
+    CmdOutputMetadata,
     CmdOutputObservation,
     IPythonRunCellObservation,
 )
@@ -50,7 +51,11 @@ def agent() -> CodeActAgent:
 def test_cmd_output_observation_message(agent: CodeActAgent):
     agent.config.function_calling = False
     obs = CmdOutputObservation(
-        command='echo hello', content='Command output', command_id=1, exit_code=0
+        command='echo hello',
+        content='Command output',
+        metadata=CmdOutputMetadata(
+            exit_code=0,
+        ),
     )
 
     results = agent.get_observation_message(obs, tool_call_id_to_message={})
@@ -62,7 +67,7 @@ def test_cmd_output_observation_message(agent: CodeActAgent):
     assert len(result.content) == 1
     assert isinstance(result.content[0], TextContent)
     assert 'Command output' in result.content[0].text
-    assert 'Command finished with exit code 0' in result.content[0].text
+    assert '[Command finished with exit code 0]' in result.content[0].text
 
 
 def test_ipython_run_cell_observation_message(agent: CodeActAgent):
diff --git a/tests/unit/test_command_success.py b/tests/unit/test_command_success.py
index b52ceb4815c7..298a3bcb4f6c 100644
--- a/tests/unit/test_command_success.py
+++ b/tests/unit/test_command_success.py
@@ -1,4 +1,5 @@
 from openhands.events.observation.commands import (
+    CmdOutputMetadata,
     CmdOutputObservation,
     IPythonRunCellObservation,
 )
@@ -7,14 +8,18 @@
 def test_cmd_output_success():
     # Test successful command
     obs = CmdOutputObservation(
-        command_id=1, command='ls', content='file1.txt\nfile2.txt', exit_code=0
+        command='ls',
+        content='file1.txt\nfile2.txt',
+        metadata=CmdOutputMetadata(exit_code=0),
     )
     assert obs.success is True
     assert obs.error is False
 
     # Test failed command
     obs = CmdOutputObservation(
-        command_id=2, command='ls', content='No such file or directory', exit_code=1
+        command='ls',
+        content='No such file or directory',
+        metadata=CmdOutputMetadata(exit_code=1),
     )
     assert obs.success is False
     assert obs.error is True
diff --git a/tests/unit/test_event_serialization.py b/tests/unit/test_event_serialization.py
index d1989a30bb09..52df3ef1d0e1 100644
--- a/tests/unit/test_event_serialization.py
+++ b/tests/unit/test_event_serialization.py
@@ -1,18 +1,22 @@
-from openhands.events.observation import CmdOutputObservation
+from openhands.events.observation import CmdOutputMetadata, CmdOutputObservation
 from openhands.events.serialization import event_to_dict
 
 
 def test_command_output_success_serialization():
     # Test successful command
     obs = CmdOutputObservation(
-        command_id=1, command='ls', content='file1.txt\nfile2.txt', exit_code=0
+        command='ls',
+        content='file1.txt\nfile2.txt',
+        metadata=CmdOutputMetadata(exit_code=0),
     )
     serialized = event_to_dict(obs)
     assert serialized['success'] is True
 
     # Test failed command
     obs = CmdOutputObservation(
-        command_id=2, command='ls', content='No such file or directory', exit_code=1
+        command='ls',
+        content='No such file or directory',
+        metadata=CmdOutputMetadata(exit_code=1),
     )
     serialized = event_to_dict(obs)
     assert serialized['success'] is False
diff --git a/tests/unit/test_is_stuck.py b/tests/unit/test_is_stuck.py
index 4b9a4e820ee5..fae6b1038131 100644
--- a/tests/unit/test_is_stuck.py
+++ b/tests/unit/test_is_stuck.py
@@ -107,7 +107,7 @@ def test_history_too_short(self, stuck_detector: StuckDetector):
         cmd_action = CmdRunAction(command='ls')
         state.history.append(cmd_action)
         cmd_observation = CmdOutputObservation(
-            command_id=1, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
         )
         # cmd_observation._cause = cmd_action._id
         state.history.append(cmd_observation)
@@ -188,7 +188,7 @@ def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetect
         cmd_action_1 = CmdRunAction(command='ls')
         cmd_action_1._id = 1
         state.history.append(cmd_action_1)
-        cmd_observation_1 = CmdOutputObservation(content='', command='ls', command_id=1)
+        cmd_observation_1 = CmdOutputObservation(content='', command='ls')
         cmd_observation_1._cause = cmd_action_1._id
         state.history.append(cmd_observation_1)
         # 4 events
@@ -196,7 +196,7 @@ def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetect
         cmd_action_2 = CmdRunAction(command='ls')
         cmd_action_2._id = 2
         state.history.append(cmd_action_2)
-        cmd_observation_2 = CmdOutputObservation(content='', command='ls', command_id=2)
+        cmd_observation_2 = CmdOutputObservation(content='', command='ls')
         cmd_observation_2._cause = cmd_action_2._id
         state.history.append(cmd_observation_2)
         # 6 events
@@ -212,7 +212,7 @@ def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetect
         cmd_action_3 = CmdRunAction(command='ls')
         cmd_action_3._id = 3
         state.history.append(cmd_action_3)
-        cmd_observation_3 = CmdOutputObservation(content='', command='ls', command_id=3)
+        cmd_observation_3 = CmdOutputObservation(content='', command='ls')
         cmd_observation_3._cause = cmd_action_3._id
         state.history.append(cmd_observation_3)
         # 10 events
@@ -223,7 +223,7 @@ def test_is_stuck_repeating_action_observation(self, stuck_detector: StuckDetect
         cmd_action_4 = CmdRunAction(command='ls')
         cmd_action_4._id = 4
         state.history.append(cmd_action_4)
-        cmd_observation_4 = CmdOutputObservation(content='', command='ls', command_id=4)
+        cmd_observation_4 = CmdOutputObservation(content='', command='ls')
         cmd_observation_4._cause = cmd_action_4._id
         state.history.append(cmd_observation_4)
         # 12 events
@@ -436,7 +436,7 @@ def test_is_stuck_repeating_action_observation_pattern(
         cmd_action_1 = CmdRunAction(command='ls')
         state.history.append(cmd_action_1)
         cmd_observation_1 = CmdOutputObservation(
-            command_id=1, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
         )
         # cmd_observation_1._cause = cmd_action_1._id
         state.history.append(cmd_observation_1)
@@ -452,7 +452,7 @@ def test_is_stuck_repeating_action_observation_pattern(
         cmd_action_2 = CmdRunAction(command='ls')
         state.history.append(cmd_action_2)
         cmd_observation_2 = CmdOutputObservation(
-            command_id=2, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
         )
         # cmd_observation_2._cause = cmd_action_2._id
         state.history.append(cmd_observation_2)
@@ -475,7 +475,7 @@ def test_is_stuck_repeating_action_observation_pattern(
         cmd_action_3 = CmdRunAction(command='ls')
         state.history.append(cmd_action_3)
         cmd_observation_3 = CmdOutputObservation(
-            command_id=3, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
         )
         # cmd_observation_3._cause = cmd_action_3._id
         state.history.append(cmd_observation_3)
@@ -506,7 +506,7 @@ def test_is_stuck_not_stuck(self, stuck_detector: StuckDetector):
         cmd_action_1 = CmdRunAction(command='ls')
         state.history.append(cmd_action_1)
         cmd_observation_1 = CmdOutputObservation(
-            command_id=cmd_action_1.id, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
         )
         # cmd_observation_1._cause = cmd_action_1._id
         state.history.append(cmd_observation_1)
@@ -521,9 +521,7 @@ def test_is_stuck_not_stuck(self, stuck_detector: StuckDetector):
 
         cmd_action_2 = CmdRunAction(command='pwd')
         state.history.append(cmd_action_2)
-        cmd_observation_2 = CmdOutputObservation(
-            command_id=2, command='pwd', content='/home/user'
-        )
+        cmd_observation_2 = CmdOutputObservation(command='pwd', content='/home/user')
         # cmd_observation_2._cause = cmd_action_2._id
         state.history.append(cmd_observation_2)
 
@@ -541,9 +539,7 @@ def test_is_stuck_not_stuck(self, stuck_detector: StuckDetector):
 
         cmd_action_3 = CmdRunAction(command='pwd')
         state.history.append(cmd_action_3)
-        cmd_observation_3 = CmdOutputObservation(
-            command_id=cmd_action_3.id, command='pwd', content='/home/user'
-        )
+        cmd_observation_3 = CmdOutputObservation(command='pwd', content='/home/user')
         # cmd_observation_3._cause = cmd_action_3._id
         state.history.append(cmd_observation_3)
 
@@ -590,7 +586,6 @@ def test_is_stuck_monologue(self, stuck_detector):
         # Add an observation event between the repeated message actions
         cmd_output_observation = CmdOutputObservation(
             content='OK, I was stuck, but no more.',
-            command_id=42,
             command='storybook',
             exit_code=0,
         )
diff --git a/tests/unit/test_observation_serialization.py b/tests/unit/test_observation_serialization.py
index 67a95449b719..626ea2c14774 100644
--- a/tests/unit/test_observation_serialization.py
+++ b/tests/unit/test_observation_serialization.py
@@ -1,4 +1,5 @@
 from openhands.events.observation import (
+    CmdOutputMetadata,
     CmdOutputObservation,
     Observation,
 )
@@ -40,13 +41,65 @@ def serialization_deserialization(
 
 
 # Additional tests for various observation subclasses can be included here
+def test_observation_event_props_serialization_deserialization():
+    original_observation_dict = {
+        'id': 42,
+        'source': 'agent',
+        'timestamp': '2021-08-01T12:00:00',
+        'observation': 'run',
+        'message': 'Command `ls -l` executed with exit code 0.',
+        'extras': {
+            'command': 'ls -l',
+            'hidden': False,
+            'metadata': {
+                'exit_code': 0,
+                'hostname': None,
+                'pid': -1,
+                'prefix': '',
+                'py_interpreter_path': None,
+                'suffix': '',
+                'username': None,
+                'working_dir': None,
+            },
+        },
+        'content': 'foo.txt',
+        'success': True,
+    }
+    serialization_deserialization(original_observation_dict, CmdOutputObservation)
+
+
+def test_command_output_observation_serialization_deserialization():
+    original_observation_dict = {
+        'observation': 'run',
+        'extras': {
+            'command': 'ls -l',
+            'hidden': False,
+            'metadata': {
+                'exit_code': 0,
+                'hostname': None,
+                'pid': -1,
+                'prefix': '',
+                'py_interpreter_path': None,
+                'suffix': '',
+                'username': None,
+                'working_dir': None,
+            },
+        },
+        'message': 'Command `ls -l` executed with exit code 0.',
+        'content': 'foo.txt',
+        'success': True,
+    }
+    serialization_deserialization(original_observation_dict, CmdOutputObservation)
+
+
 def test_success_field_serialization():
     # Test success=True
     obs = CmdOutputObservation(
         content='Command succeeded',
-        exit_code=0,
         command='ls -l',
-        command_id=3,
+        metadata=CmdOutputMetadata(
+            exit_code=0,
+        ),
     )
     serialized = event_to_dict(obs)
     assert serialized['success'] is True
@@ -54,9 +107,42 @@ def test_success_field_serialization():
     # Test success=False
     obs = CmdOutputObservation(
         content='No such file or directory',
-        exit_code=1,
         command='ls -l',
-        command_id=3,
+        metadata=CmdOutputMetadata(
+            exit_code=1,
+        ),
     )
     serialized = event_to_dict(obs)
     assert serialized['success'] is False
+
+
+def test_legacy_serialization():
+    original_observation_dict = {
+        'id': 42,
+        'source': 'agent',
+        'timestamp': '2021-08-01T12:00:00',
+        'observation': 'run',
+        'message': 'Command `ls -l` executed with exit code 0.',
+        'extras': {
+            'command': 'ls -l',
+            'hidden': False,
+            'exit_code': 0,
+            'command_id': 3,
+        },
+        'content': 'foo.txt',
+        'success': True,
+    }
+    event = event_from_dict(original_observation_dict)
+    assert isinstance(event, Observation)
+    assert isinstance(event, CmdOutputObservation)
+    assert event.metadata.exit_code == 0
+    assert event.success is True
+    assert event.command == 'ls -l'
+    assert event.hidden is False
+
+    event_dict = event_to_dict(event)
+    assert event_dict['success'] is True
+    assert event_dict['extras']['metadata']['exit_code'] == 0
+    assert event_dict['extras']['metadata']['pid'] == 3
+    assert event_dict['extras']['command'] == 'ls -l'
+    assert event_dict['extras']['hidden'] is False
diff --git a/tests/unit/test_security.py b/tests/unit/test_security.py
index 71afd04dbe61..0e2b3c8a0d07 100644
--- a/tests/unit/test_security.py
+++ b/tests/unit/test_security.py
@@ -368,7 +368,6 @@ async def test_unsafe_bash_command(temp_dir: str):
                             'blocking': False,
                             'command': 'ls',
                             'hidden': False,
-                            'keep_prompt': True,
                             'confirmation_state': ActionConfirmationStatus.CONFIRMED,
                         },
                     ),
@@ -495,9 +494,7 @@ def test_parse_action(action, expected_trace):
             ],
         ),
         (
-            CmdOutputObservation(
-                content='cmd output content', command_id=1, command='ls'
-            ),
+            CmdOutputObservation(content='cmd output content', command='ls'),
             [
                 ToolOutput(
                     metadata={},