From eb8d1600c3198bfe2ef51a24f228ccd10a70d6be Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 1 Feb 2025 18:14:08 +0100 Subject: [PATCH] Chore: clean up LLM (prompt caching, supports fn calling), leftover renames (#6095) --- .github/workflows/ghcr-build.yml | 8 +- .../current/usage/architecture/runtime.md | 4 +- .../current/usage/configuration-options.md | 2 +- .../usage/how-to/evaluation-harness.md | 2 +- .../current/usage/architecture/runtime.md | 4 +- .../current/usage/configuration-options.md | 2 +- .../usage/how-to/evaluation-harness.md | 2 +- docs/modules/usage/architecture/runtime.md | 4 +- docs/modules/usage/configuration-options.md | 2 +- .../usage/how-to/evaluation-harness.md | 2 +- openhands/core/exceptions.py | 6 - openhands/llm/llm.py | 194 +++++++++--------- openhands/llm/retry_mixin.py | 6 +- tests/runtime/test_bash.py | 2 +- tests/runtime/test_browsing.py | 2 +- tests/runtime/test_edit.py | 2 +- tests/runtime/test_env_vars.py | 2 +- tests/runtime/test_images.py | 2 +- tests/runtime/test_stress_remote_runtime.py | 2 +- tests/unit/test_llm.py | 21 -- tests/unit/test_prompt_caching.py | 35 ---- 21 files changed, 119 insertions(+), 187 deletions(-) diff --git a/.github/workflows/ghcr-build.yml b/.github/workflows/ghcr-build.yml index 6ec93e78d1db..acdc89f0f495 100644 --- a/.github/workflows/ghcr-build.yml +++ b/.github/workflows/ghcr-build.yml @@ -219,7 +219,7 @@ jobs: exit 1 fi - # Run unit tests with the EventStream runtime Docker images as root + # Run unit tests with the Docker runtime Docker images as root test_runtime_root: name: RT Unit Tests (Root) needs: [ghcr_build_runtime] @@ -286,7 +286,7 @@ jobs: image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }} image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]') - TEST_RUNTIME=eventstream \ + TEST_RUNTIME=docker \ SANDBOX_USER_ID=$(id -u) \ SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \ TEST_IN_CI=true \ @@ -297,7 +297,7 @@ jobs: env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - # Run unit tests with the EventStream runtime Docker images as openhands user + # Run unit tests with the Docker runtime Docker images as openhands user test_runtime_oh: name: RT Unit Tests (openhands) runs-on: ubuntu-latest @@ -363,7 +363,7 @@ jobs: image_name=ghcr.io/${{ github.repository_owner }}/runtime:${{ env.RELEVANT_SHA }}-${{ matrix.base_image }} image_name=$(echo $image_name | tr '[:upper:]' '[:lower:]') - TEST_RUNTIME=eventstream \ + TEST_RUNTIME=docker \ SANDBOX_USER_ID=$(id -u) \ SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \ TEST_IN_CI=true \ diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/architecture/runtime.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/architecture/runtime.md index 42e1dae5d824..71e121d45d62 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/architecture/runtime.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/architecture/runtime.md @@ -1,8 +1,8 @@ -# 📦 Runtime EventStream +# 📦 Runtime Docker -Le Runtime EventStream d'OpenHands est le composant principal qui permet l'exécution sécurisée et flexible des actions des agents d'IA. +Le Runtime Docker d'OpenHands est le composant principal qui permet l'exécution sécurisée et flexible des actions des agents d'IA. Il crée un environnement en bac à sable (sandbox) en utilisant Docker, où du code arbitraire peut être exécuté en toute sécurité sans risquer le système hôte. ## Pourquoi avons-nous besoin d'un runtime en bac à sable ? 
diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/configuration-options.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/configuration-options.md index 0f22d218b817..7115c85b1e1f 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/configuration-options.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/configuration-options.md @@ -163,7 +163,7 @@ Les options de configuration de base sont définies dans la section `[core]` du - `runtime` - Type : `str` - - Valeur par défaut : `"eventstream"` + - Valeur par défaut : `"docker"` - Description : Environnement d'exécution - `default_agent` diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index 1e15665e79fa..b0aee6764acf 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -114,7 +114,7 @@ Pour créer un workflow d'évaluation pour votre benchmark, suivez ces étapes : def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig: config = AppConfig( default_agent=metadata.agent_class, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='your_container_image', diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/architecture/runtime.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/architecture/runtime.md index fe00e9399540..5e01f62da5d8 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/architecture/runtime.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/architecture/runtime.md @@ -1,8 +1,8 @@ 以下是翻译后的内容: -# 📦 EventStream 运行时 +# 📦 Docker 运行时 -OpenHands EventStream 运行时是实现 AI 代理操作安全灵活执行的核心组件。 +OpenHands Docker 运行时是实现 AI 代理操作安全灵活执行的核心组件。 它使用 Docker 创建一个沙盒环境,可以安全地运行任意代码而不会危及主机系统。 ## 为什么我们需要沙盒运行时? 
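The evaluation-harness hunks in this patch (the French one above, the Chinese and English ones below) all make the same one-line change: `runtime='eventstream'` becomes `runtime='docker'` in the benchmark's `get_config` helper. For quick reference, a minimal sketch of the updated helper is shown here; the import paths and the placeholder container image are assumptions for illustration, while the field names and values themselves are taken from the diff.

```python
import pandas as pd

# Assumed import locations; only AppConfig, SandboxConfig, EvalMetadata and the
# runtime/base_container_image fields are taken from the diff itself.
from openhands.core.config import AppConfig, SandboxConfig
from evaluation.utils.shared import EvalMetadata  # assumed location


def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig:
    return AppConfig(
        default_agent=metadata.agent_class,
        runtime='docker',  # formerly 'eventstream'; selects the Docker-based sandbox runtime
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            base_container_image='your_container_image',  # placeholder, as in the docs
        ),
    )
```

For benchmark authors nothing changes beyond the string value: the implementation is the same Docker-based sandbox runtime, and this patch only finishes the rename from `eventstream` to `docker` in docs, CI, and examples.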
diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/configuration-options.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/configuration-options.md index b79a65073acc..4676cfd23822 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/configuration-options.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/configuration-options.md @@ -162,7 +162,7 @@ - `runtime` - 类型: `str` - - 默认值: `"eventstream"` + - 默认值: `"docker"` - 描述: 运行时环境 - `default_agent` diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index 1e3fff538ffb..9872034bd1a3 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -112,7 +112,7 @@ OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它的 def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig: config = AppConfig( default_agent=metadata.agent_class, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='your_container_image', diff --git a/docs/modules/usage/architecture/runtime.md b/docs/modules/usage/architecture/runtime.md index 3aa05056fff2..b08a1ed99bbf 100644 --- a/docs/modules/usage/architecture/runtime.md +++ b/docs/modules/usage/architecture/runtime.md @@ -1,6 +1,6 @@ -# 📦 EventStream Runtime +# 📦 Docker Runtime -The OpenHands EventStream Runtime is the core component that enables secure and flexible execution of AI agent's action. +The OpenHands Docker Runtime is the core component that enables secure and flexible execution of AI agent's action. It creates a sandboxed environment using Docker, where arbitrary code can be run safely without risking the host system. ## Why do we need a sandboxed runtime? 
diff --git a/docs/modules/usage/configuration-options.md b/docs/modules/usage/configuration-options.md index ff0aa5674cc8..90050765d6ba 100644 --- a/docs/modules/usage/configuration-options.md +++ b/docs/modules/usage/configuration-options.md @@ -126,7 +126,7 @@ The core configuration options are defined in the `[core]` section of the `confi - `runtime` - Type: `str` - - Default: `"eventstream"` + - Default: `"docker"` - Description: Runtime environment - `default_agent` diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md index 339783ea8d7e..79ecf7fe371b 100644 --- a/docs/modules/usage/how-to/evaluation-harness.md +++ b/docs/modules/usage/how-to/evaluation-harness.md @@ -112,7 +112,7 @@ To create an evaluation workflow for your benchmark, follow these steps: def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig: config = AppConfig( default_agent=metadata.agent_class, - runtime='eventstream', + runtime='docker', max_iterations=metadata.max_iterations, sandbox=SandboxConfig( base_container_image='your_container_image', diff --git a/openhands/core/exceptions.py b/openhands/core/exceptions.py index 532f8becbf66..db8b9afa8838 100644 --- a/openhands/core/exceptions.py +++ b/openhands/core/exceptions.py @@ -98,12 +98,6 @@ def __init__(self, message='Operation was cancelled'): super().__init__(message) -class CloudFlareBlockageError(Exception): - """Exception raised when a request is blocked by CloudFlare.""" - - pass - - # ============================================ # LLM function calling Exceptions # ============================================ diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index af25baded4c4..5b656387ecc8 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -27,7 +27,6 @@ from litellm.types.utils import CostPerToken, ModelResponse, Usage from litellm.utils import create_pretrained_tokenizer -from openhands.core.exceptions import CloudFlareBlockageError from openhands.core.logger import openhands_logger as logger from openhands.core.message import Message from openhands.llm.debug_mixin import DebugMixin @@ -218,99 +217,86 @@ def wrapper(*args, **kwargs): # log the entire LLM prompt self.log_prompt(messages) - if self.is_caching_prompt_active(): - # Anthropic-specific prompt caching - if 'claude-3' in self.config.model: - kwargs['extra_headers'] = { - 'anthropic-beta': 'prompt-caching-2024-07-31', - } - # set litellm modify_params to the configured value # True by default to allow litellm to do transformations like adding a default message, when a message is empty # NOTE: this setting is global; unlike drop_params, it cannot be overridden in the litellm completion partial litellm.modify_params = self.config.modify_params - try: - # Record start time for latency measurement - start_time = time.time() - # we don't support streaming here, thus we get a ModelResponse - resp: ModelResponse = self._completion_unwrapped(*args, **kwargs) + # Record start time for latency measurement + start_time = time.time() - # Calculate and record latency - latency = time.time() - start_time - response_id = resp.get('id', 'unknown') - self.metrics.add_response_latency(latency, response_id) + # we don't support streaming here, thus we get a ModelResponse + resp: ModelResponse = self._completion_unwrapped(*args, **kwargs) - non_fncall_response = copy.deepcopy(resp) - if mock_function_calling: - assert len(resp.choices) == 1 - assert mock_fncall_tools is not None - non_fncall_response_message = 
resp.choices[0].message - fn_call_messages_with_response = ( - convert_non_fncall_messages_to_fncall_messages( - messages + [non_fncall_response_message], mock_fncall_tools - ) + # Calculate and record latency + latency = time.time() - start_time + response_id = resp.get('id', 'unknown') + self.metrics.add_response_latency(latency, response_id) + + non_fncall_response = copy.deepcopy(resp) + if mock_function_calling: + assert len(resp.choices) == 1 + assert mock_fncall_tools is not None + non_fncall_response_message = resp.choices[0].message + fn_call_messages_with_response = ( + convert_non_fncall_messages_to_fncall_messages( + messages + [non_fncall_response_message], mock_fncall_tools ) - fn_call_response_message = fn_call_messages_with_response[-1] - if not isinstance(fn_call_response_message, LiteLLMMessage): - fn_call_response_message = LiteLLMMessage( - **fn_call_response_message - ) - resp.choices[0].message = fn_call_response_message - - message_back: str = resp['choices'][0]['message']['content'] or '' - tool_calls: list[ChatCompletionMessageToolCall] = resp['choices'][0][ - 'message' - ].get('tool_calls', []) - if tool_calls: - for tool_call in tool_calls: - fn_name = tool_call.function.name - fn_args = tool_call.function.arguments - message_back += f'\nFunction call: {fn_name}({fn_args})' - - # log the LLM response - self.log_response(message_back) - - # post-process the response first to calculate cost - cost = self._post_completion(resp) - - # log for evals or other scripts that need the raw completion - if self.config.log_completions: - assert self.config.log_completions_folder is not None - log_file = os.path.join( - self.config.log_completions_folder, - # use the metric model name (for draft editor) - f'{self.metrics.model_name.replace("/", "__")}-{time.time()}.json', + ) + fn_call_response_message = fn_call_messages_with_response[-1] + if not isinstance(fn_call_response_message, LiteLLMMessage): + fn_call_response_message = LiteLLMMessage( + **fn_call_response_message ) + resp.choices[0].message = fn_call_response_message + + message_back: str = resp['choices'][0]['message']['content'] or '' + tool_calls: list[ChatCompletionMessageToolCall] = resp['choices'][0][ + 'message' + ].get('tool_calls', []) + if tool_calls: + for tool_call in tool_calls: + fn_name = tool_call.function.name + fn_args = tool_call.function.arguments + message_back += f'\nFunction call: {fn_name}({fn_args})' + + # log the LLM response + self.log_response(message_back) + + # post-process the response first to calculate cost + cost = self._post_completion(resp) + + # log for evals or other scripts that need the raw completion + if self.config.log_completions: + assert self.config.log_completions_folder is not None + log_file = os.path.join( + self.config.log_completions_folder, + # use the metric model name (for draft editor) + f'{self.metrics.model_name.replace("/", "__")}-{time.time()}.json', + ) + + # set up the dict to be logged + _d = { + 'messages': messages, + 'response': resp, + 'args': args, + 'kwargs': {k: v for k, v in kwargs.items() if k != 'messages'}, + 'timestamp': time.time(), + 'cost': cost, + } + + # if non-native function calling, save messages/response separately + if mock_function_calling: + # Overwrite response as non-fncall to be consistent with messages + _d['response'] = non_fncall_response + + # Save fncall_messages/response separately + _d['fncall_messages'] = original_fncall_messages + _d['fncall_response'] = resp + with open(log_file, 'w') as f: + f.write(json.dumps(_d)) - 
# set up the dict to be logged - _d = { - 'messages': messages, - 'response': resp, - 'args': args, - 'kwargs': {k: v for k, v in kwargs.items() if k != 'messages'}, - 'timestamp': time.time(), - 'cost': cost, - } - - # if non-native function calling, save messages/response separately - if mock_function_calling: - # Overwrite response as non-fncall to be consistent with messages - _d['response'] = non_fncall_response - - # Save fncall_messages/response separately - _d['fncall_messages'] = original_fncall_messages - _d['fncall_response'] = resp - with open(log_file, 'w') as f: - f.write(json.dumps(_d)) - - return resp - except APIError as e: - if 'Attention Required! | Cloudflare' in str(e): - raise CloudFlareBlockageError( - 'Request blocked by CloudFlare' - ) from e - raise + return resp self._completion = wrapper @@ -414,6 +400,25 @@ def init_model_info(self): ): self.config.max_output_tokens = self.model_info['max_tokens'] + # Initialize function calling capability + # Check if model name is in our supported list + model_name_supported = ( + self.config.model in FUNCTION_CALLING_SUPPORTED_MODELS + or self.config.model.split('/')[-1] in FUNCTION_CALLING_SUPPORTED_MODELS + or any(m in self.config.model for m in FUNCTION_CALLING_SUPPORTED_MODELS) + ) + + # Handle native_tool_calling user-defined configuration + if self.config.native_tool_calling is None: + self._function_calling_active = model_name_supported + elif self.config.native_tool_calling is False: + self._function_calling_active = False + else: + # try to enable native tool calling if supported by the model + self._function_calling_active = litellm.supports_function_calling( + model=self.config.model + ) + def vision_is_active(self) -> bool: with warnings.catch_warnings(): warnings.simplefilter('ignore') @@ -455,24 +460,11 @@ def is_caching_prompt_active(self) -> bool: ) def is_function_calling_active(self) -> bool: - # Check if model name is in our supported list - model_name_supported = ( - self.config.model in FUNCTION_CALLING_SUPPORTED_MODELS - or self.config.model.split('/')[-1] in FUNCTION_CALLING_SUPPORTED_MODELS - or any(m in self.config.model for m in FUNCTION_CALLING_SUPPORTED_MODELS) - ) + """Returns whether function calling is supported and enabled for this LLM instance. - # Handle native_tool_calling user-defined configuration - if self.config.native_tool_calling is None: - return model_name_supported - elif self.config.native_tool_calling is False: - return False - else: - # try to enable native tool calling if supported by the model - supports_fn_call = litellm.supports_function_calling( - model=self.config.model - ) - return supports_fn_call + The result is cached during initialization for performance. + """ + return self._function_calling_active def _post_completion(self, response: ModelResponse) -> float: """Post-process the completion response. diff --git a/openhands/llm/retry_mixin.py b/openhands/llm/retry_mixin.py index 714153e4c1a1..08a8add63939 100644 --- a/openhands/llm/retry_mixin.py +++ b/openhands/llm/retry_mixin.py @@ -24,7 +24,7 @@ def retry_decorator(self, **kwargs): A retry decorator with the parameters customizable in configuration. 
""" num_retries = kwargs.get('num_retries') - retry_exceptions = kwargs.get('retry_exceptions') + retry_exceptions: tuple = kwargs.get('retry_exceptions', ()) retry_min_wait = kwargs.get('retry_min_wait') retry_max_wait = kwargs.get('retry_max_wait') retry_multiplier = kwargs.get('retry_multiplier') @@ -39,7 +39,9 @@ def before_sleep(retry_state): before_sleep=before_sleep, stop=stop_after_attempt(num_retries) | stop_if_should_exit(), reraise=True, - retry=(retry_if_exception_type(retry_exceptions)), + retry=( + retry_if_exception_type(retry_exceptions) + ), # retry only for these types wait=wait_exponential( multiplier=retry_multiplier, min=retry_min_wait, diff --git a/tests/runtime/test_bash.py b/tests/runtime/test_bash.py index 4af28d9065b0..d107cc9569c8 100644 --- a/tests/runtime/test_bash.py +++ b/tests/runtime/test_bash.py @@ -1,4 +1,4 @@ -"""Bash-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox.""" +"""Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" import os import time diff --git a/tests/runtime/test_browsing.py b/tests/runtime/test_browsing.py index 6097c891907b..0dee3750953f 100644 --- a/tests/runtime/test_browsing.py +++ b/tests/runtime/test_browsing.py @@ -1,4 +1,4 @@ -"""Browsing-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox.""" +"""Browsing-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" from conftest import _close_test_runtime, _load_runtime diff --git a/tests/runtime/test_edit.py b/tests/runtime/test_edit.py index 99a7ce113b18..c507166a840d 100644 --- a/tests/runtime/test_edit.py +++ b/tests/runtime/test_edit.py @@ -1,4 +1,4 @@ -"""Edit-related tests for the EventStreamRuntime.""" +"""Edit-related tests for the DockerRuntime.""" import os diff --git a/tests/runtime/test_env_vars.py b/tests/runtime/test_env_vars.py index de65bf8101ed..898003ff66c7 100644 --- a/tests/runtime/test_env_vars.py +++ b/tests/runtime/test_env_vars.py @@ -1,4 +1,4 @@ -"""Env vars related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox.""" +"""Env vars related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" import os from unittest.mock import patch diff --git a/tests/runtime/test_images.py b/tests/runtime/test_images.py index 1dd7e295c415..b7ab82b54b3c 100644 --- a/tests/runtime/test_images.py +++ b/tests/runtime/test_images.py @@ -1,4 +1,4 @@ -"""Image-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox.""" +"""Image-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" import pytest from conftest import _close_test_runtime, _load_runtime diff --git a/tests/runtime/test_stress_remote_runtime.py b/tests/runtime/test_stress_remote_runtime.py index a2f6c7d2082b..5c201af8b726 100644 --- a/tests/runtime/test_stress_remote_runtime.py +++ b/tests/runtime/test_stress_remote_runtime.py @@ -1,4 +1,4 @@ -"""Bash-related tests for the EventStreamRuntime, which connects to the ActionExecutor running in the sandbox.""" +"""Bash-related tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" import asyncio import os diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py index 227b0006b020..98783c050d0a 100644 --- a/tests/unit/test_llm.py +++ b/tests/unit/test_llm.py @@ 
-389,27 +389,6 @@ def test_completion_with_two_positional_args(mock_litellm_completion, default_co ) # No positional args should be passed to litellm_completion here -@patch('openhands.llm.llm.litellm_completion') -def test_llm_cloudflare_blockage(mock_litellm_completion, default_config): - from litellm.exceptions import APIError - - from openhands.core.exceptions import CloudFlareBlockageError - - llm = LLM(default_config) - mock_litellm_completion.side_effect = APIError( - message='Attention Required! | Cloudflare', - llm_provider='test_provider', - model='test_model', - status_code=403, - ) - - with pytest.raises(CloudFlareBlockageError, match='Request blocked by CloudFlare'): - llm.completion(messages=[{'role': 'user', 'content': 'Hello'}]) - - # Ensure the completion was called - mock_litellm_completion.assert_called_once() - - @patch('openhands.llm.llm.litellm.token_counter') def test_get_token_count_with_dict_messages(mock_token_counter, default_config): mock_token_counter.return_value = 42 diff --git a/tests/unit/test_prompt_caching.py b/tests/unit/test_prompt_caching.py index ea4eeb59935d..3258fa486a9f 100644 --- a/tests/unit/test_prompt_caching.py +++ b/tests/unit/test_prompt_caching.py @@ -128,38 +128,3 @@ def test_get_messages_prompt_caching(codeact_agent: CodeActAgent): assert cached_user_messages[0].content[0].text.startswith('You are OpenHands agent') assert cached_user_messages[2].content[0].text.startswith('User message 1') assert cached_user_messages[3].content[0].text.startswith('User message 1') - - -def test_prompt_caching_headers(codeact_agent: CodeActAgent): - history = list() - # Setup - msg1 = MessageAction('Hello, agent!') - msg1._source = 'user' - history.append(msg1) - msg2 = MessageAction('Hello, user!') - msg2._source = 'agent' - history.append(msg2) - - mock_state = Mock() - mock_state.history = history - mock_state.max_iterations = 5 - mock_state.iteration = 0 - mock_state.extra_data = {} - - codeact_agent.reset() - - # Create a mock for litellm_completion - def check_headers(**kwargs): - assert 'extra_headers' in kwargs - assert 'anthropic-beta' in kwargs['extra_headers'] - assert kwargs['extra_headers']['anthropic-beta'] == 'prompt-caching-2024-07-31' - return ModelResponse( - choices=[{'message': {'content': 'Hello! How can I assist you today?'}}] - ) - - codeact_agent.llm._completion_unwrapped = check_headers - result = codeact_agent.step(mock_state) - - # Assert - assert isinstance(result, MessageAction) - assert result.content == 'Hello! How can I assist you today?'
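Taken together, the `openhands/llm/llm.py` changes replace per-call work with state computed once at initialization: the CloudFlare-specific `APIError` handling and the manual `anthropic-beta: prompt-caching-2024-07-31` header are dropped (the beta header is presumably no longer needed now that Anthropic's prompt caching is handled through litellm without it), and the function-calling support check moves from `is_function_calling_active()` into `init_model_info()`, where its result is cached in `_function_calling_active`. The following is a stripped-down sketch of that caching pattern, not the real class: the model list is trimmed and everything unrelated to the check is omitted.

```python
import litellm

# Trimmed for illustration; the real list lives in openhands/llm/llm.py.
FUNCTION_CALLING_SUPPORTED_MODELS = ['claude-3-5-sonnet-20241022', 'gpt-4o']


class LLMSketch:
    def __init__(self, model: str, native_tool_calling: bool | None = None):
        self.model = model
        self.native_tool_calling = native_tool_calling
        self._function_calling_active = False
        self._init_model_info()

    def _init_model_info(self) -> None:
        # Decide once, at construction time, whether native tool calling is used.
        model_name_supported = (
            self.model in FUNCTION_CALLING_SUPPORTED_MODELS
            or self.model.split('/')[-1] in FUNCTION_CALLING_SUPPORTED_MODELS
            or any(m in self.model for m in FUNCTION_CALLING_SUPPORTED_MODELS)
        )
        if self.native_tool_calling is None:
            self._function_calling_active = model_name_supported
        elif self.native_tool_calling is False:
            self._function_calling_active = False
        else:
            # User forced it on: defer to litellm's own capability check.
            self._function_calling_active = litellm.supports_function_calling(
                model=self.model
            )

    def is_function_calling_active(self) -> bool:
        # Cheap accessor; callers no longer trigger the check on every call.
        return self._function_calling_active
```

The design choice is straightforward: the supported-model matching and the litellm capability lookup only depend on configuration that is fixed for the lifetime of the instance, so computing them once in `init_model_info()` avoids repeating the work on every `is_function_calling_active()` call and keeps the answer stable. A related small hardening in `retry_mixin.py` gives `retry_exceptions` a default of an empty tuple, so `retry_if_exception_type(())` simply retries nothing when no exception types are configured instead of failing on a `None` argument.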