diff --git a/.github/workflows/openhands-resolver.yml b/.github/workflows/openhands-resolver.yml index 77b3c49b785c..45718be9ef3d 100644 --- a/.github/workflows/openhands-resolver.yml +++ b/.github/workflows/openhands-resolver.yml @@ -184,6 +184,7 @@ jobs: }); - name: Install OpenHands + id: install_openhands uses: actions/github-script@v7 env: COMMENT_BODY: ${{ github.event.comment.body || '' }} @@ -196,7 +197,6 @@ jobs: const reviewBody = process.env.REVIEW_BODY.trim(); const labelName = process.env.LABEL_NAME.trim(); const eventName = process.env.EVENT_NAME.trim(); - // Check conditions const isExperimentalLabel = labelName === "fix-me-experimental"; const isIssueCommentExperimental = @@ -205,6 +205,9 @@ jobs: const isReviewCommentExperimental = eventName === "pull_request_review" && reviewBody.includes("@openhands-agent-exp"); + // Set output variable + core.setOutput('isExperimental', isExperimentalLabel || isIssueCommentExperimental || isReviewCommentExperimental); + // Perform package installation if (isExperimentalLabel || isIssueCommentExperimental || isReviewCommentExperimental) { console.log("Installing experimental OpenHands..."); @@ -230,7 +233,8 @@ jobs: --issue-number ${{ env.ISSUE_NUMBER }} \ --issue-type ${{ env.ISSUE_TYPE }} \ --max-iterations ${{ env.MAX_ITERATIONS }} \ - --comment-id ${{ env.COMMENT_ID }} + --comment-id ${{ env.COMMENT_ID }} \ + --is-experimental ${{ steps.install_openhands.outputs.isExperimental }} - name: Check resolution result id: check_result diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py index 03561913087c..6f0cda2efe40 100644 --- a/evaluation/benchmarks/the_agent_company/run_infer.py +++ b/evaluation/benchmarks/the_agent_company/run_infer.py @@ -80,7 +80,7 @@ def load_dependencies(runtime: Runtime) -> List[str]: def init_task_env(runtime: Runtime, hostname: str, env_llm_config: LLMConfig): command = ( f'SERVER_HOSTNAME={hostname} ' - f'LITELLM_API_KEY={env_llm_config.api_key} ' + f'LITELLM_API_KEY={env_llm_config.api_key.get_secret_value() if env_llm_config.api_key else None} ' f'LITELLM_BASE_URL={env_llm_config.base_url} ' f'LITELLM_MODEL={env_llm_config.model} ' 'bash /utils/init.sh' @@ -165,7 +165,7 @@ def run_evaluator( runtime: Runtime, env_llm_config: LLMConfig, trajectory_path: str, result_path: str ): command = ( - f'LITELLM_API_KEY={env_llm_config.api_key} ' + f'LITELLM_API_KEY={env_llm_config.api_key.get_secret_value() if env_llm_config.api_key else None} ' f'LITELLM_BASE_URL={env_llm_config.base_url} ' f'LITELLM_MODEL={env_llm_config.model} ' f"DECRYPTION_KEY='theagentcompany is all you need' " # Hardcoded Key diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 5a5cdd8b86e8..4f165831eca5 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -53,30 +53,6 @@ class EvalMetadata(BaseModel): details: dict[str, Any] | None = None condenser_config: CondenserConfig | None = None - def model_dump(self, *args, **kwargs): - dumped_dict = super().model_dump(*args, **kwargs) - # avoid leaking sensitive information - dumped_dict['llm_config'] = self.llm_config.to_safe_dict() - if hasattr(self.condenser_config, 'llm_config'): - dumped_dict['condenser_config']['llm_config'] = ( - self.condenser_config.llm_config.to_safe_dict() - ) - - return dumped_dict - - def model_dump_json(self, *args, **kwargs): - dumped = super().model_dump_json(*args, **kwargs) - dumped_dict = json.loads(dumped) - # avoid leaking sensitive information - 
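A quick aside on the `get_secret_value()` calls added in `init_task_env` and `run_evaluator` above: the config fields are being migrated to Pydantic's `SecretStr`, which masks itself in `str()`/`repr()` (and therefore in f-strings), so the raw value has to be unwrapped explicitly before it is interpolated into the shell command. A minimal sketch, not project code (`DemoLLMConfig` is illustrative):

```python
# Sketch only: illustrates SecretStr behavior, not project code.
from pydantic import BaseModel, SecretStr


class DemoLLMConfig(BaseModel):
    api_key: SecretStr | None = None


cfg = DemoLLMConfig(api_key='sk-test-123')

# SecretStr masks itself when formatted, so naive interpolation would pass a
# useless placeholder to the command:
print(f'LITELLM_API_KEY={cfg.api_key}')  # LITELLM_API_KEY=**********

# The explicit unwrap used in init_task_env/run_evaluator above:
print(f'LITELLM_API_KEY={cfg.api_key.get_secret_value() if cfg.api_key else None}')
# LITELLM_API_KEY=sk-test-123
```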
dumped_dict['llm_config'] = self.llm_config.to_safe_dict() - if hasattr(self.condenser_config, 'llm_config'): - dumped_dict['condenser_config']['llm_config'] = ( - self.condenser_config.llm_config.to_safe_dict() - ) - - logger.debug(f'Dumped metadata: {dumped_dict}') - return json.dumps(dumped_dict) - class EvalOutput(BaseModel): # NOTE: User-specified @@ -99,23 +75,6 @@ class EvalOutput(BaseModel): # Optionally save the input test instance instance: dict[str, Any] | None = None - def model_dump(self, *args, **kwargs): - dumped_dict = super().model_dump(*args, **kwargs) - # Remove None values - dumped_dict = {k: v for k, v in dumped_dict.items() if v is not None} - # Apply custom serialization for metadata (to avoid leaking sensitive information) - if self.metadata is not None: - dumped_dict['metadata'] = self.metadata.model_dump() - return dumped_dict - - def model_dump_json(self, *args, **kwargs): - dumped = super().model_dump_json(*args, **kwargs) - dumped_dict = json.loads(dumped) - # Apply custom serialization for metadata (to avoid leaking sensitive information) - if 'metadata' in dumped_dict: - dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json()) - return json.dumps(dumped_dict) - class EvalException(Exception): pass @@ -315,15 +274,7 @@ def update_progress( logger.info( f'Finished evaluation for instance {result.instance_id}: {str(result.test_result)[:300]}...\n' ) - - # Custom JSON encoder - class NumpyEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, np.ndarray): - return obj.tolist() # Convert ndarray to list - return super().default(obj) - - output_fp.write(json.dumps(result.model_dump(), cls=NumpyEncoder) + '\n') + output_fp.write(result.model_dump_json() + '\n') output_fp.flush() diff --git a/openhands/core/config/README.md b/openhands/core/config/README.md index 5e3abae5b13a..c612a0824403 100644 --- a/openhands/core/config/README.md +++ b/openhands/core/config/README.md @@ -37,21 +37,17 @@ export SANDBOX_TIMEOUT='300' ## Type Handling -The `load_from_env` function attempts to cast environment variable values to the types specified in the dataclasses. It handles: +The `load_from_env` function attempts to cast environment variable values to the types specified in the models. It handles: - Basic types (str, int, bool) - Optional types (e.g., `str | None`) -- Nested dataclasses +- Nested models If type casting fails, an error is logged, and the default value is retained. ## Default Values -If an environment variable is not set, the default value specified in the dataclass is used. - -## Nested Configurations - -The `AppConfig` class contains nested configurations like `LLMConfig` and `AgentConfig`. The `load_from_env` function handles these by recursively processing nested dataclasses with updated prefixes. +If an environment variable is not set, the default value specified in the model is used. ## Security Considerations diff --git a/openhands/core/config/agent_config.py b/openhands/core/config/agent_config.py index c57a16210027..de29aed9744f 100644 --- a/openhands/core/config/agent_config.py +++ b/openhands/core/config/agent_config.py @@ -1,11 +1,9 @@ -from dataclasses import dataclass, field, fields -import inspect +from pydantic import BaseModel, Field + from openhands.core.config.condenser_config import CondenserConfig, NoOpCondenserConfig -from openhands.core.config.config_utils import get_field_info -@dataclass -class AgentConfig: +class AgentConfig(BaseModel): """Configuration for the agent. 
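The custom `model_dump`/`model_dump_json` overrides deleted above existed mainly to keep API keys out of saved eval output. With `api_key` and similar fields typed as `SecretStr` elsewhere in this diff, Pydantic v2's stock JSON serialization already masks those values, nested models included, which is why `output_fp.write(result.model_dump_json() + '\n')` suffices. A small sketch under that assumption (class names are illustrative, not the real `EvalMetadata`/`LLMConfig`):

```python
# Sketch only: class names are illustrative, not the real EvalMetadata/LLMConfig.
from pydantic import BaseModel, Field, SecretStr


class DemoLLMConfig(BaseModel):
    model: str = 'gpt-4o'
    api_key: SecretStr | None = None


class DemoEvalMetadata(BaseModel):
    agent_class: str = 'CodeActAgent'
    llm_config: DemoLLMConfig = Field(default_factory=DemoLLMConfig)


meta = DemoEvalMetadata(llm_config=DemoLLMConfig(api_key='sk-secret'))

# SecretStr fields serialize masked by default, nested models included,
# so no hand-written "safe dict" step is needed before writing output files.
print(meta.model_dump_json())
# {"agent_class":"CodeActAgent","llm_config":{"model":"gpt-4o","api_key":"**********"}}
```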
Attributes: @@ -24,30 +22,18 @@ class AgentConfig: condenser: Configuration for the memory condenser. Default is NoOpCondenserConfig. """ - function_calling: bool = False - codeact_enable_browsing: bool = True - codeact_enable_llm_editor: bool = False - codeact_enable_jupyter: bool = True - micro_agent_name: str | None = None - memory_enabled: bool = False - memory_max_threads: int = 3 - llm_config: str | None = None - use_microagents: bool = True - disabled_microagents: list[str] | None = None - mind_voice: str | None = None - mind_voice_language: str = 'English' - condenser: CondenserConfig = field(default_factory=NoOpCondenserConfig) # type: ignore - - def defaults_to_dict(self) -> dict: - """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" - result = {} - for f in fields(self): - result[f.name] = get_field_info(f) - return result + codeact_enable_browsing: bool = Field(default=True) + codeact_enable_llm_editor: bool = Field(default=False) + codeact_enable_jupyter: bool = Field(default=True) + micro_agent_name: str | None = Field(default=None) + memory_enabled: bool = Field(default=False) + memory_max_threads: int = Field(default=3) + llm_config: str | None = Field(default=None) + use_microagents: bool = Field(default=True) + disabled_microagents: list[str] | None = Field(default=None) + condenser: CondenserConfig = Field(default_factory=NoOpCondenserConfig) + + function_calling: bool = Field(default=True) + mind_voice: str | None = Field(default=None) + mind_voice_language: str = Field(default='English') - @classmethod - def from_dict(cls, env): - return cls(**{ - k: v for k, v in env.items() - if k in inspect.signature(cls).parameters - }) diff --git a/openhands/core/config/app_config.py b/openhands/core/config/app_config.py index 4d9d5749a9fb..8c3e495fd0d9 100644 --- a/openhands/core/config/app_config.py +++ b/openhands/core/config/app_config.py @@ -1,20 +1,20 @@ -from dataclasses import dataclass, field, fields, is_dataclass from typing import ClassVar -import inspect + +from pydantic import BaseModel, Field, SecretStr + from openhands.core import logger from openhands.core.config.agent_config import AgentConfig from openhands.core.config.config_utils import ( OH_DEFAULT_AGENT, OH_MAX_ITERATIONS, - get_field_info, + model_defaults_to_dict, ) from openhands.core.config.llm_config import LLMConfig from openhands.core.config.sandbox_config import SandboxConfig from openhands.core.config.security_config import SecurityConfig -@dataclass -class AppConfig: +class AppConfig(BaseModel): """Configuration for the app. Attributes: @@ -51,43 +51,45 @@ class AppConfig: input is read line by line. When enabled, input continues until /exit command. 
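For the field conversions in `AgentConfig` above (and the ones that follow): plain dataclass defaults map to `Field(default=...)`, while `field(default_factory=...)` maps to `Field(default_factory=...)`, which keeps mutable defaults per-instance. An illustrative sketch, not the real `AgentConfig`:

```python
# Sketch only: not the real AgentConfig, just the default-handling pattern.
from pydantic import BaseModel, Field


class DemoAgentConfig(BaseModel):
    memory_max_threads: int = Field(default=3)
    disabled_microagents: list[str] = Field(default_factory=list)


a, b = DemoAgentConfig(), DemoAgentConfig()
a.disabled_microagents.append('github')

# default_factory gives each instance its own list, mirroring
# dataclasses.field(default_factory=...) in the code being replaced.
print(b.disabled_microagents)  # []
```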
""" - llms: dict[str, LLMConfig] = field(default_factory=dict) - agents: dict = field(default_factory=dict) - default_agent: str = OH_DEFAULT_AGENT - sandbox: SandboxConfig = field(default_factory=SandboxConfig) - security: SecurityConfig = field(default_factory=SecurityConfig) - runtime: str = 'docker' - file_store: str = 'local' - file_store_path: str = '/tmp/openhands_file_store' - trajectories_path: str | None = None - workspace_base: str = './workspace' - workspace_mount_path: str | None = None - workspace_mount_path_in_sandbox: str = '/workspace' - workspace_mount_rewrite: str | None = None - cache_dir: str = '/tmp/cache' - run_as_openhands: bool = True - show_workspace_contents: bool = True - max_iterations: int = OH_MAX_ITERATIONS - max_budget_per_task: float | None = None - e2b_api_key: str = '' - modal_api_token_id: str = '' - modal_api_token_secret: str = '' - disable_color: bool = False - jwt_secret: str = 'secretpass' - debug: bool = False - file_uploads_max_file_size_mb: int = 0 - file_uploads_restrict_file_types: bool = False - file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*']) - override_UI_settings: bool = False - runloop_api_key: str | None = None - custom_instructions: str = '' - use_selenium: bool = False - dont_restore_state: bool = False - - cli_multiline_input: bool = False + llms: dict[str, LLMConfig] = Field(default_factory=dict) + agents: dict = Field(default_factory=dict) + default_agent: str = Field(default=OH_DEFAULT_AGENT) + sandbox: SandboxConfig = Field(default_factory=SandboxConfig) + security: SecurityConfig = Field(default_factory=SecurityConfig) + runtime: str = Field(default='docker') + file_store: str = Field(default='local') + file_store_path: str = Field(default='/tmp/openhands_file_store') + trajectories_path: str | None = Field(default=None) + workspace_base: str | None = Field(default='./workspace') + workspace_mount_path: str | None = Field(default=None) + workspace_mount_path_in_sandbox: str = Field(default='/workspace') + workspace_mount_rewrite: str | None = Field(default=None) + cache_dir: str = Field(default='/tmp/cache') + run_as_openhands: bool = Field(default=True) + max_iterations: int = Field(default=OH_MAX_ITERATIONS) + max_budget_per_task: float | None = Field(default=None) + e2b_api_key: SecretStr | None = Field(default=None) + modal_api_token_id: SecretStr | None = Field(default=None) + modal_api_token_secret: SecretStr | None = Field(default=None) + disable_color: bool = Field(default=False) + jwt_secret: SecretStr | None = Field(default=None) + debug: bool = Field(default=False) + file_uploads_max_file_size_mb: int = Field(default=0) + file_uploads_restrict_file_types: bool = Field(default=False) + file_uploads_allowed_extensions: list[str] = Field(default_factory=lambda: ['.*']) + runloop_api_key: SecretStr | None = Field(default=None) + cli_multiline_input: bool = Field(default=False) + + show_workspace_contents: bool = Field(default=True) + override_UI_settings: bool = Field(default=False) + custom_instructions: str = Field(default='') + use_selenium: bool = Field(default=False) + dont_restore_state: bool = Field(default=False) defaults_dict: ClassVar[dict] = {} + model_config = {'extra': 'forbid'} + def get_llm_config(self, name='llm') -> LLMConfig: """'llm' is the name for default config (for backward compatibility prior to 0.8).""" if name in self.llms: @@ -126,49 +128,7 @@ def get_llm_config_from_agent(self, name='agent') -> LLMConfig: def get_agent_configs(self) -> dict[str, AgentConfig]: 
return self.agents - def __post_init__(self): + def model_post_init(self, __context): """Post-initialization hook, called when the instance is created with only default values.""" - AppConfig.defaults_dict = self.defaults_to_dict() - - def defaults_to_dict(self) -> dict: - """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" - result = {} - for f in fields(self): - field_value = getattr(self, f.name) - - # dataclasses compute their defaults themselves - if is_dataclass(type(field_value)): - result[f.name] = field_value.defaults_to_dict() - else: - result[f.name] = get_field_info(f) - return result - - def __str__(self): - attr_str = [] - for f in fields(self): - attr_name = f.name - attr_value = getattr(self, f.name) - - if attr_name in [ - 'e2b_api_key', - 'github_token', - 'jwt_secret', - 'modal_api_token_id', - 'modal_api_token_secret', - 'runloop_api_key', - ]: - attr_value = '******' if attr_value else None - - attr_str.append(f'{attr_name}={repr(attr_value)}') - - return f"AppConfig({', '.join(attr_str)}" - - def __repr__(self): - return self.__str__() - - @classmethod - def from_dict(cls, env): - return cls(**{ - k: v for k, v in env.items() - if k in inspect.signature(cls).parameters - }) + super().model_post_init(__context) + AppConfig.defaults_dict = model_defaults_to_dict(self) diff --git a/openhands/core/config/config_utils.py b/openhands/core/config/config_utils.py index 38c3c1d03df5..44893e119b5a 100644 --- a/openhands/core/config/config_utils.py +++ b/openhands/core/config/config_utils.py @@ -1,19 +1,22 @@ from types import UnionType -from typing import get_args, get_origin +from typing import Any, get_args, get_origin + +from pydantic import BaseModel +from pydantic.fields import FieldInfo OH_DEFAULT_AGENT = 'CodeActAgent' OH_MAX_ITERATIONS = 500 -def get_field_info(f): +def get_field_info(field: FieldInfo) -> dict[str, Any]: """Extract information about a dataclass field: type, optional, and default. Args: - f: The field to extract information from. + field: The field to extract information from. Returns: A dict with the field's type, whether it's optional, and its default value. 
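The `get_field_info` rewrite here (and the `model_defaults_to_dict` helper added just below it) walks Pydantic's `model_fields` mapping, where each `FieldInfo` carries the annotation and default that previously came from `dataclasses.fields()`. A minimal sketch of that introspection, with a made-up model:

```python
# Sketch only: DemoConfig stands in for the real config models.
from pydantic import BaseModel, Field


class DemoConfig(BaseModel):
    timeout: int | None = Field(default=120)
    runtime: str = Field(default='docker')


for name, field in DemoConfig.model_fields.items():
    # FieldInfo exposes the annotation and default that get_field_info() reads.
    print(name, field.annotation, field.default)
# timeout int | None 120
# runtime <class 'str'> docker
```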
""" - field_type = f.type + field_type = field.annotation optional = False # for types like str | None, find the non-None type and set optional to True @@ -33,7 +36,21 @@ def get_field_info(f): ) # default is always present - default = f.default + default = field.default # return a schema with the useful info for frontend return {'type': type_name.lower(), 'optional': optional, 'default': default} + + +def model_defaults_to_dict(model: BaseModel) -> dict[str, Any]: + """Serialize field information in a dict for the frontend, including type hints, defaults, and whether it's optional.""" + result = {} + for name, field in model.model_fields.items(): + field_value = getattr(model, name) + + if isinstance(field_value, BaseModel): + result[name] = model_defaults_to_dict(field_value) + else: + result[name] = get_field_info(field) + + return result diff --git a/openhands/core/config/llm_config.py b/openhands/core/config/llm_config.py index bef074860fb0..f7be683c4c57 100644 --- a/openhands/core/config/llm_config.py +++ b/openhands/core/config/llm_config.py @@ -1,16 +1,14 @@ +from __future__ import annotations + import os -import inspect -from dataclasses import dataclass, fields -from typing import Optional +from typing import Any -from openhands.core.config.config_utils import get_field_info -from openhands.core.logger import LOG_DIR +from pydantic import BaseModel, Field, SecretStr -LLM_SENSITIVE_FIELDS = ['api_key', 'aws_access_key_id', 'aws_secret_access_key'] +from openhands.core.logger import LOG_DIR -@dataclass -class LLMConfig: +class LLMConfig(BaseModel): """Configuration for the LLM model. Attributes: @@ -52,101 +50,61 @@ class LLMConfig: native_tool_calling: Whether to use native tool calling if supported by the model. Can be True, False, or not set. 
""" - model: str = 'claude-3-5-sonnet-20241022' - api_key: str | None = None - base_url: str | None = None - api_version: str | None = None - embedding_model: str = 'local' - embedding_base_url: str | None = None - embedding_deployment_name: str | None = None - aws_access_key_id: str | None = None - aws_secret_access_key: str | None = None - aws_region_name: str | None = None - openrouter_site_url: str = 'https://docs.all-hands.dev/' - openrouter_app_name: str = 'OpenHands' - num_retries: int = 8 - retry_multiplier: float = 1.25 - retry_min_wait: int = 1 - retry_max_wait: int = 120 - timeout: int | None = None - max_message_chars: int = 30_000 # maximum number of characters in an observation's content when sent to the llm - temperature: float = 0.0 - top_p: float = 1.0 - custom_llm_provider: str | None = None - max_input_tokens: int | None = None - max_output_tokens: int | None = None - input_cost_per_token: float | None = None - output_cost_per_token: float | None = None - ollama_base_url: str | None = None - message_summary_trunc_tokens_frac: float = 0.75 - enable_cache: bool = True + model: str = Field(default='claude-3-5-sonnet-20241022') + api_key: SecretStr | None = Field(default=None) + base_url: str | None = Field(default=None) + api_version: str | None = Field(default=None) + embedding_model: str = Field(default='local') + embedding_base_url: str | None = Field(default=None) + embedding_deployment_name: str | None = Field(default=None) + aws_access_key_id: SecretStr | None = Field(default=None) + aws_secret_access_key: SecretStr | None = Field(default=None) + aws_region_name: str | None = Field(default=None) + openrouter_site_url: str = Field(default='https://docs.all-hands.dev/') + openrouter_app_name: str = Field(default='OpenHands') + num_retries: int = Field(default=8) + retry_multiplier: float = Field(default=2) + retry_min_wait: int = Field(default=15) + retry_max_wait: int = Field(default=120) + timeout: int | None = Field(default=None) + max_message_chars: int = Field( + default=30_000 + ) # maximum number of characters in an observation's content when sent to the llm + temperature: float = Field(default=0.0) + top_p: float = Field(default=1.0) + custom_llm_provider: str | None = Field(default=None) + max_input_tokens: int | None = Field(default=None) + max_output_tokens: int | None = Field(default=None) + input_cost_per_token: float | None = Field(default=None) + output_cost_per_token: float | None = Field(default=None) + ollama_base_url: str | None = Field(default=None) + message_summary_trunc_tokens_frac: float = Field(default=0.75) + enable_cache: bool = Field(default=True) # This setting can be sent in each call to litellm - drop_params: bool = True + drop_params: bool = Field(default=True) # Note: this setting is actually global, unlike drop_params - modify_params: bool = True - disable_vision: bool | None = None - caching_prompt: bool = True - log_completions: bool = False - log_completions_folder: str = os.path.join(LOG_DIR, 'completions') - draft_editor: Optional['LLMConfig'] = None - custom_tokenizer: str | None = None - use_group: str | None = None - native_tool_calling: bool | None = None - - def defaults_to_dict(self) -> dict: - """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" - result = {} - for f in fields(self): - result[f.name] = get_field_info(f) - return result + modify_params: bool = Field(default=True) + disable_vision: bool | None = Field(default=None) + caching_prompt: bool = 
Field(default=True) + log_completions: bool = Field(default=False) + log_completions_folder: str = Field(default=os.path.join(LOG_DIR, 'completions')) + draft_editor: LLMConfig | None = Field(default=None) + custom_tokenizer: str | None = Field(default=None) + native_tool_calling: bool | None = Field(default=None) + + use_group: str | None = Field(default=None) + + model_config = {'extra': 'forbid'} + + def model_post_init(self, __context: Any): + """Post-initialization hook to assign OpenRouter-related variables to environment variables. - def __post_init__(self): - """ - Post-initialization hook to assign OpenRouter-related variables to environment variables. This ensures that these values are accessible to litellm at runtime. """ + super().model_post_init(__context) # Assign OpenRouter-specific variables to environment variables if self.openrouter_site_url: os.environ['OR_SITE_URL'] = self.openrouter_site_url if self.openrouter_app_name: os.environ['OR_APP_NAME'] = self.openrouter_app_name - - def __str__(self): - attr_str = [] - for f in fields(self): - attr_name = f.name - attr_value = getattr(self, f.name) - - if attr_name in LLM_SENSITIVE_FIELDS: - attr_value = '******' if attr_value else None - - attr_str.append(f'{attr_name}={repr(attr_value)}') - - return f"LLMConfig({', '.join(attr_str)})" - - def __repr__(self): - return self.__str__() - - def to_safe_dict(self): - """Return a dict with the sensitive fields replaced with ******.""" - ret = self.__dict__.copy() - for k, v in ret.items(): - if k in LLM_SENSITIVE_FIELDS: - ret[k] = '******' if v else None - elif isinstance(v, LLMConfig): - ret[k] = v.to_safe_dict() - return ret - - @classmethod - def from_dict(cls, llm_config_dict: dict) -> 'LLMConfig': - """Create an LLMConfig object from a dictionary. - - This function is used to create an LLMConfig object from a dictionary, - with the exception of the 'draft_editor' key, which is a nested LLMConfig object. - """ - args = {k: v for k, v in llm_config_dict.items() if not isinstance(v, dict) and k in inspect.signature(cls).parameters} - if 'draft_editor' in llm_config_dict: - draft_editor_config = LLMConfig(**llm_config_dict['draft_editor']) - args['draft_editor'] = draft_editor_config - return cls(**args) diff --git a/openhands/core/config/sandbox_config.py b/openhands/core/config/sandbox_config.py index a00bb2306e41..8c7307cb1276 100644 --- a/openhands/core/config/sandbox_config.py +++ b/openhands/core/config/sandbox_config.py @@ -1,11 +1,9 @@ import os -import inspect -from dataclasses import dataclass, field, fields -from openhands.core.config.config_utils import get_field_info +from pydantic import BaseModel, Field -@dataclass -class SandboxConfig: + +class SandboxConfig(BaseModel): """Configuration for the sandbox. Attributes: @@ -41,57 +39,35 @@ class SandboxConfig: This should be a JSON string that will be parsed into a dictionary. 
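The `model_post_init` hook used in `LLMConfig` above is Pydantic v2's replacement for the dataclass `__post_init__` hook: it runs after validation on every construction. A small sketch of the same pattern (illustrative single-field class):

```python
# Sketch only: illustrative single-field config.
import os

from pydantic import BaseModel, Field


class DemoConfig(BaseModel):
    openrouter_site_url: str = Field(default='https://docs.all-hands.dev/')

    def model_post_init(self, __context) -> None:
        # Runs after validation on every construction, like __post_init__
        # did on the old dataclass.
        if self.openrouter_site_url:
            os.environ['OR_SITE_URL'] = self.openrouter_site_url


DemoConfig()
print(os.environ['OR_SITE_URL'])  # https://docs.all-hands.dev/
```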
""" - remote_runtime_api_url: str = 'http://localhost:8000' - local_runtime_url: str = 'http://localhost' - keep_runtime_alive: bool = True - rm_all_containers: bool = False - api_key: str | None = None - base_container_image: str = 'docker.io/nikolaik/python-nodejs:python3.12-nodejs22' - runtime_container_image: str | None = None - user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000 - timeout: int = 120 - remote_runtime_init_timeout: int = 180 - enable_auto_lint: bool = ( - False # once enabled, OpenHands would lint files after editing + remote_runtime_api_url: str = Field(default='http://localhost:8000') + local_runtime_url: str = Field(default='http://localhost') + keep_runtime_alive: bool = Field(default=True) + rm_all_containers: bool = Field(default=False) + api_key: str | None = Field(default=None) + base_container_image: str = Field( + default='docker.io/nikolaik/python-nodejs:python3.12-nodejs22' + ) + runtime_container_image: str | None = Field(default=None) + user_id: int = Field(default=os.getuid() if hasattr(os, 'getuid') else 1000) + timeout: int = Field(default=120) + remote_runtime_init_timeout: int = Field(default=180) + enable_auto_lint: bool = Field( + default=False # once enabled, OpenHands would lint files after editing ) - use_host_network: bool = True - runtime_extra_build_args: list[str] | None = None - initialize_plugins: bool = True - force_rebuild_runtime: bool = False - runtime_extra_deps: str | None = None - runtime_startup_env_vars: dict[str, str] = field(default_factory=dict) - browsergym_eval_env: str | None = None + use_host_network: bool = Field(default=True) + runtime_extra_build_args: list[str] | None = Field(default=None) + initialize_plugins: bool = Field(default=True) + force_rebuild_runtime: bool = Field(default=False) + runtime_extra_deps: str | None = Field(default=None) + runtime_startup_env_vars: dict[str, str] = Field(default_factory=dict) + browsergym_eval_env: str | None = Field(default=None) + platform: str | None = Field(default=None) + close_delay: int = Field(default=900) + remote_runtime_resource_factor: int = Field(default=1) + enable_gpu: bool = Field(default=False) + docker_runtime_kwargs: str | None = Field(default=None) + persist_sandbox: bool = True port: int = 63710 - platform: str | None = None - close_delay: int = 900 - remote_runtime_resource_factor: int = 1 - enable_gpu: bool = False - docker_runtime_kwargs: str | None = None - - def defaults_to_dict(self) -> dict: - """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" - dict = {} - for f in fields(self): - dict[f.name] = get_field_info(f) - return dict - - def __str__(self): - attr_str = [] - for f in fields(self): - attr_name = f.name - attr_value = getattr(self, f.name) - - attr_str.append(f'{attr_name}={repr(attr_value)}') - - return f"SandboxConfig({', '.join(attr_str)})" - - def __repr__(self): - return self.__str__() - @classmethod - def from_dict(cls, env): - return cls(**{ - k: v for k, v in env.items() - if k in inspect.signature(cls).parameters - }) + model_config = {'extra': 'forbid'} diff --git a/openhands/core/config/security_config.py b/openhands/core/config/security_config.py index 0cc87f39e53b..a4805e3ab85f 100644 --- a/openhands/core/config/security_config.py +++ b/openhands/core/config/security_config.py @@ -1,10 +1,7 @@ -from dataclasses import dataclass, fields -import inspect -from openhands.core.config.config_utils import get_field_info +from pydantic import BaseModel, Field -@dataclass 
-class SecurityConfig: +class SecurityConfig(BaseModel): """Configuration for security related functionalities. Attributes: @@ -12,33 +9,5 @@ class SecurityConfig: security_analyzer: The security analyzer to use. """ - confirmation_mode: bool = False - security_analyzer: str | None = None - - def defaults_to_dict(self) -> dict: - """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional.""" - dict = {} - for f in fields(self): - dict[f.name] = get_field_info(f) - return dict - - def __str__(self): - attr_str = [] - for f in fields(self): - attr_name = f.name - attr_value = getattr(self, f.name) - - attr_str.append(f'{attr_name}={repr(attr_value)}') - - return f"SecurityConfig({', '.join(attr_str)})" - - @classmethod - def from_dict(cls, security_config_dict: dict) -> 'SecurityConfig': - return cls(**{ - k: v for k, v in security_config_dict.items() - if k in inspect.signature(cls).parameters - }) - - def __repr__(self): - return self.__str__() - \ No newline at end of file + confirmation_mode: bool = Field(default=False) + security_analyzer: str | None = Field(default=None) diff --git a/openhands/core/config/utils.py b/openhands/core/config/utils.py index d382480b1816..31938264c972 100644 --- a/openhands/core/config/utils.py +++ b/openhands/core/config/utils.py @@ -3,13 +3,13 @@ import pathlib import platform import sys -from dataclasses import is_dataclass from types import UnionType from typing import Any, MutableMapping, get_args, get_origin from uuid import uuid4 import toml from dotenv import load_dotenv +from pydantic import BaseModel, ValidationError from openhands.core import logger from openhands.core.config.agent_config import AgentConfig @@ -43,17 +43,19 @@ def get_optional_type(union_type: UnionType) -> Any: return next((t for t in types if t is not type(None)), None) # helper function to set attributes based on env vars - def set_attr_from_env(sub_config: Any, prefix=''): - """Set attributes of a config dataclass based on environment variables.""" - for field_name, field_type in sub_config.__annotations__.items(): + def set_attr_from_env(sub_config: BaseModel, prefix=''): + """Set attributes of a config model based on environment variables.""" + for field_name, field_info in sub_config.model_fields.items(): + field_value = getattr(sub_config, field_name) + field_type = field_info.annotation + # compute the expected env var name from the prefix and field name # e.g. 
LLM_BASE_URL env_var_name = (prefix + field_name).upper() - if is_dataclass(field_type): - # nested dataclass - nested_sub_config = getattr(sub_config, field_name) - set_attr_from_env(nested_sub_config, prefix=field_name + '_') + if isinstance(field_value, BaseModel): + set_attr_from_env(field_value, prefix=field_name + '_') + elif env_var_name in env_or_toml_dict: # convert the env var to the correct type and set it value = env_or_toml_dict[env_var_name] @@ -122,45 +124,60 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'): if isinstance(value, dict): try: if key is not None and key.lower() == 'agent': + # Every entry here is either a field for the default `agent` config group, or itself a group + # The best way to tell the difference is to try to parse it as an AgentConfig object + agent_group_ids: set[str] = set() + for nested_key, nested_value in value.items(): + if isinstance(nested_value, dict): + try: + agent_config = AgentConfig(**nested_value) + except ValidationError: + continue + agent_group_ids.add(nested_key) + cfg.set_agent_config(agent_config, nested_key) + logger.openhands_logger.debug( 'Attempt to load default agent config from config toml' ) - non_dict_fields = { - k: v for k, v in value.items() if not isinstance(v, dict) + value_without_groups = { + k: v for k, v in value.items() if k not in agent_group_ids } - agent_config = AgentConfig.from_dict(non_dict_fields) + agent_config = AgentConfig(**value_without_groups) cfg.set_agent_config(agent_config, 'agent') - for nested_key, nested_value in value.items(): - if isinstance(nested_value, dict): - logger.openhands_logger.debug( - f'Attempt to load group {nested_key} from config toml as agent config' - ) - agent_config = AgentConfig.from_dict(nested_value) - cfg.set_agent_config(agent_config, nested_key) + elif key is not None and key.lower() == 'llm': - # logger.openhands_logger.debug( - # 'Attempt to load default LLM config from config toml' - # ) - llm_config = LLMConfig.from_dict(value) - cfg.set_llm_config(llm_config, 'llm') + # Every entry here is either a field for the default `llm` config group, or itself a group + # The best way to tell the difference is to try to parse it as an LLMConfig object + llm_group_ids: set[str] = set() for nested_key, nested_value in value.items(): if isinstance(nested_value, dict): - # logger.openhands_logger.debug( - # f'Attempt to load group {nested_key} from config toml as llm config' - # ) - llm_config = LLMConfig.from_dict(nested_value) + try: + llm_config = LLMConfig(**nested_value) + except ValidationError: + continue + llm_group_ids.add(nested_key) cfg.set_llm_config(llm_config, nested_key) + + logger.openhands_logger.debug( + 'Attempt to load default LLM config from config toml' + ) + value_without_groups = { + k: v for k, v in value.items() if k not in llm_group_ids + } + llm_config = LLMConfig(**value_without_groups) + cfg.set_llm_config(llm_config, 'llm') + elif key is not None and key.lower() == 'security': logger.openhands_logger.debug( 'Attempt to load security config from config toml' ) - security_config = SecurityConfig.from_dict(value) + security_config = SecurityConfig(**value) cfg.security = security_config elif not key.startswith('sandbox') and key.lower() != 'core': logger.openhands_logger.warning( f'Unknown key in {toml_file}: "{key}"' ) - except (TypeError, KeyError) as e: + except (TypeError, KeyError, ValidationError) as e: logger.openhands_logger.warning( f'Cannot parse [{key}] config from toml, values have not been applied.\nError: {e}', 
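The TOML loader above tells a named sub-group (e.g. an `[llm.gpt4]` table) apart from plain keys of the default group by attempting to construct the config object and catching `ValidationError`. That discrimination works because the models set `extra: forbid`, so tables with unknown keys fail fast instead of being silently accepted. A sketch of the idea with an illustrative model (the real loader uses `LLMConfig`/`AgentConfig`):

```python
# Sketch only: DemoLLMConfig is illustrative; the real loader uses LLMConfig/AgentConfig.
from pydantic import BaseModel, ValidationError


class DemoLLMConfig(BaseModel):
    model: str = 'claude-3-5-sonnet-20241022'
    temperature: float = 0.0

    model_config = {'extra': 'forbid'}


def looks_like_config_group(table: dict) -> bool:
    """Mirror the loader's check: does this nested table parse as a config?"""
    try:
        DemoLLMConfig(**table)
        return True
    except ValidationError:
        return False


print(looks_like_config_group({'model': 'gpt-4o', 'temperature': 0.2}))  # True
print(looks_like_config_group({'some_unknown_key': 'value'}))            # False
```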
exc_info=False, @@ -197,7 +214,7 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'): logger.openhands_logger.warning( f'Unknown config key "{key}" in [core] section' ) - except (TypeError, KeyError) as e: + except (TypeError, KeyError, ValidationError) as e: logger.openhands_logger.warning( f'Cannot parse [sandbox] config from toml, values have not been applied.\nError: {e}', exc_info=False, @@ -299,7 +316,7 @@ def get_llm_config_arg( return LLMConfig(**toml_config[llm_config_arg]) # update the llm config with the specified section if 'llm' in toml_config and llm_config_arg in toml_config['llm']: - return LLMConfig.from_dict(toml_config['llm'][llm_config_arg]) + return LLMConfig(**toml_config['llm'][llm_config_arg]) raise ValueError(f'Loading from toml failed for {llm_config_arg}') diff --git a/openhands/llm/async_llm.py b/openhands/llm/async_llm.py index 233def8da8b5..a6cbc7f34cd3 100644 --- a/openhands/llm/async_llm.py +++ b/openhands/llm/async_llm.py @@ -19,7 +19,9 @@ def __init__(self, *args, **kwargs): self._async_completion = partial( self._call_acompletion, model=self.config.model, - api_key=self.config.api_key, + api_key=self.config.api_key.get_secret_value() + if self.config.api_key + else None, base_url=self.config.base_url, api_version=self.config.api_version, custom_llm_provider=self.config.custom_llm_provider, diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index 61177b4e3726..1ffa250a43e3 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -189,7 +189,9 @@ def __init__( self._completion = partial( litellm_completion, model=self.config.model, - api_key=self.config.api_key, + api_key=self.config.api_key.get_secret_value() + if self.config.api_key + else None, base_url=self.config.base_url, api_version=self.config.api_version, custom_llm_provider=self.config.custom_llm_provider, @@ -459,7 +461,9 @@ def init_model_info(self): # GET {base_url}/v1/model/info with litellm_model_id as path param response = requests.get( f'{self.config.base_url}/v1/model/info', - headers={'Authorization': f'Bearer {self.config.api_key}'}, + headers={ + 'Authorization': f'Bearer {self.config.api_key.get_secret_value() if self.config.api_key else None}' + }, ) resp_json = response.json() if 'data' not in resp_json: diff --git a/openhands/llm/streaming_llm.py b/openhands/llm/streaming_llm.py index a45beba985ab..b7dacefd7660 100644 --- a/openhands/llm/streaming_llm.py +++ b/openhands/llm/streaming_llm.py @@ -16,7 +16,9 @@ def __init__(self, *args, **kwargs): self._async_streaming_completion = partial( self._call_acompletion, model=self.config.model, - api_key=self.config.api_key, + api_key=self.config.api_key.get_secret_value() + if self.config.api_key + else None, base_url=self.config.base_url, api_version=self.config.api_version, custom_llm_provider=self.config.custom_llm_provider, diff --git a/openhands/resolver/resolve_issue.py b/openhands/resolver/resolve_issue.py index 21036dbc29b8..f50b37d79447 100644 --- a/openhands/resolver/resolve_issue.py +++ b/openhands/resolver/resolve_issue.py @@ -14,12 +14,7 @@ import openhands from openhands.controller.state.state import State -from openhands.core.config import ( - AgentConfig, - AppConfig, - LLMConfig, - SandboxConfig, -) +from openhands.core.config import AgentConfig, AppConfig, LLMConfig, SandboxConfig from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction @@ -153,7 +148,7 @@ async 
def process_issue( max_iterations: int, llm_config: LLMConfig, output_dir: str, - runtime_container_image: str, + runtime_container_image: str | None, prompt_template: str, issue_handler: IssueHandlerInterface, repo_instruction: str | None = None, @@ -306,7 +301,7 @@ async def resolve_issue( max_iterations: int, output_dir: str, llm_config: LLMConfig, - runtime_container_image: str, + runtime_container_image: str | None, prompt_template: str, issue_type: str, repo_instruction: str | None, @@ -583,11 +578,16 @@ def int_or_none(value): default=None, help="Target branch to pull and create PR against (for PRs). If not specified, uses the PR's base branch.", ) + parser.add_argument( + '--is-experimental', + type=lambda x: x.lower() == 'true', + help='Whether to run in experimental mode.', + ) my_args = parser.parse_args() runtime_container_image = my_args.runtime_container_image - if runtime_container_image is None: + if runtime_container_image is None and not my_args.is_experimental: runtime_container_image = ( f'ghcr.io/all-hands-ai/runtime:{openhands.__version__}-nikolaik' ) diff --git a/openhands/runtime/impl/modal/modal_runtime.py b/openhands/runtime/impl/modal/modal_runtime.py index 4995f07503ff..8d5aa57e7d10 100644 --- a/openhands/runtime/impl/modal/modal_runtime.py +++ b/openhands/runtime/impl/modal/modal_runtime.py @@ -60,7 +60,8 @@ def __init__( self.sid = sid self.modal_client = modal.Client.from_credentials( - config.modal_api_token_id, config.modal_api_token_secret + config.modal_api_token_id.get_secret_value(), + config.modal_api_token_secret.get_secret_value(), ) self.app = modal.App.lookup( 'openhands', create_if_missing=True, client=self.modal_client diff --git a/openhands/runtime/impl/runloop/runloop_runtime.py b/openhands/runtime/impl/runloop/runloop_runtime.py index 51628f54056d..add4619aea81 100644 --- a/openhands/runtime/impl/runloop/runloop_runtime.py +++ b/openhands/runtime/impl/runloop/runloop_runtime.py @@ -40,7 +40,7 @@ def __init__( self.devbox: DevboxView | None = None self.config = config self.runloop_api_client = Runloop( - bearer_token=config.runloop_api_key, + bearer_token=config.runloop_api_key.get_secret_value(), ) self.container_name = CONTAINER_NAME_PREFIX + sid super().__init__( diff --git a/openhands/server/routes/public.py b/openhands/server/routes/public.py index 5a8925b741b4..fcdb1e52cef7 100644 --- a/openhands/server/routes/public.py +++ b/openhands/server/routes/public.py @@ -51,8 +51,8 @@ async def get_litellm_models() -> list[str]: ): bedrock_model_list = bedrock.list_foundation_models( llm_config.aws_region_name, - llm_config.aws_access_key_id, - llm_config.aws_secret_access_key, + llm_config.aws_access_key_id.get_secret_value(), + llm_config.aws_secret_access_key.get_secret_value(), ) model_list = litellm_model_list_without_bedrock + bedrock_model_list for llm_config in config.llms.values(): diff --git a/openhands/utils/embeddings.py b/openhands/utils/embeddings.py index 7e251f0e5022..6791787d3204 100644 --- a/openhands/utils/embeddings.py +++ b/openhands/utils/embeddings.py @@ -90,7 +90,9 @@ def get_embedding_model(strategy: str, llm_config: LLMConfig) -> 'BaseEmbedding' return OpenAIEmbedding( model='text-embedding-ada-002', - api_key=llm_config.api_key, + api_key=llm_config.api_key.get_secret_value() + if llm_config.api_key + else None, ) elif strategy == 'azureopenai': from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding diff --git a/poetry.lock b/poetry.lock index 6df255f5826c..51f9746cac06 100644 --- a/poetry.lock +++ 
b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -5389,17 +5389,15 @@ realtime = ["websockets (>=13,<15)"] [[package]] name = "openhands-aci" -version = "0.1.6" +version = "0.1.8" description = "An Agent-Computer Interface (ACI) designed for software development agents OpenHands." optional = false -python-versions = "<4.0,>=3.12" -files = [ - {file = "openhands_aci-0.1.6-py3-none-any.whl", hash = "sha256:e9589d959a146fad3e6935be1f80b7a4368dd7aa2ba38ad267862c4f8a246e72"}, - {file = "openhands_aci-0.1.6.tar.gz", hash = "sha256:6edf4d6478a349140a324c4a0c4be6d1e9a7acce1739a37d02eecbb9006a2ce7"}, -] +python-versions = "^3.12" +files = [] +develop = false [package.dependencies] -diskcache = ">=5.6.3,<6.0.0" +diskcache = "^5.6.3" flake8 = "*" gitpython = "*" grep-ast = "0.3.3" @@ -5409,7 +5407,13 @@ numpy = "*" pandas = "*" scipy = "*" tree-sitter = "0.21.3" -whatthepatch = ">=1.0.6,<2.0.0" +whatthepatch = "^1.0.6" + +[package.source] +type = "git" +url = "https://github.com/All-Hands-AI/openhands-aci.git" +reference = "fix-find-show-only-hidden-subpaths" +resolved_reference = "910e8c470aff0e496bf262bc673c7ee7b4531159" [[package]] name = "opentelemetry-api" @@ -9853,4 +9857,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "239681e32cbe17b32855c0bccaf636cc05c55a5411fdb79d180ab3ad833284ea" +content-hash = "8320b6c6bb05538516a965589ce03fec4d30df38fb7b47fc934258f1d8d47e30" diff --git a/pyproject.toml b/pyproject.toml index 3753ae45d200..af177c2a6fd9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ runloop-api-client = "0.12.0" libtmux = ">=0.37,<0.40" pygithub = "^2.5.0" joblib = "*" -openhands-aci = "0.1.6" +openhands-aci = "0.1.8" python-socketio = "^5.11.4" redis = "^5.2.0" sse-starlette = "^2.1.3" diff --git a/tests/unit/test_acompletion.py b/tests/unit/test_acompletion.py index b6753759be3d..cca18bbb5b29 100644 --- a/tests/unit/test_acompletion.py +++ b/tests/unit/test_acompletion.py @@ -109,9 +109,6 @@ async def mock_on_cancel_requested(): print(f'Cancel requested: {is_set}') return is_set - config = load_app_config() - config.on_cancel_requested_fn = mock_on_cancel_requested - async def mock_acompletion(*args, **kwargs): print('Starting mock_acompletion') for i in range(20): # Increased iterations for longer running task @@ -153,13 +150,6 @@ async def cancel_after_delay(): async def test_async_streaming_completion_with_user_cancellation(cancel_after_chunks): cancel_requested = False - async def mock_on_cancel_requested(): - nonlocal cancel_requested - return cancel_requested - - config = load_app_config() - config.on_cancel_requested_fn = mock_on_cancel_requested - test_messages = [ 'This is ', 'a test ', diff --git a/tests/unit/test_codeact_agent.py b/tests/unit/test_codeact_agent.py index c17607b193af..b4074e63b694 100644 --- a/tests/unit/test_codeact_agent.py +++ b/tests/unit/test_codeact_agent.py @@ -60,7 +60,6 @@ def mock_state() -> State: def test_cmd_output_observation_message(agent: CodeActAgent): - agent.config.function_calling = False obs = CmdOutputObservation( command='echo hello', content='Command output', @@ -82,7 +81,6 @@ def test_cmd_output_observation_message(agent: CodeActAgent): def test_ipython_run_cell_observation_message(agent: CodeActAgent): - 
agent.config.function_calling = False obs = IPythonRunCellObservation( code='plt.plot()', content='IPython output\n![image](data:image/png;base64,ABC123)', @@ -105,7 +103,6 @@ def test_ipython_run_cell_observation_message(agent: CodeActAgent): def test_agent_delegate_observation_message(agent: CodeActAgent): - agent.config.function_calling = False obs = AgentDelegateObservation( content='Content', outputs={'content': 'Delegated agent output'} ) @@ -122,7 +119,6 @@ def test_agent_delegate_observation_message(agent: CodeActAgent): def test_error_observation_message(agent: CodeActAgent): - agent.config.function_calling = False obs = ErrorObservation('Error message') results = agent.get_observation_message(obs, tool_call_id_to_message={}) @@ -145,7 +141,6 @@ def test_unknown_observation_message(agent: CodeActAgent): def test_file_edit_observation_message(agent: CodeActAgent): - agent.config.function_calling = False obs = FileEditObservation( path='/test/file.txt', prev_exist=True, @@ -167,7 +162,6 @@ def test_file_edit_observation_message(agent: CodeActAgent): def test_file_read_observation_message(agent: CodeActAgent): - agent.config.function_calling = False obs = FileReadObservation( path='/test/file.txt', content='File content', @@ -186,7 +180,6 @@ def test_file_read_observation_message(agent: CodeActAgent): def test_browser_output_observation_message(agent: CodeActAgent): - agent.config.function_calling = False obs = BrowserOutputObservation( url='http://example.com', trigger_by_action='browse', @@ -207,7 +200,6 @@ def test_browser_output_observation_message(agent: CodeActAgent): def test_user_reject_observation_message(agent: CodeActAgent): - agent.config.function_calling = False obs = UserRejectObservation('Action rejected') results = agent.get_observation_message(obs, tool_call_id_to_message={}) @@ -223,7 +215,6 @@ def test_user_reject_observation_message(agent: CodeActAgent): def test_function_calling_observation_message(agent: CodeActAgent): - agent.config.function_calling = True mock_response = { 'id': 'mock_id', 'total_calls_in_response': 1, diff --git a/tests/unit/test_condenser.py b/tests/unit/test_condenser.py new file mode 100644 index 000000000000..91878c86baa1 --- /dev/null +++ b/tests/unit/test_condenser.py @@ -0,0 +1,520 @@ +from datetime import datetime +from typing import Any +from unittest.mock import MagicMock + +import pytest + +from openhands.controller.state.state import State +from openhands.core.config.condenser_config import ( + AmortizedForgettingCondenserConfig, + LLMAttentionCondenserConfig, + LLMSummarizingCondenserConfig, + NoOpCondenserConfig, + ObservationMaskingCondenserConfig, + RecentEventsCondenserConfig, +) +from openhands.core.config.llm_config import LLMConfig +from openhands.events.event import Event, EventSource +from openhands.events.observation.observation import Observation +from openhands.llm import LLM +from openhands.memory.condenser import ( + AmortizedForgettingCondenser, + Condenser, + ImportantEventSelection, + LLMAttentionCondenser, + LLMSummarizingCondenser, + NoOpCondenser, + ObservationMaskingCondenser, + RecentEventsCondenser, +) + + +def create_test_event( + message: str, timestamp: datetime | None = None, id: int | None = None +) -> Event: + """Create a simple test event.""" + event = Event() + event._message = message + event.timestamp = timestamp if timestamp else datetime.now() + if id: + event._id = id + event._source = EventSource.USER + return event + + +@pytest.fixture +def mock_llm() -> LLM: + """Mocks an LLM object with a 
utility function for setting and resetting response contents in unit tests.""" + # Create a MagicMock for the LLM object + mock_llm = MagicMock( + spec=LLM, + config=MagicMock( + spec=LLMConfig, model='gpt-4o', api_key='test_key', custom_llm_provider=None + ), + metrics=MagicMock(), + ) + _mock_content = None + + # Set a mock message with the mocked content + mock_message = MagicMock() + mock_message.content = _mock_content + + def set_mock_response_content(content: Any): + """Set the mock response for the LLM.""" + nonlocal mock_message + mock_message.content = content + + mock_choice = MagicMock() + mock_choice.message = mock_message + + mock_response = MagicMock() + mock_response.choices = [mock_choice] + + mock_llm.completion.return_value = mock_response + + # Attach helper methods to the mock object + mock_llm.set_mock_response_content = set_mock_response_content + + return mock_llm + + +@pytest.fixture +def mock_state() -> State: + """Mocks a State object with the only parameters needed for testing condensers: history and extra_data.""" + mock_state = MagicMock(spec=State) + mock_state.history = [] + mock_state.extra_data = {} + + return mock_state + + +def test_noop_condenser_from_config(): + """Test that the NoOpCondenser objects can be made from config.""" + config = NoOpCondenserConfig() + condenser = Condenser.from_config(config) + + assert isinstance(condenser, NoOpCondenser) + + +def test_noop_condenser(): + """Test that NoOpCondensers preserve their input events.""" + events = [ + create_test_event('Event 1'), + create_test_event('Event 2'), + create_test_event('Event 3'), + ] + + mock_state = MagicMock() + mock_state.history = events + + condenser = NoOpCondenser() + result = condenser.condensed_history(mock_state) + + assert result == events + + +def test_observation_masking_condenser_from_config(): + """Test that ObservationMaskingCondenser objects can be made from config.""" + attention_window = 5 + config = ObservationMaskingCondenserConfig(attention_window=attention_window) + condenser = Condenser.from_config(config) + + assert isinstance(condenser, ObservationMaskingCondenser) + assert condenser.attention_window == attention_window + + +def test_observation_masking_condenser_respects_attention_window(mock_state): + """Test that ObservationMaskingCondenser only masks events outside the attention window.""" + attention_window = 3 + condenser = ObservationMaskingCondenser(attention_window=attention_window) + + events = [ + create_test_event('Event 1'), + Observation('Observation 1'), + create_test_event('Event 3'), + create_test_event('Event 4'), + Observation('Observation 2'), + ] + + mock_state.history = events + result = condenser.condensed_history(mock_state) + + assert len(result) == len(events) + + for index, (event, condensed_event) in enumerate(zip(events, result)): + # If we're outside the attention window, observations should be masked. + if index < len(events) - attention_window: + if isinstance(event, Observation): + assert '' in str(condensed_event) + + # If we're within the attention window, events are unchanged. 
+ else: + assert event == condensed_event + + +def test_recent_events_condenser_from_config(): + """Test that RecentEventsCondenser objects can be made from config.""" + max_events = 5 + keep_first = True + config = RecentEventsCondenserConfig(keep_first=keep_first, max_events=max_events) + condenser = Condenser.from_config(config) + + assert isinstance(condenser, RecentEventsCondenser) + assert condenser.max_events == max_events + assert condenser.keep_first == keep_first + + +def test_recent_events_condenser(): + """Test that RecentEventsCondensers keep just the most recent events.""" + events = [ + create_test_event('Event 1'), + create_test_event('Event 2'), + create_test_event('Event 3'), + create_test_event('Event 4'), + create_test_event('Event 5'), + ] + + mock_state = MagicMock() + mock_state.history = events + + # If the max_events are larger than the number of events, equivalent to a NoOpCondenser. + condenser = RecentEventsCondenser(max_events=len(events)) + result = condenser.condensed_history(mock_state) + + assert result == events + + # If the max_events are smaller than the number of events, only keep the last few. + max_events = 2 + condenser = RecentEventsCondenser(max_events=max_events) + result = condenser.condensed_history(mock_state) + + assert len(result) == max_events + assert result[0]._message == 'Event 4' + assert result[1]._message == 'Event 5' + + # If the keep_first flag is set, the first event will always be present. + keep_first = 1 + max_events = 2 + condenser = RecentEventsCondenser(keep_first=keep_first, max_events=max_events) + result = condenser.condensed_history(mock_state) + + assert len(result) == max_events + assert result[0]._message == 'Event 1' + assert result[1]._message == 'Event 5' + + # We should be able to keep more of the initial events. + keep_first = 2 + max_events = 3 + condenser = RecentEventsCondenser(keep_first=keep_first, max_events=max_events) + result = condenser.condensed_history(mock_state) + + assert len(result) == max_events + assert result[0]._message == 'Event 1' + assert result[1]._message == 'Event 2' + assert result[2]._message == 'Event 5' + + +def test_llm_condenser_from_config(): + """Test that LLMCondensers can be made from config.""" + config = LLMSummarizingCondenserConfig( + llm_config=LLMConfig( + model='gpt-4o', + api_key='test_key', + ) + ) + condenser = Condenser.from_config(config) + + assert isinstance(condenser, LLMSummarizingCondenser) + assert condenser.llm.config.model == 'gpt-4o' + assert condenser.llm.config.api_key.get_secret_value() == 'test_key' + + +def test_llm_condenser(mock_llm, mock_state): + """Test that LLMCondensers use the LLM to generate a summary event.""" + events = [ + create_test_event('Event 1'), + create_test_event('Event 2'), + ] + mock_state.history = events + + mock_llm.metrics = MagicMock() + mock_llm.metrics.get.return_value = {'test_metric': 1.0} + + mock_llm.set_mock_response_content('Summary of events') + + condenser = LLMSummarizingCondenser(llm=mock_llm) + result = condenser.condensed_history(mock_state) + + assert len(result) == 1 + assert result[0].content == 'Summary of events' + + # Verify LLM was called with correct prompt. 
+ mock_llm.completion.assert_called_once() + call_args = mock_llm.completion.call_args[1] + assert 'messages' in call_args + assert len(call_args['messages']) == 1 + assert 'Event 1' in call_args['messages'][0]['content'] + assert 'Event 2' in call_args['messages'][0]['content'] + + # Verify metrics were added to state + assert 'condenser_meta' in mock_state.extra_data + assert len(mock_state.extra_data['condenser_meta']) == 1 + assert mock_state.extra_data['condenser_meta'][0]['metrics'] == {'test_metric': 1.0} + + +def test_llm_condenser_error(): + """Test that LLM errors are propagated during condensation.""" + events = [create_test_event('Event 1', datetime(2024, 1, 1, 10, 0))] + + mock_state = MagicMock() + mock_state.history = events + + mock_llm = MagicMock() + mock_llm.completion.side_effect = Exception('LLM error') + + condenser = LLMSummarizingCondenser(llm=mock_llm) + + try: + condenser.condensed_history(mock_state) + raise AssertionError('Expected exception was not raised.') + except Exception as e: + assert str(e) == 'LLM error' + + +def test_amortized_forgetting_condenser_from_config(): + """Test that AmortizedForgettingCondenser objects can be made from config.""" + max_size = 50 + keep_first = 10 + config = AmortizedForgettingCondenserConfig( + max_size=max_size, keep_first=keep_first + ) + condenser = Condenser.from_config(config) + + assert isinstance(condenser, AmortizedForgettingCondenser) + assert condenser.max_size == max_size + assert condenser.keep_first == keep_first + + +def test_amortized_forgetting_condenser_invalid_config(): + """Test that AmortizedForgettingCondenser raises error when keep_first > max_size.""" + pytest.raises(ValueError, AmortizedForgettingCondenser, max_size=4, keep_first=2) + pytest.raises(ValueError, AmortizedForgettingCondenser, max_size=0) + pytest.raises(ValueError, AmortizedForgettingCondenser, keep_first=-1) + + +def test_amortized_forgetting_condenser_grows_to_max_size(): + """Test that AmortizedForgettingCondenser correctly maintains an event context up to max size.""" + max_size = 15 + condenser = AmortizedForgettingCondenser(max_size=max_size) + + mock_state = MagicMock() + mock_state.extra_data = {} + mock_state.history = [] + + for i in range(max_size): + event = create_test_event(f'Event {i}') + mock_state.history.append(event) + results = condenser.condensed_history(mock_state) + assert len(results) == i + 1 + + +def test_amortized_forgetting_condenser_forgets_when_larger_than_max_size(): + """Test that the AmortizedForgettingCondenser forgets events when the context grows too large.""" + max_size = 2 + condenser = AmortizedForgettingCondenser(max_size=max_size) + + mock_state = MagicMock() + mock_state.extra_data = {} + mock_state.history = [] + + for i in range(max_size * 10): + event = create_test_event(f'Event {i}') + mock_state.history.append(event) + results = condenser.condensed_history(mock_state) + + # The last event in the results is always the event we just added. + assert results[-1] == event + + # The number of results should bounce back and forth between 1, 2, 1, 2, ... 
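As a side note on `test_llm_condenser_error` above: the manual try/except could equivalently be phrased with `pytest.raises`. A sketch that reuses the module's `create_test_event` helper and `mock_state` fixture (the test name is made up, and this is not part of the patch):

```python
# Sketch only: an equivalent phrasing of test_llm_condenser_error using pytest.raises.
from unittest.mock import MagicMock

import pytest

from openhands.memory.condenser import LLMSummarizingCondenser


def test_llm_condenser_error_with_pytest_raises(mock_state):
    mock_state.history = [create_test_event('Event 1')]

    mock_llm = MagicMock()
    mock_llm.completion.side_effect = Exception('LLM error')

    condenser = LLMSummarizingCondenser(llm=mock_llm)
    with pytest.raises(Exception, match='LLM error'):
        condenser.condensed_history(mock_state)
```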
+        assert len(results) == (i % 2) + 1
+
+
+def test_amortized_forgetting_condenser_keeps_first_events():
+    """Test that the AmortizedForgettingCondenser keeps the right number of initial events when forgetting."""
+    max_size = 4
+    keep_first = 1
+    condenser = AmortizedForgettingCondenser(max_size=max_size, keep_first=keep_first)
+
+    first_event = create_test_event('Event 0')
+
+    mock_state = MagicMock()
+    mock_state.extra_data = {}
+    mock_state.history = [first_event]
+
+    for i in range(max_size * 10):
+        event = create_test_event(f'Event {i+1}', datetime(2024, 1, 1, 10, i + 1))
+        mock_state.history.append(event)
+        results = condenser.condensed_history(mock_state)
+
+        # The last event is always the event we just added.
+        assert results[-1] == event
+
+        # The first event is always the first event.
+        assert results[0] == first_event
+
+        # The number of results should bounce back and forth between 2, 3, 4, 2, 3, 4, ...
+        assert len(results) == (i % 3) + 2
+
+
+def test_llm_attention_condenser_from_config():
+    """Test that LLMAttentionCondenser objects can be made from config."""
+    config = LLMAttentionCondenserConfig(
+        max_size=50,
+        keep_first=10,
+        llm_config=LLMConfig(
+            model='gpt-4o',
+            api_key='test_key',
+        ),
+    )
+    condenser = Condenser.from_config(config)
+
+    assert isinstance(condenser, LLMAttentionCondenser)
+    assert condenser.llm.config.model == 'gpt-4o'
+    assert condenser.llm.config.api_key.get_secret_value() == 'test_key'
+    assert condenser.max_size == 50
+    assert condenser.keep_first == 10
+
+
+def test_llm_attention_condenser_invalid_config():
+    """Test that LLMAttentionCondenser raises an error if the configured LLM doesn't support response schema."""
+    config = LLMAttentionCondenserConfig(
+        max_size=50,
+        keep_first=10,
+        llm_config=LLMConfig(
+            model='claude-2',  # Older model that doesn't support response schema
+            api_key='test_key',
+        ),
+    )
+
+    pytest.raises(ValueError, LLMAttentionCondenser.from_config, config)
+
+
+def test_llm_attention_condenser_keeps_first_events(mock_llm, mock_state):
+    """Test that the LLMAttentionCondenser keeps the right number of initial events when forgetting."""
+    max_size = 4
+    condenser = LLMAttentionCondenser(max_size=max_size, keep_first=1, llm=mock_llm)
+
+    first_event = create_test_event('Event 0', id=0)
+    mock_state.history.append(first_event)
+
+    for i in range(max_size * 10):
+        event = create_test_event(f'Event {i+1}', id=i + 1)
+        mock_state.history.append(event)
+
+        mock_llm.set_mock_response_content(
+            ImportantEventSelection(
+                ids=[event.id for event in mock_state.history]
+            ).model_dump_json()
+        )
+        results = condenser.condensed_history(mock_state)
+
+        # The first event is always the first event.
+ assert results[0] == first_event + + +def test_llm_attention_condenser_grows_to_max_size(mock_llm, mock_state): + """Test that LLMAttentionCondenser correctly maintains an event context up to max size.""" + max_size = 15 + condenser = LLMAttentionCondenser(max_size=max_size, llm=mock_llm) + + for i in range(max_size): + event = create_test_event(f'Event {i}') + mock_state.history.append(event) + mock_llm.set_mock_response_content( + ImportantEventSelection(ids=[event.id for event in mock_state.history]) + ) + results = condenser.condensed_history(mock_state) + assert len(results) == i + 1 + + +def test_llm_attention_condenser_forgets_when_larger_than_max_size( + mock_llm, mock_state +): + """Test that the LLMAttentionCondenser forgets events when the context grows too large.""" + max_size = 2 + condenser = LLMAttentionCondenser(max_size=max_size, llm=mock_llm) + + for i in range(max_size * 10): + event = create_test_event(f'Event {i}', id=i) + mock_state.history.append(event) + + mock_llm.set_mock_response_content( + ImportantEventSelection( + ids=[event.id for event in mock_state.history] + ).model_dump_json() + ) + + results = condenser.condensed_history(mock_state) + + # The number of results should bounce back and forth between 1, 2, 1, 2, ... + assert len(results) == (i % 2) + 1 + + +def test_llm_attention_condenser_handles_events_outside_history(mock_llm, mock_state): + """Test that the LLMAttentionCondenser handles event IDs that aren't from the event history.""" + max_size = 2 + condenser = LLMAttentionCondenser(max_size=max_size, llm=mock_llm) + + for i in range(max_size * 10): + event = create_test_event(f'Event {i}', id=i) + mock_state.history.append(event) + + mock_llm.set_mock_response_content( + ImportantEventSelection( + ids=[event.id for event in mock_state.history] + [-1, -2, -3, -4] + ).model_dump_json() + ) + results = condenser.condensed_history(mock_state) + + # The number of results should bounce back and forth between 1, 2, 1, 2, ... + assert len(results) == (i % 2) + 1 + + +def test_llm_attention_condenser_handles_too_many_events(mock_llm, mock_state): + """Test that the LLMAttentionCondenser handles when the response contains too many event IDs.""" + max_size = 2 + condenser = LLMAttentionCondenser(max_size=max_size, llm=mock_llm) + + for i in range(max_size * 10): + event = create_test_event(f'Event {i}', id=i) + mock_state.history.append(event) + mock_llm.set_mock_response_content( + ImportantEventSelection( + ids=[event.id for event in mock_state.history] + + [event.id for event in mock_state.history] + ).model_dump_json() + ) + results = condenser.condensed_history(mock_state) + + # The number of results should bounce back and forth between 1, 2, 1, 2, ... + assert len(results) == (i % 2) + 1 + + +def test_llm_attention_condenser_handles_too_few_events(mock_llm, mock_state): + """Test that the LLMAttentionCondenser handles when the response contains too few event IDs.""" + max_size = 2 + condenser = LLMAttentionCondenser(max_size=max_size, llm=mock_llm) + + for i in range(max_size * 10): + event = create_test_event(f'Event {i}', id=i) + mock_state.history.append(event) + + mock_llm.set_mock_response_content( + ImportantEventSelection(ids=[]).model_dump_json() + ) + + results = condenser.condensed_history(mock_state) + + # The number of results should bounce back and forth between 1, 2, 1, 2, ... 
+ assert len(results) == (i % 2) + 1 diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index b8c08d9dad63..08139848c448 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -63,7 +63,7 @@ def test_compat_env_to_config(monkeypatch, setup_env): assert config.workspace_base == '/repos/openhands/workspace' assert isinstance(config.get_llm_config(), LLMConfig) - assert config.get_llm_config().api_key == 'sk-proj-rgMV0...' + assert config.get_llm_config().api_key.get_secret_value() == 'sk-proj-rgMV0...' assert config.get_llm_config().model == 'gpt-4o' assert isinstance(config.get_agent_config(), AgentConfig) assert isinstance(config.get_agent_config().memory_max_threads, int) @@ -83,7 +83,7 @@ def test_load_from_old_style_env(monkeypatch, default_config): load_from_env(default_config, os.environ) - assert default_config.get_llm_config().api_key == 'test-api-key' + assert default_config.get_llm_config().api_key.get_secret_value() == 'test-api-key' assert default_config.get_agent_config().memory_enabled is True assert default_config.default_agent == 'PlannerAgent' assert default_config.workspace_base == '/opt/files/workspace' @@ -126,7 +126,7 @@ def test_load_from_new_style_toml(default_config, temp_toml_file): # default llm & agent configs assert default_config.default_agent == 'TestAgent' assert default_config.get_llm_config().model == 'test-model' - assert default_config.get_llm_config().api_key == 'toml-api-key' + assert default_config.get_llm_config().api_key.get_secret_value() == 'toml-api-key' assert default_config.get_agent_config().memory_enabled is True # undefined agent config inherits default ones @@ -291,7 +291,7 @@ def test_env_overrides_compat_toml(monkeypatch, default_config, temp_toml_file): assert default_config.get_llm_config().model == 'test-model' assert default_config.get_llm_config('llm').model == 'test-model' assert default_config.get_llm_config_from_agent().model == 'test-model' - assert default_config.get_llm_config().api_key == 'env-api-key' + assert default_config.get_llm_config().api_key.get_secret_value() == 'env-api-key' # after we set workspace_base to 'UNDEFINED' in the environment, # workspace_base should be set to that @@ -336,7 +336,7 @@ def test_env_overrides_sandbox_toml(monkeypatch, default_config, temp_toml_file) assert default_config.workspace_mount_path is None # before load_from_env, values are set to the values from the toml file - assert default_config.get_llm_config().api_key == 'toml-api-key' + assert default_config.get_llm_config().api_key.get_secret_value() == 'toml-api-key' assert default_config.sandbox.timeout == 500 assert default_config.sandbox.user_id == 1001 @@ -345,7 +345,7 @@ def test_env_overrides_sandbox_toml(monkeypatch, default_config, temp_toml_file) # values from env override values from toml assert os.environ.get('LLM_MODEL') is None assert default_config.get_llm_config().model == 'test-model' - assert default_config.get_llm_config().api_key == 'env-api-key' + assert default_config.get_llm_config().api_key.get_secret_value() == 'env-api-key' assert default_config.sandbox.timeout == 1000 assert default_config.sandbox.user_id == 1002 @@ -412,7 +412,7 @@ def test_security_config_from_dict(): # Test with all fields config_dict = {'confirmation_mode': True, 'security_analyzer': 'some_analyzer'} - security_config = SecurityConfig.from_dict(config_dict) + security_config = SecurityConfig(**config_dict) # Verify all fields are correctly set assert security_config.confirmation_mode is True @@ -558,10 
+558,7 @@ def test_load_from_toml_partial_invalid(default_config, temp_toml_file, caplog): assert 'Cannot parse [llm] config from toml' in log_content assert 'values have not been applied' in log_content # Error: LLMConfig.__init__() got an unexpected keyword argume - assert ( - 'Error: LLMConfig.__init__() got an unexpected keyword argume' - in log_content - ) + assert 'Error: 1 validation error for LLMConfig' in log_content assert 'invalid_field' in log_content # invalid [sandbox] config @@ -633,12 +630,14 @@ def test_api_keys_repr_str(): aws_access_key_id='my_access_key', aws_secret_access_key='my_secret_key', ) - assert "api_key='******'" in repr(llm_config) - assert "aws_access_key_id='******'" in repr(llm_config) - assert "aws_secret_access_key='******'" in repr(llm_config) - assert "api_key='******'" in str(llm_config) - assert "aws_access_key_id='******'" in str(llm_config) - assert "aws_secret_access_key='******'" in str(llm_config) + + # Check that no secret keys are emitted in representations of the config object + assert 'my_api_key' not in repr(llm_config) + assert 'my_api_key' not in str(llm_config) + assert 'my_access_key' not in repr(llm_config) + assert 'my_access_key' not in str(llm_config) + assert 'my_secret_key' not in repr(llm_config) + assert 'my_secret_key' not in str(llm_config) # Check that no other attrs in LLMConfig have 'key' or 'token' in their name # This will fail when new attrs are added, and attract attention @@ -650,7 +649,7 @@ def test_api_keys_repr_str(): 'output_cost_per_token', 'custom_tokenizer', ] - for attr_name in dir(LLMConfig): + for attr_name in LLMConfig.model_fields.keys(): if ( not attr_name.startswith('__') and attr_name not in known_key_token_attrs_llm @@ -665,7 +664,7 @@ def test_api_keys_repr_str(): # Test AgentConfig # No attrs in AgentConfig have 'key' or 'token' in their name agent_config = AgentConfig(memory_enabled=True, memory_max_threads=4) - for attr_name in dir(AgentConfig): + for attr_name in AgentConfig.model_fields.keys(): if not attr_name.startswith('__'): assert ( 'key' not in attr_name.lower() @@ -684,16 +683,16 @@ def test_api_keys_repr_str(): modal_api_token_secret='my_modal_api_token_secret', runloop_api_key='my_runloop_api_key', ) - assert "e2b_api_key='******'" in repr(app_config) - assert "e2b_api_key='******'" in str(app_config) - assert "jwt_secret='******'" in repr(app_config) - assert "jwt_secret='******'" in str(app_config) - assert "modal_api_token_id='******'" in repr(app_config) - assert "modal_api_token_id='******'" in str(app_config) - assert "modal_api_token_secret='******'" in repr(app_config) - assert "modal_api_token_secret='******'" in str(app_config) - assert "runloop_api_key='******'" in repr(app_config) - assert "runloop_api_key='******'" in str(app_config) + assert 'my_e2b_api_key' not in repr(app_config) + assert 'my_e2b_api_key' not in str(app_config) + assert 'my_jwt_secret' not in repr(app_config) + assert 'my_jwt_secret' not in str(app_config) + assert 'my_modal_api_token_id' not in repr(app_config) + assert 'my_modal_api_token_id' not in str(app_config) + assert 'my_modal_api_token_secret' not in repr(app_config) + assert 'my_modal_api_token_secret' not in str(app_config) + assert 'my_runloop_api_key' not in repr(app_config) + assert 'my_runloop_api_key' not in str(app_config) # Check that no other attrs in AppConfig have 'key' or 'token' in their name # This will fail when new attrs are added, and attract attention @@ -703,7 +702,7 @@ def test_api_keys_repr_str(): 'modal_api_token_secret', 
'runloop_api_key', ] - for attr_name in dir(AppConfig): + for attr_name in AppConfig.model_fields.keys(): if ( not attr_name.startswith('__') and attr_name not in known_key_token_attrs_app diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py index fc37e41c4dd4..43f55e502f7a 100644 --- a/tests/unit/test_llm.py +++ b/tests/unit/test_llm.py @@ -40,7 +40,7 @@ def default_config(): def test_llm_init_with_default_config(default_config): llm = LLM(default_config) assert llm.config.model == 'gpt-4o' - assert llm.config.api_key == 'test_key' + assert llm.config.api_key.get_secret_value() == 'test_key' assert isinstance(llm.metrics, Metrics) assert llm.metrics.model_name == 'gpt-4o' @@ -77,7 +77,7 @@ def test_llm_init_with_custom_config(): ) llm = LLM(custom_config) assert llm.config.model == 'custom-model' - assert llm.config.api_key == 'custom_key' + assert llm.config.api_key.get_secret_value() == 'custom_key' assert llm.config.max_input_tokens == 5000 assert llm.config.max_output_tokens == 1500 assert llm.config.temperature == 0.8