diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 0abfee13328b..c7ee4d6198d1 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -307,10 +307,14 @@ def get_observation_message(
             text = obs.get_agent_obs_text()
             if (
                 obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
-                and self.config.codeact_enable_visual_browsing
-                and self.llm.vision_is_active()
                 and obs.set_of_marks is not None
                 and len(obs.set_of_marks) > 0
+                and self.config.codeact_enable_visual_browsing
+                and self.llm.vision_is_active()
+                and (
+                    self.mock_function_calling
+                    or self.llm.is_visual_browser_tool_active()
+                )
             ):
                 text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
                 message = Message(
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 98bcf7cb173d..631febb13499 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -74,6 +74,16 @@
     'o1-2024-12-17',
 ]
 
+# Models supported by the visual browsing tool (screenshots attached to tool messages).
+# This list is needed since gpt-4o and gpt-4o-mini do not allow passing image_urls with role='tool'.
+VISUAL_BROWSING_TOOL_SUPPORTED_MODELS = [
+    'claude-3-5-sonnet',
+    'claude-3-5-sonnet-20240620',
+    'claude-3-5-sonnet-20241022',
+    'o1-2024-12-17',
+]
+
+
 REASONING_EFFORT_SUPPORTED_MODELS = [
     'o1-2024-12-17',
 ]
@@ -472,6 +482,15 @@ def is_function_calling_active(self) -> bool:
             )
             return supports_fn_call
 
+    def is_visual_browser_tool_active(self) -> bool:
+        """Return True if the configured model is a visual-browsing-tool model."""
+        model, known = self.config.model, VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+        # Exact match first: the full name, then its last '/'-separated segment.
+        if model in known or model.split('/')[-1] in known:
+            return True
+        # Otherwise accept any model name that contains a supported entry.
+        return any(name in model for name in known)
+
     def _post_completion(self, response: ModelResponse) -> float:
         """Post-process the completion response.