Only attach the browsing screenshot note when the model accepts images in
tool messages.

openhands/agenthub/codeact_agent/codeact_agent.py
    In get_observation_message, the condition guarding the
    'Image: Current webpage screenshot ...' text keeps its existing checks
    (reordered so the set_of_marks checks come first) and additionally
    requires `self.mock_function_calling` or
    `self.llm.is_visual_browser_tool_active()`.

openhands/llm/llm.py
    Adds the VISUAL_BROWSING_TOOL_SUPPORTED_MODELS list and the
    is_visual_browser_tool_active() method. The method returns True when the
    configured model name is in the list, when its last '/'-separated
    segment is in the list, or when any list entry is a substring of the
    model name.

NOTE(review): this patch was recovered from a copy whose newlines and
indentation had been collapsed. Line breaks follow the hunk headers; the
indentation of context and added lines was reconstructed from Python syntax
and should be confirmed against the target files before applying (or apply
with `git apply --ignore-whitespace`).

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 0abfee13328b..c7ee4d6198d1 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -307,10 +307,14 @@ def get_observation_message(
             text = obs.get_agent_obs_text()
             if (
                 obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
-                and self.config.codeact_enable_visual_browsing
-                and self.llm.vision_is_active()
                 and obs.set_of_marks is not None
                 and len(obs.set_of_marks) > 0
+                and self.config.codeact_enable_visual_browsing
+                and self.llm.vision_is_active()
+                and (
+                    self.mock_function_calling
+                    or self.llm.is_visual_browser_tool_active()
+                )
             ):
                 text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
                 message = Message(
diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 98bcf7cb173d..631febb13499 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -74,6 +74,16 @@
     'o1-2024-12-17',
 ]
 
+# visual browsing tool supported models
+# This flag is needed since gpt-4o and gpt-4o-mini do not allow passing image_urls with role='tool'
+VISUAL_BROWSING_TOOL_SUPPORTED_MODELS = [
+    'claude-3-5-sonnet',
+    'claude-3-5-sonnet-20240620',
+    'claude-3-5-sonnet-20241022',
+    'o1-2024-12-17',
+]
+
+
 REASONING_EFFORT_SUPPORTED_MODELS = [
     'o1-2024-12-17',
 ]
@@ -472,6 +482,15 @@ def is_function_calling_active(self) -> bool:
         )
         return supports_fn_call
 
+    def is_visual_browser_tool_active(self) -> bool:
+        return (
+            self.config.model in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+            or self.config.model.split('/')[-1] in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+            or any(
+                m in self.config.model for m in VISUAL_BROWSING_TOOL_SUPPORTED_MODELS
+            )
+        )
+
     def _post_completion(self, response: ModelResponse) -> float:
         """Post-process the completion response.
 