From 76cae8fee3078f189399023a1c3bca02712442fc Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Tue, 26 Nov 2024 12:13:13 -0500
Subject: [PATCH 1/9] feat(agent): add webpage read microagent

---
 .../codeact_agent/micro/webpage_read.md       | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 openhands/agenthub/codeact_agent/micro/webpage_read.md

diff --git a/openhands/agenthub/codeact_agent/micro/webpage_read.md b/openhands/agenthub/codeact_agent/micro/webpage_read.md
new file mode 100644
index 000000000000..24d8919e1b5f
--- /dev/null
+++ b/openhands/agenthub/codeact_agent/micro/webpage_read.md
@@ -0,0 +1,19 @@
+---
+name: webpage_read
+agent: CodeActAgent
+triggers:
+- http://
+- https://
+---
+
+To read content from a webpage, you can use the `percollate` CLI tool:
+
+1. Install `percollate` with `npm install -g percollate`
+2. Once installed, use it to convert a webpage to markdown with the following command:
+
+```bash
+percollate md https://example.com --output example.md
+```
+
+3. Then, you can read the markdown file `./example.md` using other tools you have access to.
+4. If you need to interact further with the webpage, you should not use the `percollate` CLI tool. Instead, you should use the web browser directly provided to you.

From 3660445d7ead15137a803941ae86858bbd0a8ce8 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Tue, 26 Nov 2024 13:17:28 -0500
Subject: [PATCH 2/9] use BrowseURLAction instead

---
 .../codeact_agent/function_calling.py         | 30 +++++++-
 .../codeact_agent/micro/webpage_read.md       | 19 -----
 openhands/events/observation/browse.py        | 69 ++++++++++++-------
 openhands/runtime/browser/utils.py            |  2 +
 tests/unit/test_security.py                   |  1 +
 5 files changed, 76 insertions(+), 45 deletions(-)
 delete mode 100644 openhands/agenthub/codeact_agent/micro/webpage_read.md

diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
index a4ee35ff7b59..399776e6c6f3 100644
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -19,6 +19,7 @@
     AgentDelegateAction,
     AgentFinishAction,
     BrowseInteractiveAction,
+    BrowseURLAction,
     CmdRunAction,
     FileEditAction,
     IPythonRunCellAction,
@@ -266,6 +267,30 @@ def __init__(self):
     ),
 )
 
+
+_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).
+
+You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
+"""  # NOTE: the tool name quoted above must match WebReadTool's registered function name ('web_read')
+
+WebReadTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='web_read',  # response_to_actions dispatches on this exact name to build a BrowseURLAction
+        description=_WEB_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'url': {
+                    'type': 'string',
+                    'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).',
+                }
+            },
+            'required': ['url'],  # response_to_actions reads arguments['url'] without a fallback
+        },
+    ),
+)
+
 # from browsergym/core/action/highlevel.py
 _browser_action_space = HighLevelActionSet(
     subsets=['bid', 'nav'],
@@ -274,7 +299,7 @@ def __init__(self):
 )
 
 
-_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
+_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage.
 
 See the description of "code" parameter for more details.
 
@@ -484,6 +509,8 @@ def response_to_actions(response: ModelResponse) -> list[Action]:
                 action = IPythonRunCellAction(code=code, include_extra=False)
             elif tool_call.function.name == 'browser':
                 action = BrowseInteractiveAction(browser_actions=arguments['code'])
+            elif tool_call.function.name == 'web_read':
+                action = BrowseURLAction(url=arguments['url'])
             else:
                 raise FunctionCallNotExistsError(
                     f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.'
@@ -516,6 +543,7 @@ def get_tools(
 ) -> list[ChatCompletionToolParam]:
     tools = [CmdRunTool, FinishTool]
     if codeact_enable_browsing:
+        tools.append(WebReadTool)
         tools.append(BrowserTool)
     if codeact_enable_jupyter:
         tools.append(IPythonTool)
diff --git a/openhands/agenthub/codeact_agent/micro/webpage_read.md b/openhands/agenthub/codeact_agent/micro/webpage_read.md
deleted file mode 100644
index 24d8919e1b5f..000000000000
--- a/openhands/agenthub/codeact_agent/micro/webpage_read.md
+++ /dev/null
@@ -1,19 +0,0 @@
----
-name: webpage_read
-agent: CodeActAgent
-triggers:
-- http://
-- https://
----
-
-To read content from a webpage, you can use the `percollate` CLI tool:
-
-1. Install `percollate` with `npm install -g percollate`
-2. Once installed, use it to convert a webpage to markdown with the following command:
-
-```bash
-percollate md https://example.com --output example.md
-```
-
-3. Then, you can read the markdown file `./example.md` using other tools you have access to.
-4. If you need to interact further with the webpage, you should not use the `percollate` CLI tool. Instead, you should use the web browser directly provided to you.
diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py
index 9632fac57d54..fd3e12871f16 100644
--- a/openhands/events/observation/browse.py
+++ b/openhands/events/observation/browse.py
@@ -2,7 +2,7 @@
 
 from browsergym.utils.obs import flatten_axtree_to_str
 
-from openhands.core.schema import ObservationType
+from openhands.core.schema import ActionType, ObservationType
 from openhands.events.observation.observation import Observation
 
 
@@ -11,6 +11,7 @@ class BrowserOutputObservation(Observation):
     """This data class represents the output of a browser."""
 
     url: str
+    trigger_by_action: str
     screenshot: str = field(repr=False)  # don't show in repr
     error: bool = False
     observation: str = ObservationType.BROWSE
@@ -48,31 +49,49 @@ def __str__(self) -> str:
 
     def get_agent_obs_text(self) -> str:
         """Get a concise text that will be shown to the agent."""
-        text = f'[Current URL: {self.url}]\n'
-        text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
-        if self.error:
-            text += (
-                '================ BEGIN error message ===============\n'
-                'The following error occurred when executing the last action:\n'
-                f'{self.last_browser_action_error}\n'
-                '================ END error message ===============\n'
-            )
-        else:
-            text += '[Action executed successfully.]\n'
+        if self.trigger_by_action == ActionType.BROWSE_INTERACTIVE:
+            text = f'[Current URL: {self.url}]\n'
+            text += f'[Focused element bid: {self.focused_element_bid}]\n\n'
+            if self.error:
+                text += (
+                    '================ BEGIN error message ===============\n'
+                    'The following error occurred when executing the last action:\n'
+                    f'{self.last_browser_action_error}\n'
+                    '================ END error message ===============\n'
+                )
+            else:
+                text += '[Action executed successfully.]\n'
+            try:
+                # We do not filter visible only here because we want to show the full content
+                # of the web page to the agent for simplicity.
+                # FIXME: handle the case when the web page is too large
+                cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
+                text += (
+                    f'============== BEGIN accessibility tree ==============\n'
+                    f'{cur_axtree_txt}\n'
+                    f'============== END accessibility tree ==============\n'
+                )
+            except Exception as e:
+                text += (
+                    f'\n[Error encountered when processing the accessibility tree: {e}]'
+                )
+            return text
 
-        try:
-            # We do not filter visible only here because we want to show the full content
-            # of the web page to the agent for simplicity.
-            # FIXME: handle the case when the web page is too large
-            cur_axtree_txt = self.get_axtree_str(filter_visible_only=False)
-            text += (
-                f'============== BEGIN accessibility tree ==============\n'
-                f'{cur_axtree_txt}\n'
-                f'============== END accessibility tree ==============\n'
-            )
-        except Exception as e:
-            text += f'\n[Error encountered when processing the accessibility tree: {e}]'
-        return text
+        elif self.trigger_by_action == ActionType.BROWSE:
+            text = f'[Current URL: {self.url}]\n'
+            if self.error:
+                text += (
+                    '================ BEGIN error message ===============\n'
+                    'The following error occurred when trying to visit the URL:\n'
+                    f'{self.last_browser_action_error}\n'
+                    '================ END error message ===============\n'
+                )
+            text += '============== BEGIN webpage content ==============\n'
+            text += self.content
+            text += '\n============== END webpage content ==============\n'
+            return text
+        else:
+            raise ValueError(f'Invalid trigger_by_action: {self.trigger_by_action}')
 
     def get_axtree_str(self, filter_visible_only: bool = False) -> str:
         cur_axtree_txt = flatten_axtree_to_str(
diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py
index 336b3801e3e2..6f823e47d546 100644
--- a/openhands/runtime/browser/utils.py
+++ b/openhands/runtime/browser/utils.py
@@ -49,6 +49,7 @@ async def browse(
             ),  # last browser env action performed
             last_browser_action_error=obs.get('last_action_error', ''),
             error=True if obs.get('last_action_error', '') else False,  # error flag
+            trigger_by_action=action.action,
         )
     except Exception as e:
         return BrowserOutputObservation(
@@ -57,4 +58,5 @@ async def browse(
             error=True,
             last_browser_action_error=str(e),
             url=asked_url if action.action == ActionType.BROWSE else '',
+            trigger_by_action=action.action,
         )
diff --git a/tests/unit/test_security.py b/tests/unit/test_security.py
index 771ccc206d3c..68ea9f9eea63 100644
--- a/tests/unit/test_security.py
+++ b/tests/unit/test_security.py
@@ -382,6 +382,7 @@ def test_parse_action(action, expected_trace):
                 content='browser output content',
                 url='http://localhost:3000',
                 screenshot='screenshot',
+                trigger_by_action=ActionType.BROWSE,
             ),
             [
                 ToolOutput(

From 1a33d2a7cdc87546ac79dbf17b93184ed480460a Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Tue, 26 Nov 2024 13:24:07 -0500
Subject: [PATCH 3/9] dispatch thought for browse url first

---
 frontend/src/services/actions.ts  | 6 +++++-
 openhands/events/action/browse.py | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts
index 13265776dcee..69844b8a22cd 100644
--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@@ -21,7 +21,11 @@ import { handleObservationMessage } from "./observations";
 
 const messageActions = {
   [ActionType.BROWSE]: (message: ActionMessage) => {
-    store.dispatch(addAssistantMessage(message.message));
+    if (message.args.thought) {
+      store.dispatch(addAssistantMessage(message.args.thought));
+    } else {
+      store.dispatch(addAssistantMessage(message.message));
+    }
   },
   [ActionType.BROWSE_INTERACTIVE]: (message: ActionMessage) => {
     if (message.args.thought) {
diff --git a/openhands/events/action/browse.py b/openhands/events/action/browse.py
index 41816216d6d5..418dd0444366 100644
--- a/openhands/events/action/browse.py
+++ b/openhands/events/action/browse.py
@@ -15,7 +15,7 @@ class BrowseURLAction(Action):
 
     @property
     def message(self) -> str:
-        return f'Browsing URL: {self.url}'
+        return f'I am browsing the URL: {self.url}'
 
     def __str__(self) -> str:
         ret = '**BrowseURLAction**\n'

From 8474a42056d039db87e49b4f939848386e095e48 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Tue, 26 Nov 2024 13:24:18 -0500
Subject: [PATCH 4/9] remove content from browser output str for debug clarity

---
 openhands/events/observation/browse.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py
index fd3e12871f16..1052aaf17a91 100644
--- a/openhands/events/observation/browse.py
+++ b/openhands/events/observation/browse.py
@@ -41,7 +41,6 @@ def __str__(self) -> str:
             f'Last browser action: {self.last_browser_action}\n'
             f'Last browser action error: {self.last_browser_action_error}\n'
             f'Focused element bid: {self.focused_element_bid}\n'
-            f'Content: {self.content}\n'
         )
         ret += '--- Agent Observation ---\n'
         ret += self.get_agent_obs_text()

From b9b75638ab41c707f0faeae95676a875296c8fcb Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Tue, 26 Nov 2024 13:46:38 -0500
Subject: [PATCH 5/9] add BrowseURL action to codeact

---
 openhands/agenthub/codeact_agent/codeact_agent.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 6743de87ade6..1113fd0271d3 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -15,6 +15,7 @@
     AgentDelegateAction,
     AgentFinishAction,
     BrowseInteractiveAction,
+    BrowseURLAction,
     CmdRunAction,
     FileEditAction,
     IPythonRunCellAction,
@@ -151,6 +152,7 @@ def get_action_message(
                 IPythonRunCellAction,
                 FileEditAction,
                 BrowseInteractiveAction,
+                BrowseURLAction,
             ),
         ) or (
             isinstance(action, (AgentFinishAction, CmdRunAction))

From 4e9cb764806f03a5abb25a37351e71dfd6b45f10 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Tue, 26 Nov 2024 13:47:28 -0500
Subject: [PATCH 6/9] fix pr browsing test

---
 evaluation/integration_tests/tests/t06_github_pr_browsing.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py
index 52ec927cd334..69d37f69b6b6 100644
--- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py
+++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py
@@ -27,6 +27,8 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
                 content = event.content
             elif isinstance(event, AgentFinishAction):
                 content = event.outputs.get('content', '')
+                if not content:
+                    content = event.thought
             elif isinstance(event, MessageAction):
                 content = event.content
             else:

From 87f5e9abf914778492a8dd60595822f322d4e13e Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Tue, 26 Nov 2024 16:02:03 -0500
Subject: [PATCH 7/9] fix test

---
 evaluation/integration_tests/run_infer.py                    | 2 +-
 evaluation/integration_tests/tests/t06_github_pr_browsing.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 5e3205fefe2e..6eace74403d0 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -130,7 +130,7 @@ def process_instance(
     # # =============================================
 
     histories = [event_to_dict(event) for event in state.history]
-    test_result: TestResult = test_class.verify_result(runtime, histories)
+    test_result: TestResult = test_class.verify_result(runtime, state.history)
     metrics = state.metrics.get() if state.metrics else None
 
     # Save the output
diff --git a/evaluation/integration_tests/tests/t06_github_pr_browsing.py b/evaluation/integration_tests/tests/t06_github_pr_browsing.py
index 69d37f69b6b6..7536ccc828d8 100644
--- a/evaluation/integration_tests/tests/t06_github_pr_browsing.py
+++ b/evaluation/integration_tests/tests/t06_github_pr_browsing.py
@@ -27,8 +27,8 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
                 content = event.content
             elif isinstance(event, AgentFinishAction):
                 content = event.outputs.get('content', '')
-                if not content:
-                    content = event.thought
+                if event.thought:
+                    content += f'\n\n{event.thought}'
             elif isinstance(event, MessageAction):
                 content = event.content
             else:

From 922849b5be812ff21d5211a55f7194c5bca107b1 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Wed, 27 Nov 2024 09:42:40 -0500
Subject: [PATCH 8/9] cleanup miniwob runtime appropriately

---
 evaluation/integration_tests/run_infer.py | 45 ++++++++++++-----------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 6eace74403d0..2c7f7d7a1c14 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -107,31 +107,32 @@ def process_instance(
     # =============================================
     # create sandbox and run the agent
     # =============================================
-
     runtime: Runtime = create_runtime(config)
     call_async_from_sync(runtime.connect)
-
-    test_class.initialize_runtime(runtime)
-
-    # Here's how you can run the agent (similar to the `main` function) and get the final task state
-    state: State | None = asyncio.run(
-        run_controller(
-            config=config,
-            initial_user_action=MessageAction(content=instruction),
-            runtime=runtime,
-            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+    try:
+        test_class.initialize_runtime(runtime)
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State | None = asyncio.run(
+            run_controller(
+                config=config,
+                initial_user_action=MessageAction(content=instruction),
+                runtime=runtime,
+                fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
+            )
         )
-    )
-    if state is None:
-        raise ValueError('State should not be None.')
-
-    # # =============================================
-    # # result evaluation
-    # # =============================================
-
-    histories = [event_to_dict(event) for event in state.history]
-    test_result: TestResult = test_class.verify_result(runtime, state.history)
-    metrics = state.metrics.get() if state.metrics else None
+        if state is None:
+            raise ValueError('State should not be None.')
+
+        # # =============================================
+        # # result evaluation
+        # # =============================================
+
+        histories = [event_to_dict(event) for event in state.history]
+        test_result: TestResult = test_class.verify_result(runtime, state.history)
+        metrics = state.metrics.get() if state.metrics else None
+    finally:
+        runtime.close()
 
     # Save the output
     output = EvalOutput(

From 97df362af04aa934ef14537e3341adb60e129d86 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao6@illinois.edu>
Date: Wed, 27 Nov 2024 16:35:41 -0500
Subject: [PATCH 9/9] fix linter

---
 evaluation/integration_tests/run_infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py
index 36eb353d73f4..3b6f1c6ff2cc 100644
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -135,7 +135,7 @@ def process_instance(
         # # =============================================
 
         histories = state.history
-        
+
         # some basic check
         logger.info(f'Total events in history: {len(histories)}')
         assert len(histories) > 0, 'History should not be empty'