Add Planner Agent again

Reverts All-Hands-AI#5959
SmartManoj · Jan 3, 2025 · 282e878 · 282e878
1 parent 2a8df13
commit 282e878
Show file tree

Hide file tree

Showing 17 changed files with 243 additions and 28 deletions.
diff --git a/frontend/src/types/action-type.tsx b/frontend/src/types/action-type.tsx
@@ -36,6 +36,12 @@ enum ActionType {
   // Reject a request from user or another agent.
   REJECT = "reject",
 
+  // Adds a task to the plan.
+  ADD_TASK = "add_task",
+
+  // Updates a task in the plan.
+  MODIFY_TASK = "modify_task",
+
   // Changes the state of the agent, e.g. to paused or running
   CHANGE_AGENT_STATE = "change_agent_state",
 }

diff --git a/frontend/src/types/core/actions.ts b/frontend/src/types/core/actions.ts
@@ -78,6 +78,27 @@ export interface BrowseInteractiveAction
   };
 }
 
+/**
+ * Agent-originated "add_task" action event: adds a task to the plan.
+ */
+export interface AddTaskAction extends OpenHandsActionEvent<"add_task"> {
+  source: "agent";
+  timeout: number;
+  args: {
+    // ID of the parent task the new task is attached to.
+    parent: string;
+    // Goal text of the new task.
+    goal: string;
+    // Subtasks supplied with the new task; their element shape is not typed here.
+    subtasks: unknown[];
+    // NOTE(review): presumably the agent's reasoning for this action; confirm against the backend.
+    thought: string;
+  };
+}
+
+/**
+ * Agent-originated "modify_task" action event: updates a task in the plan.
+ */
+export interface ModifyTaskAction extends OpenHandsActionEvent<"modify_task"> {
+  source: "agent";
+  timeout: number;
+  args: {
+    // ID of the task to update.
+    task_id: string;
+    // New state for the task. Typed as a free-form string here; the set of
+    // valid values is not constrained by this interface.
+    state: string;
+    // NOTE(review): presumably the agent's reasoning for this action; confirm against the backend.
+    thought: string;
+  };
+}
+
 export interface FileReadAction extends OpenHandsActionEvent<"read"> {
   source: "agent";
   args: {
@@ -123,4 +144,6 @@ export type OpenHandsAction =
   | FileReadAction
   | FileEditAction
   | FileWriteAction
+  | AddTaskAction
+  | ModifyTaskAction
   | RejectAction;
diff --git a/frontend/src/types/core/base.ts b/frontend/src/types/core/base.ts
@@ -10,6 +10,8 @@ export type OpenHandsEventType =
   | "browse"
   | "browse_interactive"
   | "reject"
+  | "add_task"
+  | "modify_task"
   | "finish"
   | "error";
 

diff --git a/openhands/agenthub/__init__.py b/openhands/agenthub/__init__.py
@@ -12,10 +12,12 @@
     codeact_agent,
     delegator_agent,
     dummy_agent,
+    planner_agent,
 )
 
 __all__ = [
     'codeact_agent',
+    'planner_agent',
     'delegator_agent',
     'dummy_agent',
     'browsing_agent',

diff --git a/openhands/agenthub/dummy_agent/agent.py b/openhands/agenthub/dummy_agent/agent.py
@@ -1,11 +1,12 @@
-from typing import TypedDict
+from typing import TypedDict, Union
 
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
 from openhands.core.schema import AgentState
 from openhands.events.action import (
     Action,
+    AddTaskAction,
     AgentFinishAction,
     AgentRejectAction,
     BrowseInteractiveAction,
@@ -14,10 +15,10 @@
     FileReadAction,
     FileWriteAction,
     MessageAction,
+    ModifyTaskAction,
 )
 from openhands.events.observation import (
     AgentStateChangedObservation,
-    BrowserOutputObservation,
     CmdOutputObservation,
     FileReadObservation,
     FileWriteObservation,
@@ -48,6 +49,20 @@ class DummyAgent(Agent):
     def __init__(self, llm: LLM, config: AgentConfig):
         super().__init__(llm, config)
         self.steps: list[ActionObs] = [
+            {
+                'action': AddTaskAction(
+                    parent='None', goal='check the current directory'
+                ),
+                'observations': [],
+            },
+            {
+                'action': AddTaskAction(parent='0', goal='run ls'),
+                'observations': [],
+            },
+            {
+                'action': ModifyTaskAction(task_id='0', state='in_progress'),
+                'observations': [],
+            },
             {
                 'action': MessageAction('Time to get started!'),
                 'observations': [],
@@ -90,25 +105,15 @@ def __init__(self, llm: LLM, config: AgentConfig):
             {
                 'action': BrowseURLAction(url='https://google.com'),
                 'observations': [
-                    BrowserOutputObservation(
-                        '<html><body>Simulated Google page</body></html>',
-                        url='https://google.com',
-                        screenshot='',
-                        trigger_by_action='',
-                    ),
+                    # BrowserOutputObservation('<html><body>Simulated Google page</body></html>',url='https://google.com',screenshot=''),
                 ],
             },
             {
                 'action': BrowseInteractiveAction(
                     browser_actions='goto("https://google.com")'
                 ),
                 'observations': [
-                    BrowserOutputObservation(
-                        '<html><body>Simulated Google page after interaction</body></html>',
-                        url='https://google.com',
-                        screenshot='',
-                        trigger_by_action='',
-                    ),
+                    # BrowserOutputObservation('<html><body>Simulated Google page after interaction</body></html>',url='https://google.com',screenshot=''),
                 ],
             },
             {
@@ -130,6 +135,30 @@ def step(self, state: State) -> Action:
         current_step = self.steps[state.iteration]
         action = current_step['action']
 
+        # If the action is AddTaskAction or ModifyTaskAction, update the parent ID or task_id
+        if isinstance(action, AddTaskAction):
+            if action.parent == 'None':
+                action.parent = ''  # Root task has no parent
+            elif action.parent == '0':
+                action.parent = state.root_task.id
+            elif action.parent.startswith('0.'):
+                action.parent = f'{state.root_task.id}{action.parent[1:]}'
+        elif isinstance(action, ModifyTaskAction):
+            if action.task_id == '0':
+                action.task_id = state.root_task.id
+            elif action.task_id.startswith('0.'):
+                action.task_id = f'{state.root_task.id}{action.task_id[1:]}'
+            # Ensure the task_id doesn't start with a dot
+            if action.task_id.startswith('.'):
+                action.task_id = action.task_id[1:]
+        elif isinstance(action, (BrowseURLAction, BrowseInteractiveAction)):
+            try:
+                return self.simulate_browser_action(action)
+            except (
+                Exception
+            ):  # This could be a specific exception for browser unavailability
+                return self.handle_browser_unavailable(action)
+
         if state.iteration > 0:
             prev_step = self.steps[state.iteration - 1]
 
@@ -161,3 +190,22 @@ def step(self, state: State) -> Action:
                         )
 
         return action
+
+    def simulate_browser_action(
+        self, action: Union[BrowseURLAction, BrowseInteractiveAction]
+    ) -> Action:
+        """Stand-in for executing a browser action.
+
+        Despite the name, nothing is simulated: the action is passed straight to
+        handle_browser_unavailable and that method's result is returned.
+
+        Parameters:
+        - action (BrowseURLAction | BrowseInteractiveAction): The browser action that was requested
+
+        Returns:
+        - Action: Whatever handle_browser_unavailable returns for this action
+        """
+        # Instead of simulating, we'll reject the browser action
+        return self.handle_browser_unavailable(action)
+
+    def handle_browser_unavailable(
+        self, action: Union[BrowseURLAction, BrowseInteractiveAction]
+    ) -> Action:
+        """Build a message saying that browser actions are not available.
+
+        The message always starts with a fixed sentence. For a BrowseURLAction the
+        requested URL is appended; for a BrowseInteractiveAction the requested
+        browser_actions string is appended.
+
+        Parameters:
+        - action (BrowseURLAction | BrowseInteractiveAction): The browser action that was requested
+
+        Returns:
+        - MessageAction: A message action whose content is the text built above
+        """
+        # Create a message action to inform that browsing is not available
+        message = 'Browser actions are not available in the DummyAgent environment.'
+        if isinstance(action, BrowseURLAction):
+            message += f' Unable to browse URL: {action.url}'
+        elif isinstance(action, BrowseInteractiveAction):
+            message += (
+                f' Unable to perform interactive browsing: {action.browser_actions}'
+            )
+        return MessageAction(content=message)
diff --git a/openhands/agenthub/planner_agent/__init__.py b/openhands/agenthub/planner_agent/__init__.py
@@ -0,0 +1,4 @@
+from openhands.agenthub.planner_agent.agent import PlannerAgent
+from openhands.controller.agent import Agent
+
+Agent.register('PlannerAgent', PlannerAgent)
diff --git a/openhands/agenthub/planner_agent/agent.py b/openhands/agenthub/planner_agent/agent.py
@@ -0,0 +1,53 @@
+from openhands.agenthub.planner_agent.prompt import get_prompt_and_images
+from openhands.agenthub.planner_agent.response_parser import PlannerResponseParser
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.message import ImageContent, Message, TextContent
+from openhands.events.action import Action, AgentFinishAction
+from openhands.llm.llm import LLM
+
+
+class PlannerAgent(Agent):
+    VERSION = '1.0'
+    # NOTE(review): the string below follows the VERSION assignment, so Python does
+    # not store it as the class __doc__; it only serves as an in-source description.
+    """
+    The planner agent utilizes a special prompting strategy to create long term plans for solving problems.
+    The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step.
+    """
+    # Single parser instance defined at class level and used by step() to turn the
+    # LLM response into an Action.
+    response_parser = PlannerResponseParser()
+
+    def __init__(self, llm: LLM, config: AgentConfig):
+        """Initialize the Planner Agent with an LLM and an agent config
+
+        Parameters:
+        - llm (LLM): The llm to be used by this agent
+        - config (AgentConfig): The agent configuration, forwarded to Agent.__init__
+        """
+        super().__init__(llm, config)
+
+    def step(self, state: State) -> Action:
+        """Checks the state of the root task and returns AgentFinishAction if it is finished.
+        Otherwise, creates a plan prompt and sends to model for inference, returning the result as the next action.
+
+        Parameters:
+        - state (State): The current state given the previous actions and observations
+
+        Returns:
+        - AgentFinishAction: If state.root_task.state is 'completed', 'verified', or 'abandoned'
+        - Action: The next action to take based on llm response
+        """
+        if state.root_task.state in [
+            'completed',
+            'verified',
+            'abandoned',
+        ]:
+            return AgentFinishAction()
+
+        # Build the prompt text (and any image URLs) from the state; the LLM's
+        # configured max_message_chars is passed through to the prompt builder.
+        prompt, image_urls = get_prompt_and_images(
+            state, self.llm.config.max_message_chars
+        )
+        content = [TextContent(text=prompt)]
+        # Images are attached only when the LLM reports vision as active and
+        # the prompt builder actually returned image URLs.
+        if self.llm.vision_is_active() and image_urls:
+            content.append(ImageContent(image_urls=image_urls))
+        # The whole prompt is sent as a single user message.
+        message = Message(role='user', content=content)
+        resp = self.llm.completion(messages=self.llm.format_messages_for_llm(message))
+        return self.response_parser.parse(resp)
diff --git a/openhands/agenthub/planner_agent/response_parser.py b/openhands/agenthub/planner_agent/response_parser.py
@@ -0,0 +1,37 @@
+from openhands.controller.action_parser import ResponseParser
+from openhands.core.utils import json
+from openhands.events.action import (
+    Action,
+)
+from openhands.events.serialization.action import action_from_dict
+
+
+class PlannerResponseParser(ResponseParser):
+    """Turns an LLM completion response into an Action for the planner agent.
+
+    parse() extracts the message content from the response and then decodes
+    that content as a JSON action dict.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def parse(self, response: str) -> Action:
+        """Parses an LLM response into an Action
+
+        Parameters:
+        - response: The LLM completion response. NOTE(review): annotated as str,
+          but parse_response indexes it like a mapping; confirm the intended type.
+
+        Returns:
+        - Action: The action decoded from the response's message content
+        """
+        action_str = self.parse_response(response)
+        return self.parse_action(action_str)
+
+    def parse_response(self, response) -> str:
+        """Returns the message content of the first choice in the response
+
+        Parameters:
+        - response: An object indexable as response['choices'][0]['message']['content']
+
+        Returns:
+        - str: The content of the first choice's message
+        """
+        # get the next action from the response
+        return response['choices'][0]['message']['content']
+
+    def parse_action(self, action_str: str) -> Action:
+        """Parses a JSON string into an action
+
+        Parameters:
+        - action_str (str): The JSON string to be parsed
+
+        Returns:
+        - Action: The action built from the decoded dict by action_from_dict
+        """
+        # attempt to load the JSON dict from the response
+        action_dict = json.loads(action_str)
+
+        if 'content' in action_dict:
+            # The LLM gets confused here. Might as well be robust:
+            # a top-level 'content' key is renamed to 'contents' before the
+            # dict is handed to action_from_dict.
+            action_dict['contents'] = action_dict.pop('content')
+
+        return action_from_dict(action_dict)
diff --git a/openhands/controller/agent_controller.py b/openhands/controller/agent_controller.py
@@ -25,6 +25,7 @@
 from openhands.events.action import (
     Action,
     ActionConfirmationStatus,
+    AddTaskAction,
     AgentDelegateAction,
     AgentFinishAction,
     AgentRejectAction,
@@ -33,6 +34,7 @@
     CmdRunAction,
     IPythonRunCellAction,
     MessageAction,
+    ModifyTaskAction,
     NullAction,
     RegenerateAction,
 )
@@ -278,7 +280,12 @@ async def _handle_action(self, action: Action) -> None:
             await self._handle_message_action(action)
         elif isinstance(action, AgentDelegateAction):
             await self.start_delegate(action)
-
+        elif isinstance(action, AddTaskAction):
+            self.state.root_task.add_subtask(
+                action.parent, action.goal, action.subtasks
+            )
+        elif isinstance(action, ModifyTaskAction):
+            self.state.root_task.set_subtask_state(action.task_id, action.state)
         elif isinstance(action, AgentFinishAction):
             self.state.outputs = action.outputs
             self.state.metrics.merge(self.state.local_metrics)

diff --git a/openhands/core/schema/action.py b/openhands/core/schema/action.py
@@ -66,6 +66,10 @@ class ActionTypeSchema(BaseModel):
 
     SUMMARIZE: str = Field(default='summarize')
 
+    ADD_TASK: str = Field(default='add_task')
+
+    MODIFY_TASK: str = Field(default='modify_task')
+
     PAUSE: str = Field(default='pause')
     """Pauses the task.
     """

diff --git a/openhands/events/action/__init__.py b/openhands/events/action/__init__.py
@@ -30,6 +30,8 @@
     'AgentRejectAction',
     'AgentDelegateAction',
     'AgentSummarizeAction',
+    'AddTaskAction',
+    'ModifyTaskAction',
     'ChangeAgentStateAction',
     'IPythonRunCellAction',
     'MessageAction',

diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py
@@ -33,6 +33,8 @@
     AgentFinishAction,
     AgentRejectAction,
     AgentDelegateAction,
+    AddTaskAction,
+    ModifyTaskAction,
     ChangeAgentStateAction,
     MessageAction,
     RegenerateAction,

diff --git a/openhands/server/mock/listen.py b/openhands/server/mock/listen.py
@@ -49,6 +49,7 @@ def read_llm_models():
 def read_llm_agents():
     return [
         'CodeActAgent',
+        'PlannerAgent',
     ]