add actions db model and caching V0 (#980)

Skyvern-AI · Oct 15, 2024 · 9048cdf · 9048cdf
1 parent e7583ac
commit 9048cdf
Show file tree

Hide file tree

Showing 19 changed files with 731 additions and 90 deletions.
diff --git a/alembic/versions/2024_10_15_1903-137eee1d3b3e_actions_table.py b/alembic/versions/2024_10_15_1903-137eee1d3b3e_actions_table.py
@@ -0,0 +1,81 @@
+"""actions table
+
+Revision ID: 137eee1d3b3e
+Revises: 12fb2dede685
+Create Date: 2024-10-15 19:03:29.086340+00:00
+
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "137eee1d3b3e"
+down_revision: Union[str, None] = "12fb2dede685"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Create the ``actions`` table together with its indexes.
+
+    Each row must reference a task and a step (``task_id`` / ``step_id`` are
+    non-nullable); the organization, workflow run and source action links are
+    optional. ``source_action_id`` is a self-referential foreign key back to
+    ``actions.action_id``.
+    """
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "actions",
+        sa.Column("action_id", sa.String(), nullable=False),
+        sa.Column("action_type", sa.String(), nullable=False),
+        sa.Column("source_action_id", sa.String(), nullable=True),
+        sa.Column("organization_id", sa.String(), nullable=True),
+        sa.Column("workflow_run_id", sa.String(), nullable=True),
+        sa.Column("task_id", sa.String(), nullable=False),
+        sa.Column("step_id", sa.String(), nullable=False),
+        sa.Column("step_order", sa.Integer(), nullable=False),
+        sa.Column("action_order", sa.Integer(), nullable=False),
+        sa.Column("status", sa.String(), nullable=False),
+        sa.Column("reasoning", sa.String(), nullable=True),
+        sa.Column("intention", sa.String(), nullable=True),
+        sa.Column("response", sa.String(), nullable=True),
+        sa.Column("element_id", sa.String(), nullable=True),
+        sa.Column("skyvern_element_hash", sa.String(), nullable=True),
+        sa.Column("skyvern_element_data", sa.JSON(), nullable=True),
+        sa.Column("action_json", sa.JSON(), nullable=True),
+        sa.Column("created_at", sa.DateTime(), nullable=False),
+        sa.Column("modified_at", sa.DateTime(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["organization_id"],
+            ["organizations.organization_id"],
+        ),
+        # Self-reference: an action may point at the action it was derived from.
+        sa.ForeignKeyConstraint(
+            ["source_action_id"],
+            ["actions.action_id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["step_id"],
+            ["steps.step_id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["task_id"],
+            ["tasks.task_id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["workflow_run_id"],
+            ["workflow_runs.workflow_run_id"],
+        ),
+        sa.PrimaryKeyConstraint("action_id"),
+    )
+    # Composite index covering lookups by organization, task and step.
+    op.create_index("action_org_task_step_index", "actions", ["organization_id", "task_id", "step_id"], unique=False)
+    # NOTE(review): action_id is already the primary key, so this extra non-unique
+    # index looks redundant — confirm against the model definition before changing it.
+    op.create_index(op.f("ix_actions_action_id"), "actions", ["action_id"], unique=False)
+    op.create_index(op.f("ix_actions_source_action_id"), "actions", ["source_action_id"], unique=False)
+    op.create_index(op.f("ix_actions_task_id"), "actions", ["task_id"], unique=False)
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Revert ``upgrade``: drop the four ``actions`` indexes, then the table itself."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    # Indexes are dropped in the reverse of the order in which upgrade() created them.
+    op.drop_index(op.f("ix_actions_task_id"), table_name="actions")
+    op.drop_index(op.f("ix_actions_source_action_id"), table_name="actions")
+    op.drop_index(op.f("ix_actions_action_id"), table_name="actions")
+    op.drop_index("action_org_task_step_index", table_name="actions")
+    op.drop_table("actions")
+    # ### end Alembic commands ###
diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py
@@ -490,3 +490,8 @@ class IllegitComplete(SkyvernException):
     def __init__(self, data: dict | None = None) -> None:
         data_str = f", data={data}" if data else ""
         super().__init__(f"Illegit complete{data_str}")
+
+
+class CachedActionPlanError(SkyvernException):
+    """Error related to cached action plans (raise sites are outside this hunk).
+
+    The caller-supplied ``message`` is forwarded unchanged to ``SkyvernException``.
+    """
+
+    def __init__(self, message: str) -> None:
+        super().__init__(message)
diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py
@@ -51,9 +51,14 @@
     WebAction,
     parse_actions,
 )
-from skyvern.webeye.actions.handler import ActionHandler, handle_complete_action, poll_verification_code
+from skyvern.webeye.actions.caching import retrieve_action_plan
+from skyvern.webeye.actions.handler import (
+    ActionHandler,
+    extract_information_for_navigation_goal,
+    poll_verification_code,
+)
 from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
-from skyvern.webeye.actions.responses import ActionResult
+from skyvern.webeye.actions.responses import ActionResult, ActionSuccess
 from skyvern.webeye.browser_factory import BrowserState
 from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website
 from skyvern.webeye.utils.page import SkyvernFrame
@@ -553,7 +558,22 @@ async def agent_step(
             detailed_agent_step_output.extract_action_prompt = extract_action_prompt
             json_response = None
             actions: list[Action]
-            if task.navigation_goal:
+
+            using_cached_action_plan = False
+            if not task.navigation_goal:
+                actions = [
+                    CompleteAction(
+                        reasoning="Task has no navigation goal.",
+                        data_extraction_goal=task.data_extraction_goal,
+                    )
+                ]
+            elif (
+                task_block
+                and task_block.cache_actions
+                and (actions := await retrieve_action_plan(task, step, scraped_page))
+            ):
+                using_cached_action_plan = True
+            else:
                 self.async_operation_pool.run_operation(task.task_id, AgentPhase.llm)
                 json_response = await app.LLM_API_HANDLER(
                     prompt=extract_action_prompt,
@@ -569,14 +589,8 @@ async def agent_step(
                 )
                 detailed_agent_step_output.llm_response = json_response
 
-                actions = parse_actions(task, json_response["actions"])
-            else:
-                actions = [
-                    CompleteAction(
-                        reasoning="Task has no navigation goal.",
-                        data_extraction_goal=task.data_extraction_goal,
-                    )
-                ]
+                actions = parse_actions(task, step.step_id, step.order, scraped_page, json_response["actions"])
+
             detailed_agent_step_output.actions = actions
             if len(actions) == 0:
                 LOG.info(
@@ -621,7 +635,8 @@ async def agent_step(
                 wait_actions_to_skip = [action for action in actions if action.action_type == ActionType.WAIT]
                 wait_actions_len = len(wait_actions_to_skip)
                 # if there are wait actions and there are other actions in the list, skip wait actions
-                if wait_actions_len > 0 and wait_actions_len < len(actions):
+                # if we are using cached action plan, we don't skip wait actions
+                if wait_actions_len > 0 and wait_actions_len < len(actions) and not using_cached_action_plan:
                     actions = [action for action in actions if action.action_type != ActionType.WAIT]
                     LOG.info(
                         "Skipping wait actions",
@@ -871,12 +886,10 @@ async def check_user_goal_success(
                 navigation_payload=task.navigation_payload,
                 elements=scraped_page.build_element_tree(ElementTreeFormat.HTML),
             )
-            screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url)
-
             verification_llm_api_handler = app.SECONDARY_LLM_API_HANDLER
 
             verification_response = await verification_llm_api_handler(
-                prompt=verification_prompt, step=step, screenshots=screenshots
+                prompt=verification_prompt, step=step, screenshots=None
             )
             if "user_goal_achieved" not in verification_response or "reasoning" not in verification_response:
                 LOG.error(
@@ -895,9 +908,16 @@ async def check_user_goal_success(
                 return None
 
             LOG.info("User goal achieved, executing complete action")
-            action_results = await handle_complete_action(complete_action, page, scraped_page, task, step)
+            extracted_data = None
+            if complete_action.data_extraction_goal:
+                scrape_action_result = await extract_information_for_navigation_goal(
+                    scraped_page=scraped_page,
+                    task=task,
+                    step=step,
+                )
+                extracted_data = scrape_action_result.scraped_data
 
-            return complete_action, action_results
+            return complete_action, [ActionSuccess(data=extracted_data)]
 
         except Exception:
             LOG.error("LLM verification failed for complete action, skipping LLM verification", exc_info=True)

diff --git a/skyvern/forge/prompts/skyvern/answer-user-detail-questions.j2 b/skyvern/forge/prompts/skyvern/answer-user-detail-questions.j2
@@ -0,0 +1,25 @@
+You will be given information about a user's goal and details. 
+
+Your job is to answer the user's questions based on the information provided.
+
+The user's questions will be provided in JSON format.
+
+Your answers should be direct and to the point. No need to explain the answer.
+
+Your response should be in JSON format. Basically fill in the answer part and return the JSON.
+
+User's goal: {{ navigation_goal }}
+
+User's details: {{ navigation_payload }}
+
+User's questions: {{ queries_and_answers }}
+
+YOUR RESPONSE HAS TO BE IN JSON FORMAT. DO NOT RETURN ANYTHING ELSE. 
+THESE ANSWERS WILL BE USED TO FILL OUT INFORMATION ON A WEBPAGE. DO NOT INCLUDE ANY UNRELATED INFORMATION OR UNNECESSARY DETAILS IN YOUR ANSWERS.
+
+EXAMPLE RESPONSE FORMAT:
+{
+  "question_1": "answer_1",
+  "question_2": "answer_2",
+  "question_3": "answer_3"
+}
diff --git a/skyvern/forge/prompts/skyvern/check-user-goal.j2 b/skyvern/forge/prompts/skyvern/check-user-goal.j2
@@ -1,4 +1,4 @@
-Based on the content of the screenshot and the elements on the page, determine whether the user goal has been successfully completed or not.
+Based on the content of the elements on the page, determine whether the user goal has been successfully completed or not.
 
 The JSON object should be in this format:
 ```json
@@ -7,15 +7,15 @@ The JSON object should be in this format:
   "user_goal_achieved": bool // True if the user goal has been completed, False otherwise.
 }
 
-Make sure to ONLY return the JSON object, with no additional text before or after it. Do not make any assumptions based on the screenshot, return a response solely based on what you observe in the screenshot and nothing else.
+Make sure to ONLY return the JSON object, with no additional text before or after it. Do not make any assumptions, return a response solely based on the elements on the page.
 
 Examples:
 {
-  "reasoning": "The screenshot shows a success message for a file upload field. Since the user's goal is to upload a file, it has been successfully completed.",
+  "reasoning": "There is a success message for a file upload field. Since the user's goal is to upload a file, it has been successfully completed.",
   "user_goal_achieved": true
 }
 {
-  "reasoning": "The screenshot shows a job application form with fields. Since the user's goal is to submit a job application, it has not been successfully completed.",
+  "reasoning": "This is a job application form with fields. Since the user's goal is to submit a job application, it has not been successfully completed.",
   "user_goal_achieved": false
 }
 

diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2
@@ -14,7 +14,9 @@ Reply in JSON format with the following keys:
     "action_plan": str, // A string that describes the plan of actions you're going to take. Be specific and to the point. Use this as a quick summary of the actions you're going to take, and what order you're going to take them in, and how that moves you towards your overall goal. Output "COMPLETE" action in the "actions" if user_goal_achieved is True.
     "actions": array // An array of actions. Here's the format of each action:
     [{
-        "reasoning": str, // The reasoning behind the action. Be specific, referencing any user information and their fields and element ids in your reasoning. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point.
+        "reasoning": str, // The reasoning behind the action. This reasoning must be user information agnostic. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point.
+        "user_detail_query": str, // Think of this value as a Jeopardy question. Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Otherwise, use null. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, use null.
+        "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details.
         "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
         "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
         "id": str, // The id of the element to take action on. The id has to be one from the elements list

diff --git a/skyvern/forge/sdk/api/crypto.py b/skyvern/forge/sdk/api/crypto.py
@@ -0,0 +1,8 @@
+import hashlib
+
+
+def calculate_sha256(data: str) -> str:
+    """Return the hexadecimal SHA-256 digest of ``data``.
+
+    The string is converted to bytes with ``str.encode()`` (default encoding)
+    before it is hashed.
+    """
+    return hashlib.sha256(data.encode()).hexdigest()
diff --git a/skyvern/forge/sdk/api/files.py b/skyvern/forge/sdk/api/files.py
@@ -113,7 +113,7 @@ def rename_file(file_path: str, new_file_name: str) -> str:
         return file_path
 
 
-def calculate_sha256(file_path: str) -> str:
+def calculate_sha256_for_file(file_path: str) -> str:
     """Helper function to calculate SHA256 hash of a file."""
     sha256_hash = hashlib.sha256()
     with open(file_path, "rb") as f:

diff --git a/skyvern/forge/sdk/db/client.py b/skyvern/forge/sdk/db/client.py
@@ -13,6 +13,7 @@
 from skyvern.forge.sdk.db.enums import OrganizationAuthTokenType
 from skyvern.forge.sdk.db.exceptions import NotFoundError
 from skyvern.forge.sdk.db.models import (
+    ActionModel,
     ArtifactModel,
     AWSSecretParameterModel,
     BitwardenCreditCardDataParameterModel,
@@ -68,6 +69,7 @@
     WorkflowRunParameter,
     WorkflowRunStatus,
 )
+from skyvern.webeye.actions.actions import Action
 from skyvern.webeye.actions.models import AgentStepOutput
 
 LOG = structlog.get_logger()
@@ -1571,3 +1573,59 @@ async def get_totp_codes(
             )
             totp_code = (await session.scalars(query)).all()
             return [TOTPCode.model_validate(totp_code) for totp_code in totp_code]
+
+    async def create_action(self, action: Action) -> Action:
+        """Persist ``action`` as a new ``ActionModel`` row and return the stored row as an ``Action``.
+
+        The individual columns are copied from the matching attributes of
+        ``action``; in addition the full ``action.model_dump()`` is stored in
+        ``action_json``.
+        """
+        async with self.Session() as session:
+            # NOTE(review): action_id is not passed here — presumably ActionModel
+            # supplies it through a column default; confirm against the model.
+            new_action = ActionModel(
+                action_type=action.action_type,
+                source_action_id=action.source_action_id,
+                organization_id=action.organization_id,
+                workflow_run_id=action.workflow_run_id,
+                task_id=action.task_id,
+                step_id=action.step_id,
+                step_order=action.step_order,
+                action_order=action.action_order,
+                status=action.status,
+                reasoning=action.reasoning,
+                intention=action.intention,
+                response=action.response,
+                element_id=action.element_id,
+                skyvern_element_hash=action.skyvern_element_hash,
+                skyvern_element_data=action.skyvern_element_data,
+                action_json=action.model_dump(),
+            )
+            session.add(new_action)
+            await session.commit()
+            # Reload the row after the commit so the returned object is built
+            # from the persisted state.
+            await session.refresh(new_action)
+            return Action.model_validate(new_action)
+
+    async def retrieve_action_plan(self, task: Task) -> list[Action]:
+        """Return the stored actions of the latest completed task matching ``task``.
+
+        A task matches when it has the same ``url`` and ``navigation_goal`` as
+        ``task`` and its status is ``TaskStatus.completed``; of those, only the
+        most recently created one is used. Its actions are returned ordered by
+        step order, action order, then creation time. The list is empty when no
+        matching task (or no action for it) exists.
+        """
+        async with self.Session() as session:
+            # NOTE(review): the lookup is keyed only on url, navigation goal and
+            # status — no organization filter is applied here. Confirm whether
+            # reusing another organization's action plan is intended.
+            subquery = (
+                select(TaskModel.task_id)
+                .filter(TaskModel.url == task.url)
+                .filter(TaskModel.navigation_goal == task.navigation_goal)
+                .filter(TaskModel.status == TaskStatus.completed)
+                .order_by(TaskModel.created_at.desc())
+                .limit(1)
+                .subquery()
+            )
+
+            # Fetch the actions that belong to the single task selected above.
+            query = (
+                select(ActionModel)
+                .filter(ActionModel.task_id == subquery.c.task_id)
+                .order_by(ActionModel.step_order, ActionModel.action_order, ActionModel.created_at)
+            )
+
+            actions = (await session.scalars(query)).all()
+            return [Action.model_validate(action) for action in actions]
+
+    async def get_previous_actions_for_task(self, task_id: str) -> list[Action]:
+        """Return all stored actions of ``task_id``, ordered by step order, action order, then creation time."""
+        stmt = select(ActionModel).filter_by(task_id=task_id)
+        stmt = stmt.order_by(ActionModel.step_order, ActionModel.action_order, ActionModel.created_at)
+        async with self.Session() as session:
+            rows = (await session.scalars(stmt)).all()
+            return [Action.model_validate(row) for row in rows]
diff --git a/skyvern/forge/sdk/db/id.py b/skyvern/forge/sdk/db/id.py
@@ -130,6 +130,11 @@ def generate_totp_code_id() -> str:
     return f"totp_{int_id}"
 
 
+def generate_action_id() -> str:
+    """Return a new action identifier: the prefix ``a_`` followed by a freshly generated integer ID."""
+    return f"a_{generate_id()}"
+
+
 def generate_id() -> int:
     """
     generate a 64-bit int ID