Skip to content

Commit

Permalink
add actions db model and caching V0 (#980)
Browse files Browse the repository at this point in the history
  • Loading branch information
wintonzheng authored Oct 15, 2024
1 parent e7583ac commit 9048cdf
Show file tree
Hide file tree
Showing 19 changed files with 731 additions and 90 deletions.
81 changes: 81 additions & 0 deletions alembic/versions/2024_10_15_1903-137eee1d3b3e_actions_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""actions table
Revision ID: 137eee1d3b3e
Revises: 12fb2dede685
Create Date: 2024-10-15 19:03:29.086340+00:00
"""

from typing import Sequence, Union

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
# `revision` is this migration's own id; `down_revision` is the migration it
# is applied on top of (the "Revises" entry in the module docstring).
revision: str = "137eee1d3b3e"
down_revision: Union[str, None] = "12fb2dede685"
# No branch labels and no dependencies on other revisions are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the ``actions`` table and its indexes.

    The table is keyed by ``action_id`` and has foreign keys to
    ``organizations``, ``steps``, ``tasks`` and ``workflow_runs``, plus a
    self-referencing foreign key from ``source_action_id`` to
    ``actions.action_id``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "actions",
        sa.Column("action_id", sa.String(), nullable=False),
        sa.Column("action_type", sa.String(), nullable=False),
        sa.Column("source_action_id", sa.String(), nullable=True),
        sa.Column("organization_id", sa.String(), nullable=True),
        sa.Column("workflow_run_id", sa.String(), nullable=True),
        sa.Column("task_id", sa.String(), nullable=False),
        sa.Column("step_id", sa.String(), nullable=False),
        sa.Column("step_order", sa.Integer(), nullable=False),
        sa.Column("action_order", sa.Integer(), nullable=False),
        sa.Column("status", sa.String(), nullable=False),
        sa.Column("reasoning", sa.String(), nullable=True),
        sa.Column("intention", sa.String(), nullable=True),
        sa.Column("response", sa.String(), nullable=True),
        sa.Column("element_id", sa.String(), nullable=True),
        sa.Column("skyvern_element_hash", sa.String(), nullable=True),
        sa.Column("skyvern_element_data", sa.JSON(), nullable=True),
        sa.Column("action_json", sa.JSON(), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=False),
        sa.Column("modified_at", sa.DateTime(), nullable=False),
        sa.ForeignKeyConstraint(
            ["organization_id"],
            ["organizations.organization_id"],
        ),
        # Self-reference: an action may point at another row of this table.
        sa.ForeignKeyConstraint(
            ["source_action_id"],
            ["actions.action_id"],
        ),
        sa.ForeignKeyConstraint(
            ["step_id"],
            ["steps.step_id"],
        ),
        sa.ForeignKeyConstraint(
            ["task_id"],
            ["tasks.task_id"],
        ),
        sa.ForeignKeyConstraint(
            ["workflow_run_id"],
            ["workflow_runs.workflow_run_id"],
        ),
        sa.PrimaryKeyConstraint("action_id"),
    )
    # One composite index plus three single-column indexes; none are unique.
    op.create_index("action_org_task_step_index", "actions", ["organization_id", "task_id", "step_id"], unique=False)
    op.create_index(op.f("ix_actions_action_id"), "actions", ["action_id"], unique=False)
    op.create_index(op.f("ix_actions_source_action_id"), "actions", ["source_action_id"], unique=False)
    op.create_index(op.f("ix_actions_task_id"), "actions", ["task_id"], unique=False)
    # ### end Alembic commands ###


def downgrade() -> None:
    """Drop the four ``actions`` indexes, then the ``actions`` table itself."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Indexes are removed first, then the table they belong to.
    op.drop_index(op.f("ix_actions_task_id"), table_name="actions")
    op.drop_index(op.f("ix_actions_source_action_id"), table_name="actions")
    op.drop_index(op.f("ix_actions_action_id"), table_name="actions")
    op.drop_index("action_org_task_step_index", table_name="actions")
    op.drop_table("actions")
    # ### end Alembic commands ###
5 changes: 5 additions & 0 deletions skyvern/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,3 +490,8 @@ class IllegitComplete(SkyvernException):
def __init__(self, data: dict | None = None) -> None:
data_str = f", data={data}" if data else ""
super().__init__(f"Illegit complete{data_str}")


class CachedActionPlanError(SkyvernException):
    """Skyvern exception carrying a caller-supplied message.

    NOTE(review): judging by the name, this is meant for failures while using
    a cached action plan; the raising sites are not visible here - confirm.
    """

    def __init__(self, message: str) -> None:
        # The message is required and is forwarded unchanged to the base class.
        super().__init__(message)
54 changes: 37 additions & 17 deletions skyvern/forge/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,14 @@
WebAction,
parse_actions,
)
from skyvern.webeye.actions.handler import ActionHandler, handle_complete_action, poll_verification_code
from skyvern.webeye.actions.caching import retrieve_action_plan
from skyvern.webeye.actions.handler import (
ActionHandler,
extract_information_for_navigation_goal,
poll_verification_code,
)
from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
from skyvern.webeye.actions.responses import ActionResult
from skyvern.webeye.actions.responses import ActionResult, ActionSuccess
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website
from skyvern.webeye.utils.page import SkyvernFrame
Expand Down Expand Up @@ -553,7 +558,22 @@ async def agent_step(
detailed_agent_step_output.extract_action_prompt = extract_action_prompt
json_response = None
actions: list[Action]
if task.navigation_goal:

using_cached_action_plan = False
if not task.navigation_goal:
actions = [
CompleteAction(
reasoning="Task has no navigation goal.",
data_extraction_goal=task.data_extraction_goal,
)
]
elif (
task_block
and task_block.cache_actions
and (actions := await retrieve_action_plan(task, step, scraped_page))
):
using_cached_action_plan = True
else:
self.async_operation_pool.run_operation(task.task_id, AgentPhase.llm)
json_response = await app.LLM_API_HANDLER(
prompt=extract_action_prompt,
Expand All @@ -569,14 +589,8 @@ async def agent_step(
)
detailed_agent_step_output.llm_response = json_response

actions = parse_actions(task, json_response["actions"])
else:
actions = [
CompleteAction(
reasoning="Task has no navigation goal.",
data_extraction_goal=task.data_extraction_goal,
)
]
actions = parse_actions(task, step.step_id, step.order, scraped_page, json_response["actions"])

detailed_agent_step_output.actions = actions
if len(actions) == 0:
LOG.info(
Expand Down Expand Up @@ -621,7 +635,8 @@ async def agent_step(
wait_actions_to_skip = [action for action in actions if action.action_type == ActionType.WAIT]
wait_actions_len = len(wait_actions_to_skip)
# if there are wait actions and there are other actions in the list, skip wait actions
if wait_actions_len > 0 and wait_actions_len < len(actions):
# if we are using cached action plan, we don't skip wait actions
if wait_actions_len > 0 and wait_actions_len < len(actions) and not using_cached_action_plan:
actions = [action for action in actions if action.action_type != ActionType.WAIT]
LOG.info(
"Skipping wait actions",
Expand Down Expand Up @@ -871,12 +886,10 @@ async def check_user_goal_success(
navigation_payload=task.navigation_payload,
elements=scraped_page.build_element_tree(ElementTreeFormat.HTML),
)
screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url)

verification_llm_api_handler = app.SECONDARY_LLM_API_HANDLER

verification_response = await verification_llm_api_handler(
prompt=verification_prompt, step=step, screenshots=screenshots
prompt=verification_prompt, step=step, screenshots=None
)
if "user_goal_achieved" not in verification_response or "reasoning" not in verification_response:
LOG.error(
Expand All @@ -895,9 +908,16 @@ async def check_user_goal_success(
return None

LOG.info("User goal achieved, executing complete action")
action_results = await handle_complete_action(complete_action, page, scraped_page, task, step)
extracted_data = None
if complete_action.data_extraction_goal:
scrape_action_result = await extract_information_for_navigation_goal(
scraped_page=scraped_page,
task=task,
step=step,
)
extracted_data = scrape_action_result.scraped_data

return complete_action, action_results
return complete_action, [ActionSuccess(data=extracted_data)]

except Exception:
LOG.error("LLM verification failed for complete action, skipping LLM verification", exc_info=True)
Expand Down
25 changes: 25 additions & 0 deletions skyvern/forge/prompts/skyvern/answer-user-detail-questions.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
You will be given information about a user's goal and details.

Your job is to answer the user's questions based on the information provided.

The user's questions will be provided in JSON format.

Your answers should be direct and to the point. No need to explain the answer.

Your response should be in JSON format. Fill in the answer for each question and return the completed JSON.

User's goal: {{ navigation_goal }}

User's details: {{ navigation_payload }}

User's questions: {{ queries_and_answers }}

YOUR RESPONSE HAS TO BE IN JSON FORMAT. DO NOT RETURN ANYTHING ELSE.
THESE ANSWERS WILL BE USED TO FILL OUT INFORMATION ON A WEBPAGE. DO NOT INCLUDE ANY UNRELATED INFORMATION OR UNNECESSARY DETAILS IN YOUR ANSWERS.

EXAMPLE RESPONSE FORMAT:
{
"question_1": "answer_1",
"question_2": "answer_2",
"question_3": "answer_3"
}
8 changes: 4 additions & 4 deletions skyvern/forge/prompts/skyvern/check-user-goal.j2
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Based on the content of the screenshot and the elements on the page, determine whether the user goal has been successfully completed or not.
Based on the content of the elements on the page, determine whether the user goal has been successfully completed or not.

The JSON object should be in this format:
```json
Expand All @@ -7,15 +7,15 @@ The JSON object should be in this format:
"user_goal_achieved": bool // True if the user goal has been completed, False otherwise.
}

Make sure to ONLY return the JSON object, with no additional text before or after it. Do not make any assumptions based on the screenshot, return a response solely based on what you observe in the screenshot and nothing else.
Make sure to ONLY return the JSON object, with no additional text before or after it. Do not make any assumptions, return a response solely based on the elements on the page.

Examples:
{
"reasoning": "The screenshot shows a success message for a file upload field. Since the user's goal is to upload a file, it has been successfully completed.",
"reasoning": "There is a success message for a file upload field. Since the user's goal is to upload a file, it has been successfully completed.",
"user_goal_achieved": true
}
{
"reasoning": "The screenshot shows a job application form with fields. Since the user's goal is to submit a job application, it has not been successfully completed.",
"reasoning": "This is a job application form with fields. Since the user's goal is to submit a job application, it has not been successfully completed.",
"user_goal_achieved": false
}

Expand Down
4 changes: 3 additions & 1 deletion skyvern/forge/prompts/skyvern/extract-action.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ Reply in JSON format with the following keys:
"action_plan": str, // A string that describes the plan of actions you're going to take. Be specific and to the point. Use this as a quick summary of the actions you're going to take, and what order you're going to take them in, and how that moves you towards your overall goal. Output "COMPLETE" action in the "actions" if user_goal_achieved is True.
"actions": array // An array of actions. Here's the format of each action:
[{
"reasoning": str, // The reasoning behind the action. Be specific, referencing any user information and their fields and element ids in your reasoning. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point.
"reasoning": str, // The reasoning behind the action. This reasoning must be user information agnostic. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point.
"user_detail_query": str, // Think of this value as a Jeopardy question. Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Otherwise, use null. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, use null.
"user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details.
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
"action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
"id": str, // The id of the element to take action on. The id has to be one from the elements list
Expand Down
8 changes: 8 additions & 0 deletions skyvern/forge/sdk/api/crypto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import hashlib


def calculate_sha256(data: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``data``.

    The string is converted to bytes with ``str.encode()`` (default encoding,
    UTF-8) before it is hashed.
    """
    return hashlib.sha256(data.encode()).hexdigest()
2 changes: 1 addition & 1 deletion skyvern/forge/sdk/api/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def rename_file(file_path: str, new_file_name: str) -> str:
return file_path


def calculate_sha256(file_path: str) -> str:
def calculate_sha256_for_file(file_path: str) -> str:
"""Helper function to calculate SHA256 hash of a file."""
sha256_hash = hashlib.sha256()
with open(file_path, "rb") as f:
Expand Down
58 changes: 58 additions & 0 deletions skyvern/forge/sdk/db/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from skyvern.forge.sdk.db.enums import OrganizationAuthTokenType
from skyvern.forge.sdk.db.exceptions import NotFoundError
from skyvern.forge.sdk.db.models import (
ActionModel,
ArtifactModel,
AWSSecretParameterModel,
BitwardenCreditCardDataParameterModel,
Expand Down Expand Up @@ -68,6 +69,7 @@
WorkflowRunParameter,
WorkflowRunStatus,
)
from skyvern.webeye.actions.actions import Action
from skyvern.webeye.actions.models import AgentStepOutput

LOG = structlog.get_logger()
Expand Down Expand Up @@ -1571,3 +1573,59 @@ async def get_totp_codes(
)
totp_code = (await session.scalars(query)).all()
return [TOTPCode.model_validate(totp_code) for totp_code in totp_code]

async def create_action(self, action: Action) -> Action:
    """Insert ``action`` as a new ``ActionModel`` row and return the stored row as an ``Action``.

    Most columns are copied from the same-named attributes of ``action``;
    ``action_json`` receives the full ``action.model_dump()``. The row is
    committed and refreshed before being converted back with
    ``Action.model_validate``.

    NOTE(review): ``action_id``, ``created_at`` and ``modified_at`` are not set
    here - presumably the model supplies defaults; confirm against ActionModel.
    """
    # Attributes that map one-to-one onto ActionModel columns.
    copied_fields = (
        "action_type",
        "source_action_id",
        "organization_id",
        "workflow_run_id",
        "task_id",
        "step_id",
        "step_order",
        "action_order",
        "status",
        "reasoning",
        "intention",
        "response",
        "element_id",
        "skyvern_element_hash",
        "skyvern_element_data",
    )
    async with self.Session() as session:
        column_values = {field: getattr(action, field) for field in copied_fields}
        row = ActionModel(**column_values, action_json=action.model_dump())
        session.add(row)
        await session.commit()
        # Reload so that database-populated values are present on the row.
        await session.refresh(row)
        return Action.model_validate(row)

async def retrieve_action_plan(self, task: Task) -> list[Action]:
    """Return the recorded actions of the most recent matching completed task.

    A task matches when it belongs to the same organization as ``task``, has
    the same ``url`` and ``navigation_goal``, and has status
    ``TaskStatus.completed``. Of the matching tasks, only the newest (by
    ``created_at``) is used.

    Args:
        task: The task for which a previously recorded plan is wanted.

    Returns:
        The source task's actions ordered by step order, action order and
        creation time; an empty list when no matching task exists.
    """
    async with self.Session() as session:
        latest_matching_task = (
            select(TaskModel.task_id)
            # Scope the lookup to the caller's organization so that one
            # organization's recorded actions are never replayed for another.
            .filter(TaskModel.organization_id == task.organization_id)
            .filter(TaskModel.url == task.url)
            .filter(TaskModel.navigation_goal == task.navigation_goal)
            .filter(TaskModel.status == TaskStatus.completed)
            .order_by(TaskModel.created_at.desc())
            .limit(1)
            .subquery()
        )

        query = (
            select(ActionModel)
            .filter(ActionModel.task_id == latest_matching_task.c.task_id)
            .order_by(ActionModel.step_order, ActionModel.action_order, ActionModel.created_at)
        )

        actions = (await session.scalars(query)).all()
        return [Action.model_validate(action) for action in actions]

async def get_previous_actions_for_task(self, task_id: str) -> list[Action]:
    """Return every stored action of ``task_id`` as ``Action`` objects.

    Results are ordered by step order, then action order, then creation time.
    """
    ordering = (ActionModel.step_order, ActionModel.action_order, ActionModel.created_at)
    statement = select(ActionModel).filter_by(task_id=task_id).order_by(*ordering)
    async with self.Session() as session:
        result = await session.scalars(statement)
        rows = result.all()
        return [Action.model_validate(row) for row in rows]
5 changes: 5 additions & 0 deletions skyvern/forge/sdk/db/id.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ def generate_totp_code_id() -> str:
return f"totp_{int_id}"


def generate_action_id() -> str:
    """Return a new action identifier: ``a_`` followed by a value from ``generate_id()``."""
    return f"a_{generate_id()}"


def generate_id() -> int:
"""
generate a 64-bit int ID
Expand Down
Loading

0 comments on commit 9048cdf

Please sign in to comment.