Merge pull request #921 from julep-ai/f/spider

feat(integrations): spider api call to support async + removed duplicate code + readme update
julep-ai · Dec 7, 2024 · fcc02f1 · fcc02f1
2 parents 1ae7ab0 + 7b91ef3
commit fcc02f1
Show file tree

Hide file tree

Showing 55 changed files with 831 additions and 261 deletions.
diff --git a/.github/workflows/lint-agents-api-pr.yml b/.github/workflows/lint-agents-api-pr.yml
@@ -17,10 +17,7 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Install libboost
-        uses: awalsh128/cache-apt-pkgs-action@latest
-        with:
-          packages: libboost-all-dev
-          version: 1.0
+        run: sudo apt-get install -y libboost-all-dev
 
       - name: Install uv
         uses: astral-sh/setup-uv@v4

diff --git a/.github/workflows/test-agents-api-pr.yml b/.github/workflows/test-agents-api-pr.yml
@@ -17,10 +17,7 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Install libboost
-        uses: awalsh128/cache-apt-pkgs-action@latest
-        with:
-          packages: libboost-all-dev
-          version: 1.0
+        run: sudo apt-get install -y libboost-all-dev
 
       - name: Install uv
         uses: astral-sh/setup-uv@v4

diff --git a/.github/workflows/typecheck-agents-api-pr.yml b/.github/workflows/typecheck-agents-api-pr.yml
@@ -17,10 +17,7 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Install libboost
-        uses: awalsh128/cache-apt-pkgs-action@latest
-        with:
-          packages: libboost-all-dev
-          version: 1.0
+        run: sudo apt-get install -y libboost-all-dev
 
       - name: Cache pytype
         uses: actions/cache@v4

diff --git a/README.md b/README.md
@@ -1281,7 +1281,7 @@ arguments:
   query: string # The search query for searching with Brave
 
 output:
-  result: string # The result of the Brave Search
+  result: list[dict] # A list of search results, each containing: title, link, snippet
 ```
 
 </td>
@@ -1356,11 +1356,11 @@ setup:
 
 arguments:
   url: string # The URL for which to fetch data
-  mode: string # The type of crawlers (default: "scrape")
   params: dict # (Optional) The parameters for the Spider API
+  content_type: string # (Optional) The content type to return. Default is "application/json". Other options: "text/csv", "application/xml", "application/jsonl"
 
 output:
-  documents: list # The documents returned from the spider
+  result: list[dict] # A list of results, each containing: content, error, status, costs, url
 ```
 
 </td>
@@ -1452,13 +1452,18 @@ arguments:
   base64: boolean # Whether the input file is base64 encoded. Default is false.
 
 output:
-  documents: list # The parsed data from the document
+  documents: list[Document] # A list of parsed documents
 ```
 
 </td>
+<td>
+
+**Example cookbook**: [cookbooks/07-personalized-research-assistant.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/07-personalized-research-assistant.ipynb)
 
+</td>
 </tr>
 
+
 <tr>
 <td> <b>Cloudinary</b> </td>
 <td>
@@ -1489,14 +1494,44 @@ output:
 ```
 
 </td>
-
 <td>
 
 **Example cookbook**: [cookbooks/05-video-processing-with-natural-language.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/05-video-processing-with-natural-language.ipynb)
 
 </td>
 </tr>
 
+<tr>
+<td> <b>Arxiv</b> </td>
+<td>
+
+```yaml
+method: search # The method to use for the Arxiv integration
+
+setup:
+  # No specific setup parameters are required for Arxiv
+
+arguments:
+  query: string # The search query for searching with Arxiv
+  id_list: list[string] | None # (Optional) The list of Arxiv IDs to search with
+  max_results: integer # The maximum number of results to return, must be between 1 and 300000
+  download_pdf: boolean # Whether to download the PDF of the results. Default is false.
+  sort_by: string # The sort criterion for the results, options: relevance, lastUpdatedDate, submittedDate
+  sort_order: string # The sort order for the results, options: ascending, descending
+
+output:
+  result: list[dict] # A list of search results, each containing: entry_id, title, updated, published, authors, summary, comment, journal_ref, doi, primary_category, categories, links, pdf_url, pdf_downloaded
+```
+
+</td>
+
+<td>
+
+**Example cookbook**: [cookbooks/07-personalized-research-assistant.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/07-personalized-research-assistant.ipynb)
+
+</td>
+</tr>
+
 </table>
 
 For more details, refer to our [Integrations Documentation](#integrations).
@@ -1674,4 +1709,4 @@ Choose Julep when you need a robust framework for stateful agents with advanced
     <a href="#-table-of-contents">
         <img src="https://img.shields.io/badge/Table%20of%20Contents-000000?style=for-the-badge&logo=github&logoColor=white" alt="Table of Contents">
     </a>
-</div>
+</div>
diff --git a/agents-api/agents_api/activities/embed_docs.py b/agents-api/agents_api/activities/embed_docs.py
@@ -18,26 +18,42 @@
 async def embed_docs(
     payload: EmbedDocsPayload, cozo_client=None, max_batch_size: int = 100
 ) -> None:
-    indices, snippets = list(zip(*enumerate(payload.content)))
-    batched_snippets = batched(snippets, max_batch_size)
+    # Create batches of both indices and snippets together
+    indexed_snippets = list(enumerate(payload.content))
+    # Batch snippets into groups of max_batch_size for parallel processing
+    batched_indexed_snippets = list(batched(indexed_snippets, max_batch_size))
+    # Get embedding instruction and title from payload, defaulting to empty strings
     embed_instruction: str = payload.embed_instruction or ""
     title: str = payload.title or ""
 
-    async def embed_batch(snippets):
-        return await litellm.aembedding(
+    # Helper function to embed a batch of snippets
+    async def embed_batch(indexed_batch):
+        # Split indices and snippets for the batch
+        batch_indices, batch_snippets = zip(*indexed_batch)
+        embeddings = await litellm.aembedding(
             inputs=[
-                (
-                    embed_instruction + (title + "\n\n" + snippet) if title else snippet
-                ).strip()
-                for snippet in snippets
-            ]
+                ((title + "\n\n" + snippet) if title else snippet).strip()
+                for snippet in batch_snippets
+            ],
+            embed_instruction=embed_instruction,
         )
+        return list(zip(batch_indices, embeddings))
 
-    embeddings = reduce(
+    # Gather embeddings with their corresponding indices
+    indexed_embeddings = reduce(
         operator.add,
-        await asyncio.gather(*[embed_batch(snippets) for snippets in batched_snippets]),
+        await asyncio.gather(
+            *[embed_batch(batch) for batch in batched_indexed_snippets]
+        ),
     )
 
+    # Split indices and embeddings after all batches are processed
+    indices, embeddings = zip(*indexed_embeddings)
+
+    # Convert to lists since embed_snippets_query expects list types
+    indices = list(indices)
+    embeddings = list(embeddings)
+
     embed_snippets_query(
         developer_id=payload.developer_id,
         doc_id=payload.doc_id,

diff --git a/agents-api/agents_api/activities/execute_integration.py b/agents-api/agents_api/activities/execute_integration.py
@@ -6,7 +6,7 @@
 from ..autogen.openapi_model import BaseIntegrationDef
 from ..clients import integrations
 from ..common.exceptions.tools import IntegrationExecutionException
-from ..common.protocol.tasks import StepContext
+from ..common.protocol.tasks import ExecutionInput, StepContext
 from ..common.storage_handler import auto_blob_store
 from ..env import testing
 from ..models.tools import get_tool_args_from_metadata
@@ -21,6 +21,9 @@ async def execute_integration(
     arguments: dict[str, Any],
     setup: dict[str, Any] = {},
 ) -> Any:
+    if not isinstance(context.execution_input, ExecutionInput):
+        raise TypeError("Expected ExecutionInput type for context.execution_input")
+
     developer_id = context.execution_input.developer_id
     agent_id = context.execution_input.agent.id
     task_id = context.execution_input.task.id

diff --git a/agents-api/agents_api/activities/execute_system.py b/agents-api/agents_api/activities/execute_system.py
@@ -6,6 +6,7 @@
 
 from beartype import beartype
 from box import Box, BoxList
+from fastapi import HTTPException
 from fastapi.background import BackgroundTasks
 from temporalio import activity
 
@@ -18,7 +19,7 @@
     TextOnlyDocSearchRequest,
     VectorDocSearchRequest,
 )
-from ..common.protocol.tasks import StepContext
+from ..common.protocol.tasks import ExecutionInput, StepContext
 from ..common.storage_handler import auto_blob_store, load_from_blob_store_if_remote
 from ..env import testing
 from ..models.developer import get_developer
@@ -40,6 +41,9 @@ async def execute_system(
     if set(arguments.keys()) == {"bucket", "key"}:
         arguments = await load_from_blob_store_if_remote(arguments)
 
+    if not isinstance(context.execution_input, ExecutionInput):
+        raise TypeError("Expected ExecutionInput type for context.execution_input")
+
     arguments["developer_id"] = context.execution_input.developer_id
 
     # Unbox all the arguments

diff --git a/agents-api/agents_api/activities/task_steps/prompt_step.py b/agents-api/agents_api/activities/task_steps/prompt_step.py
@@ -7,7 +7,7 @@
 from ...clients import (
     litellm,  # We dont directly import `acompletion` so we can mock it
 )
-from ...common.protocol.tasks import StepContext, StepOutcome
+from ...common.protocol.tasks import ExecutionInput, StepContext, StepOutcome
 from ...common.storage_handler import auto_blob_store
 from ...common.utils.template import render_template
 from ...env import debug
@@ -98,6 +98,9 @@ async def prompt_step(context: StepContext) -> StepOutcome:
             skip_vars=["developer_id"],
         )
 
+    if not isinstance(context.execution_input, ExecutionInput):
+        raise TypeError("Expected ExecutionInput type for context.execution_input")
+
     # Get settings and run llm
     agent_default_settings: dict = (
         context.execution_input.agent.default_settings.model_dump()

diff --git a/agents-api/agents_api/activities/task_steps/transition_step.py b/agents-api/agents_api/activities/task_steps/transition_step.py
@@ -1,12 +1,13 @@
 import asyncio
+from typing import cast
 
 from beartype import beartype
 from fastapi import HTTPException
 from temporalio import activity
 
 from ...autogen.openapi_model import CreateTransitionRequest, Transition
 from ...clients.temporal import get_workflow_handle
-from ...common.protocol.tasks import StepContext
+from ...common.protocol.tasks import ExecutionInput, StepContext
 from ...common.storage_handler import load_from_blob_store_if_remote
 from ...env import temporal_activity_after_retry_timeout, testing
 from ...exceptions import LastErrorInput, TooManyRequestsError
@@ -38,6 +39,9 @@ async def transition_step(
         transition_info.output
     )
 
+    if not isinstance(context.execution_input, ExecutionInput):
+        raise TypeError("Expected ExecutionInput type for context.execution_input")
+
     # Create transition
     try:
         transition = await create_execution_transition_async(
@@ -50,12 +54,11 @@ async def transition_step(
         )
 
     except Exception as e:
-        if isinstance(e, HTTPException) and e.status_code == 429:
+        if isinstance(e, HTTPException) and cast(HTTPException, e).status_code == 429:
             await wf_handle.signal(
                 TaskExecutionWorkflow.set_last_error,
                 LastErrorInput(last_error=TooManyRequestsError()),
             )
-
         raise e
 
     return transition

diff --git a/agents-api/agents_api/activities/task_steps/yield_step.py b/agents-api/agents_api/activities/task_steps/yield_step.py
@@ -4,7 +4,7 @@
 from temporalio import activity
 
 from ...autogen.openapi_model import TransitionTarget, YieldStep
-from ...common.protocol.tasks import StepContext, StepOutcome
+from ...common.protocol.tasks import ExecutionInput, StepContext, StepOutcome
 from ...common.storage_handler import auto_blob_store
 from ...env import testing
 from .base_evaluate import base_evaluate
@@ -16,6 +16,9 @@ async def yield_step(context: StepContext) -> StepOutcome:
     try:
         assert isinstance(context.current_step, YieldStep)
 
+        if not isinstance(context.execution_input, ExecutionInput):
+            raise TypeError("Expected ExecutionInput type for context.execution_input")
+
         all_workflows = context.execution_input.task.workflows
         workflow = context.current_step.workflow
         exprs = context.current_step.arguments

diff --git a/agents-api/agents_api/activities/utils.py b/agents-api/agents_api/activities/utils.py
@@ -17,13 +17,22 @@
 from simpleeval import EvalWithCompoundTypes, SimpleEval
 
 from ..autogen.openapi_model import SystemDef
+from ..common.nlp import nlp
 from ..common.utils import yaml
 
 T = TypeVar("T")
 R = TypeVar("R")
 P = ParamSpec("P")
 
 
+def chunk_doc(string: str) -> list[str]:
+    """
+    Chunk a string into sentences.
+    """
+    doc = nlp(string)
+    return [" ".join([sent.text for sent in chunk]) for chunk in doc._.chunks]
+
+
 # TODO: We need to make sure that we dont expose any security issues
 ALLOWED_FUNCTIONS = {
     "abs": abs,
@@ -54,6 +63,8 @@
     "dump_json": json.dumps,
     "dump_yaml": yaml.dump,
     "match_regex": lambda pattern, string: bool(re2.fullmatch(pattern, string)),
+    "nlp": nlp.__call__,
+    "chunk_doc": chunk_doc,
 }