Skip to content

Commit

Permalink
Merge pull request #921 from julep-ai/f/spider
Browse files Browse the repository at this point in the history
feat(integrations): spider api call to support async + removed duplicate code + readme update
  • Loading branch information
creatorrr authored Dec 7, 2024
2 parents 1ae7ab0 + 7b91ef3 commit fcc02f1
Show file tree
Hide file tree
Showing 55 changed files with 831 additions and 261 deletions.
5 changes: 1 addition & 4 deletions .github/workflows/lint-agents-api-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ jobs:
- uses: actions/checkout@v4

- name: Install libboost
uses: awalsh128/cache-apt-pkgs-action@latest
with:
packages: libboost-all-dev
version: 1.0
run: sudo apt-get install -y libboost-all-dev

- name: Install uv
uses: astral-sh/setup-uv@v4
Expand Down
5 changes: 1 addition & 4 deletions .github/workflows/test-agents-api-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ jobs:
- uses: actions/checkout@v4

- name: Install libboost
uses: awalsh128/cache-apt-pkgs-action@latest
with:
packages: libboost-all-dev
version: 1.0
run: sudo apt-get install -y libboost-all-dev

- name: Install uv
uses: astral-sh/setup-uv@v4
Expand Down
5 changes: 1 addition & 4 deletions .github/workflows/typecheck-agents-api-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ jobs:
- uses: actions/checkout@v4

- name: Install libboost
uses: awalsh128/cache-apt-pkgs-action@latest
with:
packages: libboost-all-dev
version: 1.0
run: sudo apt-get install -y libboost-all-dev

- name: Cache pytype
uses: actions/cache@v4
Expand Down
47 changes: 41 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1281,7 +1281,7 @@ arguments:
query: string # The search query for searching with Brave
output:
result: string # The result of the Brave Search
result: list[dict] # A list of search results, each containing: title, link, snippet
```

</td>
Expand Down Expand Up @@ -1356,11 +1356,11 @@ setup:
arguments:
url: string # The URL for which to fetch data
mode: string # The type of crawlers (default: "scrape")
params: dict # (Optional) The parameters for the Spider API
content_type: string # (Optional) The content type to return. Default is "application/json". Other options: "text/csv", "application/xml", "application/jsonl"
output:
documents: list # The documents returned from the spider
result: list[dict] # A list of results, each containing: content, error, status, costs, url
```

</td>
Expand Down Expand Up @@ -1452,13 +1452,18 @@ arguments:
base64: boolean # Whether the input file is base64 encoded. Default is false.
output:
documents: list # The parsed data from the document
documents: list[Document] # A list of parsed documents
```

</td>
<td>

**Example cookbook**: [cookbooks/07-personalized-research-assistant.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/07-personalized-research-assistant.ipynb)

</td>
</tr>


<tr>
<td> <b>Cloudinary</b> </td>
<td>
Expand Down Expand Up @@ -1489,14 +1494,44 @@ output:
```

</td>

<td>

**Example cookbook**: [cookbooks/05-video-processing-with-natural-language.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/05-video-processing-with-natural-language.ipynb)

</td>
</tr>

<tr>
<td> <b>Arxiv</b> </td>
<td>

```yaml
method: search # The method to use for the Arxiv integration
setup:
# No specific setup parameters are required for Arxiv
arguments:
query: string # The search query for searching with Arxiv
id_list: list[string] | None # (Optional) The list of Arxiv IDs to search with
max_results: integer # The maximum number of results to return, must be between 1 and 300000
download_pdf: boolean # Whether to download the PDF of the results. Default is false.
sort_by: string # The sort criterion for the results, options: relevance, lastUpdatedDate, submittedDate
sort_order: string # The sort order for the results, options: ascending, descending
output:
result: list[dict] # A list of search results, each containing: entry_id, title, updated, published, authors, summary, comment, journal_ref, doi, primary_category, categories, links, pdf_url, pdf_downloaded
```

</td>

<td>

**Example cookbook**: [cookbooks/07-personalized-research-assistant.ipynb](https://github.com/julep-ai/julep/blob/dev/cookbooks/07-personalized-research-assistant.ipynb)

</td>
</tr>

</table>

For more details, refer to our [Integrations Documentation](#integrations).
Expand Down Expand Up @@ -1674,4 +1709,4 @@ Choose Julep when you need a robust framework for stateful agents with advanced
<a href="#-table-of-contents">
<img src="https://img.shields.io/badge/Table%20of%20Contents-000000?style=for-the-badge&logo=github&logoColor=white" alt="Table of Contents">
</a>
</div>
</div>
38 changes: 27 additions & 11 deletions agents-api/agents_api/activities/embed_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,42 @@
async def embed_docs(
payload: EmbedDocsPayload, cozo_client=None, max_batch_size: int = 100
) -> None:
indices, snippets = list(zip(*enumerate(payload.content)))
batched_snippets = batched(snippets, max_batch_size)
# Create batches of both indices and snippets together
indexed_snippets = list(enumerate(payload.content))
# Batch snippets into groups of max_batch_size for parallel processing
batched_indexed_snippets = list(batched(indexed_snippets, max_batch_size))
# Get embedding instruction and title from payload, defaulting to empty strings
embed_instruction: str = payload.embed_instruction or ""
title: str = payload.title or ""

async def embed_batch(snippets):
return await litellm.aembedding(
# Helper function to embed a batch of snippets
async def embed_batch(indexed_batch):
# Split indices and snippets for the batch
batch_indices, batch_snippets = zip(*indexed_batch)
embeddings = await litellm.aembedding(
inputs=[
(
embed_instruction + (title + "\n\n" + snippet) if title else snippet
).strip()
for snippet in snippets
]
((title + "\n\n" + snippet) if title else snippet).strip()
for snippet in batch_snippets
],
embed_instruction=embed_instruction,
)
return list(zip(batch_indices, embeddings))

embeddings = reduce(
# Gather embeddings with their corresponding indices
indexed_embeddings = reduce(
operator.add,
await asyncio.gather(*[embed_batch(snippets) for snippets in batched_snippets]),
await asyncio.gather(
*[embed_batch(batch) for batch in batched_indexed_snippets]
),
)

# Split indices and embeddings after all batches are processed
indices, embeddings = zip(*indexed_embeddings)

# Convert to lists since embed_snippets_query expects list types
indices = list(indices)
embeddings = list(embeddings)

embed_snippets_query(
developer_id=payload.developer_id,
doc_id=payload.doc_id,
Expand Down
5 changes: 4 additions & 1 deletion agents-api/agents_api/activities/execute_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ..autogen.openapi_model import BaseIntegrationDef
from ..clients import integrations
from ..common.exceptions.tools import IntegrationExecutionException
from ..common.protocol.tasks import StepContext
from ..common.protocol.tasks import ExecutionInput, StepContext
from ..common.storage_handler import auto_blob_store
from ..env import testing
from ..models.tools import get_tool_args_from_metadata
Expand All @@ -21,6 +21,9 @@ async def execute_integration(
arguments: dict[str, Any],
setup: dict[str, Any] = {},
) -> Any:
if not isinstance(context.execution_input, ExecutionInput):
raise TypeError("Expected ExecutionInput type for context.execution_input")

developer_id = context.execution_input.developer_id
agent_id = context.execution_input.agent.id
task_id = context.execution_input.task.id
Expand Down
6 changes: 5 additions & 1 deletion agents-api/agents_api/activities/execute_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from beartype import beartype
from box import Box, BoxList
from fastapi import HTTPException
from fastapi.background import BackgroundTasks
from temporalio import activity

Expand All @@ -18,7 +19,7 @@
TextOnlyDocSearchRequest,
VectorDocSearchRequest,
)
from ..common.protocol.tasks import StepContext
from ..common.protocol.tasks import ExecutionInput, StepContext
from ..common.storage_handler import auto_blob_store, load_from_blob_store_if_remote
from ..env import testing
from ..models.developer import get_developer
Expand All @@ -40,6 +41,9 @@ async def execute_system(
if set(arguments.keys()) == {"bucket", "key"}:
arguments = await load_from_blob_store_if_remote(arguments)

if not isinstance(context.execution_input, ExecutionInput):
raise TypeError("Expected ExecutionInput type for context.execution_input")

arguments["developer_id"] = context.execution_input.developer_id

# Unbox all the arguments
Expand Down
5 changes: 4 additions & 1 deletion agents-api/agents_api/activities/task_steps/prompt_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ...clients import (
litellm, # We dont directly import `acompletion` so we can mock it
)
from ...common.protocol.tasks import StepContext, StepOutcome
from ...common.protocol.tasks import ExecutionInput, StepContext, StepOutcome
from ...common.storage_handler import auto_blob_store
from ...common.utils.template import render_template
from ...env import debug
Expand Down Expand Up @@ -98,6 +98,9 @@ async def prompt_step(context: StepContext) -> StepOutcome:
skip_vars=["developer_id"],
)

if not isinstance(context.execution_input, ExecutionInput):
raise TypeError("Expected ExecutionInput type for context.execution_input")

# Get settings and run llm
agent_default_settings: dict = (
context.execution_input.agent.default_settings.model_dump()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import asyncio
from typing import cast

from beartype import beartype
from fastapi import HTTPException
from temporalio import activity

from ...autogen.openapi_model import CreateTransitionRequest, Transition
from ...clients.temporal import get_workflow_handle
from ...common.protocol.tasks import StepContext
from ...common.protocol.tasks import ExecutionInput, StepContext
from ...common.storage_handler import load_from_blob_store_if_remote
from ...env import temporal_activity_after_retry_timeout, testing
from ...exceptions import LastErrorInput, TooManyRequestsError
Expand Down Expand Up @@ -38,6 +39,9 @@ async def transition_step(
transition_info.output
)

if not isinstance(context.execution_input, ExecutionInput):
raise TypeError("Expected ExecutionInput type for context.execution_input")

# Create transition
try:
transition = await create_execution_transition_async(
Expand All @@ -50,12 +54,11 @@ async def transition_step(
)

except Exception as e:
if isinstance(e, HTTPException) and e.status_code == 429:
if isinstance(e, HTTPException) and cast(HTTPException, e).status_code == 429:
await wf_handle.signal(
TaskExecutionWorkflow.set_last_error,
LastErrorInput(last_error=TooManyRequestsError()),
)

raise e

return transition
Expand Down
5 changes: 4 additions & 1 deletion agents-api/agents_api/activities/task_steps/yield_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from temporalio import activity

from ...autogen.openapi_model import TransitionTarget, YieldStep
from ...common.protocol.tasks import StepContext, StepOutcome
from ...common.protocol.tasks import ExecutionInput, StepContext, StepOutcome
from ...common.storage_handler import auto_blob_store
from ...env import testing
from .base_evaluate import base_evaluate
Expand All @@ -16,6 +16,9 @@ async def yield_step(context: StepContext) -> StepOutcome:
try:
assert isinstance(context.current_step, YieldStep)

if not isinstance(context.execution_input, ExecutionInput):
raise TypeError("Expected ExecutionInput type for context.execution_input")

all_workflows = context.execution_input.task.workflows
workflow = context.current_step.workflow
exprs = context.current_step.arguments
Expand Down
11 changes: 11 additions & 0 deletions agents-api/agents_api/activities/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,22 @@
from simpleeval import EvalWithCompoundTypes, SimpleEval

from ..autogen.openapi_model import SystemDef
from ..common.nlp import nlp
from ..common.utils import yaml

T = TypeVar("T")
R = TypeVar("R")
P = ParamSpec("P")


def chunk_doc(string: str) -> list[str]:
"""
Chunk a string into sentences.
"""
doc = nlp(string)
return [" ".join([sent.text for sent in chunk]) for chunk in doc._.chunks]


# TODO: We need to make sure that we dont expose any security issues
ALLOWED_FUNCTIONS = {
"abs": abs,
Expand Down Expand Up @@ -54,6 +63,8 @@
"dump_json": json.dumps,
"dump_yaml": yaml.dump,
"match_regex": lambda pattern, string: bool(re2.fullmatch(pattern, string)),
"nlp": nlp.__call__,
"chunk_doc": chunk_doc,
}


Expand Down
Loading

0 comments on commit fcc02f1

Please sign in to comment.