Skip to content

Commit

Permalink
Add example using workflow API (#6)
Browse files Browse the repository at this point in the history
LlamaIndex has a new "workflow" API that its maintainers plan to encourage
going forward. This updates the README to include an example using it.
  • Loading branch information
augray authored Sep 30, 2024
1 parent e277e81 commit 8f9de5f
Showing 1 changed file with 107 additions and 0 deletions.
107 changes: 107 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ url = at.upload_from_arrow_tables([table], name="My Arrow Dataset").url

#### LlamaIndex

Note that these examples also require installing additional LlamaIndex
integrations (for example, `llama-index-readers-github` and
`llama-index-readers-web`).

```python
from llama_index.readers.github import GithubRepositoryReader, GithubClient
from llama_index.core.node_parser import (
Expand Down Expand Up @@ -205,3 +208,107 @@ result = upload_from_llama_nodes(
)
print(f"Uploaded {result.size} rows to {result.name}. View at: {result.url}")
```

Alternatively, using the
["Workflows"](https://docs.llamaindex.ai/en/stable/module_guides/workflow/)
API:

```python
import asyncio

from llama_index.core.schema import Node
from llama_index.core.workflow import (
Context,
Event,
StartEvent,
StopEvent,
Workflow,
step,
)
from llama_index.readers.web import AsyncWebPageReader

from airtrain import DatasetMetadata, upload_from_llama_nodes


# Hacker News discussion threads to ingest, identified by their item IDs.
_HN_ITEM_IDS = (
    41694044,
    41696046,
    41693087,
    41695756,
    41666269,
    41697137,
    41695840,
    41694712,
    41690302,
    41695076,
    41669747,
    41694504,
    41697032,
    41694025,
    41652935,
    41693979,
    41696236,
    41696434,
    41688469,
    41646782,
    41689332,
    41688018,
    41668896,
    41690087,
    41679497,
    41687739,
    41686722,
    41689138,
    41691530,
)

# Full thread URLs, in the same order as the IDs above.
URLS = [
    f"https://news.ycombinator.com/item?id={item_id}" for item_id in _HN_ITEM_IDS
]


class CompletedDocumentRetrievalEvent(Event):
    """Event emitted by ``ingest_documents`` once the pages have been loaded."""

    # Dataset name taken from the start event; later passed to
    # ``upload_from_llama_nodes`` as ``name``.
    name: str
    # Documents returned by the web page reader.
    documents: list[Node]

class AirtrainDocumentDatasetEvent(Event):
    """Event emitted after the documents have been uploaded to Airtrain."""

    # Value returned by ``upload_from_llama_nodes`` for the new dataset.
    metadata: DatasetMetadata


class IngestToAirtrainWorkflow(Workflow):
    """Fetch web pages and upload them to Airtrain as a dataset.

    Event flow::

        StartEvent -> CompletedDocumentRetrievalEvent
                   -> AirtrainDocumentDatasetEvent -> StopEvent

    Start it with ``await workflow.run(name=..., urls=[...])``. The run's
    result is the value returned by ``upload_from_llama_nodes``.

    Each step receives only events of the type named in its ``ev``
    annotation, so the steps do not re-check the event type themselves.
    """

    @step
    async def ingest_documents(
        self, ctx: Context, ev: StartEvent
    ) -> CompletedDocumentRetrievalEvent:
        """Load the pages listed in the start event's ``urls`` argument.

        Raises:
            ValueError: If ``urls`` is missing or empty.
        """
        urls = ev.get("urls")
        if not urls:
            # This is the only step that accepts StartEvent. Returning None
            # would emit no event at all, so the run would stall until the
            # workflow timeout instead of reporting the real problem.
            raise ValueError(
                "IngestToAirtrainWorkflow.run() requires a non-empty 'urls' argument"
            )
        reader = AsyncWebPageReader(html_to_text=True)
        documents = await reader.aload_data(urls=urls)
        return CompletedDocumentRetrievalEvent(name=ev.get("name"), documents=documents)

    @step
    async def ingest_documents_to_airtrain(
        self, ctx: Context, ev: CompletedDocumentRetrievalEvent
    ) -> AirtrainDocumentDatasetEvent:
        """Upload the retrieved documents to Airtrain under ``ev.name``."""
        dataset_meta = upload_from_llama_nodes(ev.documents, name=ev.name)
        return AirtrainDocumentDatasetEvent(metadata=dataset_meta)

    @step
    async def complete_workflow(
        self, ctx: Context, ev: AirtrainDocumentDatasetEvent
    ) -> StopEvent:
        """Finish the run, exposing the dataset metadata as the result."""
        return StopEvent(result=ev.metadata)


async def main() -> None:
    """Run the ingestion workflow over ``URLS`` and print the upload summary."""
    ingest = IngestToAirtrainWorkflow()
    # These keyword arguments become the fields of the workflow's start event.
    run_args = {"name": "My HN Discussions Dataset", "urls": URLS}
    result = await ingest.run(**run_args)
    print(f"Uploaded {result.size} rows to {result.name}. View at: {result.url}")


if __name__ == "__main__":
    # Start the event loop only when executed as a script, not on import.
    asyncio.run(main())
```

0 comments on commit 8f9de5f

Please sign in to comment.