Add docs

sematic-ai · Sep 11, 2024 · b912188 · b912188
1 parent e2585b8
commit b912188
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -171,3 +171,37 @@ table = pa.table({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})
 
 url = at.upload_from_arrow_tables([table], name="My Arrow Dataset").url
 ```
+
+
+#### LlamaIndex
+
+```python
+from llama_index.readers.github import GithubRepositoryReader, GithubClient
+from llama_index.core.node_parser import (
+    SentenceSplitter,
+    SemanticSplitterNodeParser,
+)
+from llama_index.embeddings.openai import OpenAIEmbedding
+
+# Data does not have to come from GitHub; this is for illustrative purposes.
+github_client = GithubClient(...)
+documents = GithubRepositoryReader(...).load_data(branch=branch)
+
+# You can upload documents directly. In this case Airtrain will generate embeddings.
+result = at.upload_from_llama_nodes(
+    documents,
+    name="My Document Dataset",
+)
+print(f"Uploaded {result.size} rows to {result.name}. View at: {result.url}")
+
+# Or you can chunk and/or embed it first. Airtrain will use the embeddings
+# you created via LlamaIndex.
+embed_model = OpenAIEmbedding()
+splitter = SemanticSplitterNodeParser(...)
+nodes = splitter.get_nodes_from_documents(documents)
+result = at.upload_from_llama_nodes(
+    nodes,
+    name="My embedded RAG Dataset",
+)
+print(f"Uploaded {result.size} rows to {result.name}. View at: {result.url}")
+```
diff --git a/src/airtrain/integrations/llamaindex/core.py b/src/airtrain/integrations/llamaindex/core.py
@@ -29,6 +29,23 @@
 def upload_from_llama_nodes(
     data: Iterable[BaseNode], **kwargs: Unpack[CreationArgs]
 ) -> DatasetMetadata:
+    """Upload an Airtrain dataset from the provided LlamaIndex nodes.
+
+    Parameters
+    ----------
+    data:
+        Nodes may be from documents, chunks, or anything else that results
+        in instances of `BaseNode`.  Attributes of the nodes will be treated
+        as columns in the resulting Airtrain dataset. If present (and python
+        version is >= 3.11), the relationships and metadata for the node
+        will be flattened into multiple columns of the resulting dataset.
+    kwargs:
+        See `upload_from_arrow_tables` for other arguments.
+
+    Returns
+    -------
+    A DatasetMetadata object summarizing the created dataset.
+    """
     data_as_iter = iter(data)
     try:
         first_node = next(data_as_iter)