Add basic integrations (#4)

Some integrations are with libs that are widely adopted, and essentially come "for free." This PR adds those and associated docs/packaging changes.
sematic-ai · Sep 16, 2024 · 6ebba51 · 6ebba51
1 parent df63e58
commit 6ebba51
Show file tree

Hide file tree

Showing 10 changed files with 534 additions and 9 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -5,6 +5,7 @@ on: [push]
 jobs:
   test-python:
     runs-on: ubuntu-latest
+    timeout-minutes: 10
     strategy:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
@@ -19,4 +20,4 @@ jobs:
         run: make lint
 
       - name: Run tests
-        run: make test
+        run: make ci-test
diff --git a/Makefile b/Makefile
@@ -27,7 +27,7 @@ py-prep:
 
 .PHONY: sync
 sync:
-	uv sync
+	uv sync --extra all
 
 .PHONY: fix
 fix:
@@ -43,3 +43,9 @@ lint:
 .PHONY: test
 test:
 	uv run pytest ./
+
+# CI variant of `test`: syncs only the polars extra, installs pandas separately
+# via `uv pip` (see the note on that line), then runs the pytest suite.
+.PHONY: ci-test
+ci-test:
+	uv sync --extra polars
+	uv pip install pandas # uv seems to stall if py 3.12 installs this as an extra
+	uv run pytest ./
diff --git a/README.md b/README.md
@@ -37,6 +37,26 @@ This repository holds the SDK for interacting with
 the tool for improving your AI apps, RAG pipelines, and models by curating
 high-quality training and eval datasets.
 
+## Installation
+
+To install the core package without any integrations, simply run:
+
+`pip install airtrain-py`
+
+You may install integrations by using pip extras. As an example, to
+install the pandas integration:
+
+`pip install "airtrain-py[pandas]"`
+
+If you want to install all integrations, you may do the following:
+
+`pip install "airtrain-py[all]"`
+
+The following are available extras:
+
+- `pandas`
+- `polars`
+
 ## Usage
 
 Obtain your API key by going to your user settings on
@@ -54,9 +74,99 @@ url = at.upload_from_dicts(
     [
         {"foo": "some text", "bar": "more text"},
         {"foo": "even more text", "bar": "so much text"},
-    ]
+    ],
+    name="My Dataset name",  # `name` is optional
 ).url
 
-# You may view your dataset at this URL
+# You may view your dataset in the Airtrain dashboard at this URL
+# It may take some time to complete ingestion and generation of
+# automated insights. You will receive an email when it is complete.
 print(f"Dataset URL: {url}")
 ```
+
+The data may be any iterable of dictionaries that can be represented using
+automatically inferred [Apache Arrow](https://arrow.apache.org/docs/python/index.html)
+types. If you would like to give a hint as to the Arrow schema of the data being
+uploaded, you may provide one using the `schema` parameter to `upload_from_dicts`.
+
+### Custom Embeddings
+
+Airtrain produces a variety of insights into your data automatically. Some of
+these insights (e.g., automatic clustering) rely on embeddings of the data. Airtrain
+will also embed your data automatically, but if you wish to provide your own embeddings
+you may do so by adding the `embedding_column` parameter when you upload:
+
+```python
+url = at.upload_from_dicts(
+    [
+        {"foo": "some text", "bar": [0.0, 0.707, 0.707, 0.0]},
+        {"foo": "even more text", "bar": [0.577, 0.577, 0.0, 0.577]},
+    ],
+    embedding_column="bar",
+).url
+```
+
+If you provide this argument, the embeddings must all be lists of floating point
+numbers with the same length.
+
+### Integrations
+
+Airtrain provides integrations to allow for uploading data from a variety of
+sources. In general, most integrations take the form of an `upload_from_x(...)`
+function with a signature matching that of `upload_from_dicts`, except for
+the first parameter, which specifies the data to be uploaded. Integrations may
+require installing the Airtrain SDK [with extras](#installation).
+
+#### Pandas
+
+```python
+import pandas as pd
+
+# ...
+
+df = pd.DataFrame(
+    {
+        "foo": ["some text", "more text", "even more"],
+        "bar": [1, 2, 3],
+    }
+)
+
+
+url = at.upload_from_pandas(df, name="My Pandas Dataset").url
+```
+
+You may also provide an iterable of dataframes instead of a single one.
+
+#### Polars
+
+```python
+import polars as pl
+
+# ...
+
+df = pl.DataFrame(
+    {
+        "foo": ["some text", "more text", "even more"],
+        "bar": [1, 2, 3],
+    }
+)
+
+
+url = at.upload_from_polars(df, name="My Polars Dataset").url
+```
+
+You may also provide an iterable of dataframes instead of a single one.
+
+
+#### Arrow
+
+```python
+import pyarrow as pa
+
+# ...
+
+table = pa.table({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})
+
+
+url = at.upload_from_arrow_tables([table], name="My Arrow Dataset").url
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [project]
-name = "airtrain"
+name = "airtrain-py"
 description = "SDK for interacting with https://airtrain.ai"
 version = "0.0.1"
 requires-python = ">=3.8"
@@ -33,6 +33,16 @@ classifiers = [
   "Programming Language :: Python :: 3.12",
 ]
 
+# Optional integrations, installable as extras: `airtrain-py[<extra>]`.
+[project.optional-dependencies]
+pandas = [
+  # The minimum pandas version depends on the interpreter's Python version.
+  "pandas>=1.0.0; python_version < '3.12'",
+  "pandas>=2.0.0; python_version >= '3.12'",
+]
+polars = [
+  "polars>=0.19.0",
+]
+# Convenience extra that pulls in every integration extra listed above.
+all = ["airtrain-py[pandas,polars]"]
+
 [tool.uv]
 dev-dependencies = [
   "mypy==1.11.1",
@@ -80,3 +90,15 @@ ignore_missing_imports = true
 [[tool.mypy.overrides]]
 module = "pyarrow.*"
 ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+module = "pandas.*"
+ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+module = "polars.*"
+ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+module = "tests.*"
+ignore_missing_imports = true
diff --git a/src/airtrain/__init__.py b/src/airtrain/__init__.py
@@ -4,3 +4,5 @@
     upload_from_arrow_tables,
     upload_from_dicts,
 )
+from airtrain.integrations.pandas import upload_from_pandas  # noqa: F401
+from airtrain.integrations.polars import upload_from_polars  # noqa: F401
diff --git a/src/airtrain/integrations/pandas.py b/src/airtrain/integrations/pandas.py
@@ -0,0 +1,46 @@
+from typing import Any, Iterable, Union
+
+
+try:
+    import pandas as pd
+
+    ENABLED = True
+except ImportError:
+    ENABLED = False
+import pyarrow as pa
+
+from airtrain.core import CreationArgs, DatasetMetadata, Unpack, upload_from_arrow_tables
+
+
+# Annotation-only alias: typed as Any so this module's signatures do not depend
+# on pandas being importable (pandas may not be installed).
+DataFrame = Any
+
+
+def upload_from_pandas(
+    data: Union[Iterable[DataFrame], DataFrame],
+    **kwargs: Unpack[CreationArgs],
+) -> DatasetMetadata:
+    """Create an Airtrain dataset from one or more pandas DataFrames.
+
+    Each DataFrame is converted to a pyarrow table, and the resulting tables
+    are handed to `upload_from_arrow_tables`.
+
+    Parameters
+    ----------
+    data:
+        A single pandas DataFrame, or an iterable yielding DataFrames.
+    kwargs:
+        Forwarded unchanged to `upload_from_arrow_tables`; see that function
+        for the supported arguments.
+
+    Returns
+    -------
+    A DatasetMetadata object summarizing the created dataset.
+
+    Raises
+    ------
+    ImportError
+        If the pandas integration is not enabled (pandas is not installed).
+    """
+    if not ENABLED:
+        raise ImportError(
+            "Pandas integration not enabled. Please install Airtrain package as "
+            "`airtrain-py[pandas]`"
+        )
+
+    # Normalize the single-DataFrame case so both input forms share one path.
+    frames = [data] if isinstance(data, pd.DataFrame) else data
+    # Generator: each frame is converted only when the uploader consumes it.
+    tables = (pa.Table.from_pandas(frame) for frame in frames)  # type: ignore
+
+    return upload_from_arrow_tables(tables, **kwargs)
diff --git a/src/airtrain/integrations/polars.py b/src/airtrain/integrations/polars.py
@@ -0,0 +1,46 @@
+from typing import Any, Iterable, Union
+
+
+try:
+    import polars as pl
+
+    ENABLED = True
+except ImportError:
+    ENABLED = False
+
+from airtrain.core import CreationArgs, DatasetMetadata, Unpack, upload_from_arrow_tables
+
+
+# Annotation-only alias: typed as Any so this module's signatures do not depend
+# on polars being importable (polars may not be installed).
+DataFrame = Any
+
+
+def upload_from_polars(
+    data: Union[Iterable[DataFrame], DataFrame],
+    **kwargs: Unpack[CreationArgs],
+) -> DatasetMetadata:
+    """Create an Airtrain dataset from one or more polars DataFrames.
+
+    Each DataFrame is converted to a pyarrow table via `to_arrow()`, and the
+    resulting tables are handed to `upload_from_arrow_tables`.
+
+    Parameters
+    ----------
+    data:
+        A single polars DataFrame, or an iterable yielding DataFrames.
+    kwargs:
+        Forwarded unchanged to `upload_from_arrow_tables`; see that function
+        for the supported arguments.
+
+    Returns
+    -------
+    A DatasetMetadata object summarizing the created dataset.
+
+    Raises
+    ------
+    ImportError
+        If the polars integration is not enabled (polars is not installed).
+    """
+    if not ENABLED:
+        raise ImportError(
+            "Polars integration not enabled. Please install Airtrain package as "
+            "`airtrain-py[polars]`"
+        )
+
+    # Normalize the single-DataFrame case so both input forms share one path.
+    frames = [data] if isinstance(data, pl.DataFrame) else data
+    # Generator: each frame is converted only when the uploader consumes it.
+    tables = (frame.to_arrow() for frame in frames)  # type: ignore
+
+    return upload_from_arrow_tables(tables, **kwargs)
diff --git a/src/airtrain/integrations/tests/test_pandas.py b/src/airtrain/integrations/tests/test_pandas.py
@@ -0,0 +1,90 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from airtrain.core import DatasetMetadata
+from airtrain.integrations.pandas import upload_from_pandas
+from tests.fixtures import MockAirtrainClient, mock_client  # noqa: F401
+
+
+def test_upload_from_pandas(mock_client: MockAirtrainClient):  # noqa: F811
+    """Upload one DataFrame and verify name, size, and column contents."""
+    foo_values = [42, 43, 44, 45]
+    bar_values = ["a", "b", "c", "d"]
+    frame = pd.DataFrame(
+        [{"foo": foo, "bar": bar} for foo, bar in zip(foo_values, bar_values)]
+    )
+    dataset_name = "Foo dataset"
+
+    metadata = upload_from_pandas(frame, name=dataset_name)
+
+    # The returned metadata reflects what was requested.
+    assert isinstance(metadata, DatasetMetadata)
+    assert metadata.size == len(frame)
+    assert metadata.name == dataset_name
+
+    # The mock backend received the same name and rows.
+    stored = mock_client.get_fake_dataset(metadata.id)
+    assert stored.name == dataset_name
+    ingested = stored.ingested
+    assert ingested is not None
+    assert ingested.shape[0] == len(frame)
+    assert ingested["foo"].to_pylist() == foo_values
+    assert ingested["bar"].to_pylist() == bar_values
+
+
+def test_upload_from_pandas_multiple(mock_client: MockAirtrainClient):  # noqa: F811
+    """Upload two DataFrames at once and verify rows are combined in order."""
+    foo_values = [42, 43, 44, 45, 46, 47, 48, 49]
+    bar_values = ["a", "b", "c", "d", "e", "f", "g", "h"]
+    records = [{"foo": foo, "bar": bar} for foo, bar in zip(foo_values, bar_values)]
+    # Split the records evenly across two frames.
+    first_frame = pd.DataFrame(records[:4])
+    second_frame = pd.DataFrame(records[4:])
+
+    metadata = upload_from_pandas((first_frame, second_frame))
+
+    assert isinstance(metadata, DatasetMetadata)
+    assert metadata.size == len(first_frame) + len(second_frame)
+
+    ingested = mock_client.get_fake_dataset(metadata.id).ingested
+    assert ingested is not None
+    assert ingested.shape[0] == metadata.size
+    assert ingested["foo"].to_pylist() == foo_values
+    assert ingested["bar"].to_pylist() == bar_values
+
+
+def test_upload_from_pandas_embeddings(mock_client: MockAirtrainClient):  # noqa: F811
+    """Upload with an embedding column; mismatched embedding lengths must fail."""
+    foo_values = [42, 43, 44, 45]
+    embeddings = [
+        np.array([1.0, 0.0, 0.0, 0.0]),
+        np.array([0.0, 1.0, 0.0, 0.0]),
+        np.array([0.0, 0.0, 1.0, 0.0]),
+        np.array([0.0, 0.0, 0.0, 1.0]),
+    ]
+    frame = pd.DataFrame(
+        [{"foo": foo, "bar": vec} for foo, vec in zip(foo_values, embeddings)]
+    )
+
+    metadata = upload_from_pandas(frame, embedding_column="bar")
+
+    assert isinstance(metadata, DatasetMetadata)
+    assert metadata.size == len(frame)
+    ingested = mock_client.get_fake_dataset(metadata.id).ingested
+    assert ingested is not None
+    assert ingested.shape[0] == len(frame)
+    assert ingested["foo"].to_pylist() == foo_values
+    assert ingested["bar"].to_pylist()[1] == [0.0, 1.0, 0.0, 0.0]
+
+    # Same rows, except the third embedding has 3 dimensions instead of 4.
+    ragged = [
+        np.array([1.0, 0.0, 0.0, 0.0]),
+        np.array([0.0, 1.0, 0.0, 0.0]),
+        np.array([0.0, 0.0, 1.0]),
+        np.array([0.0, 0.0, 0.0, 1.0]),
+    ]
+    bad_frame = pd.DataFrame(
+        [{"foo": foo, "bar": vec} for foo, vec in zip(foo_values, ragged)]
+    )
+    with pytest.raises(ValueError):
+        upload_from_pandas(bad_frame, embedding_column="bar")