Add polars and packaging
augray committed Sep 4, 2024
1 parent 76a0387 commit f52f0d4
Showing 8 changed files with 272 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -27,7 +27,7 @@ py-prep:

.PHONY: sync
sync:
-	uv sync
+	uv sync --extra all

.PHONY: fix
fix:
100 changes: 98 additions & 2 deletions README.md
@@ -37,6 +37,26 @@ This repository holds the SDK for interacting with
the tool for improving your AI apps, RAG pipelines, and models by curating
high-quality training and eval datasets.

## Installation

To install the core package without any integrations, simply run:

`pip install airtrain-py`

You may install integrations by using pip extras. As an example, to
install the pandas integration:

`pip install airtrain-py[pandas]`

To install all available integrations:

`pip install airtrain-py[all]`

The following extras are available:

- `pandas`
- `polars`

## Usage

Obtain your API key by going to your user settings on
@@ -54,9 +74,85 @@ url = at.upload_from_dicts(
    [
        {"foo": "some text", "bar": "more text"},
        {"foo": "even more text", "bar": "so much text"},
-    ]
+    ],
+    name="My Dataset name",  # name is Optional
).url

-# You may view your dataset at this URL
+# You may view your dataset in the Airtrain dashboard at this URL
# It may take some time to complete ingestion and generation of
# automated insights. You will receive an email when it is complete.
print(f"Dataset URL: {url}")
```

The data may be any iterable of dictionaries that can be represented using
automatically inferred [Apache Arrow](https://arrow.apache.org/docs/python/index.html)
types. If you would like to give a hint as to the Arrow schema of the data being
uploaded, you may provide one using the `schema` parameter to `upload_from_dicts`.
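
For instance, a minimal sketch of providing a schema hint (this assumes `pyarrow` is installed and that `schema` accepts a `pyarrow.Schema`):

```python
import pyarrow as pa

# ...

# An explicit Arrow schema hint; field names must match the dict keys.
schema = pa.schema([("foo", pa.string()), ("bar", pa.string())])

url = at.upload_from_dicts(
    [
        {"foo": "some text", "bar": "more text"},
        {"foo": "even more text", "bar": "so much text"},
    ],
    schema=schema,
).url
```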

### Custom Embeddings

Airtrain automatically produces a variety of insights into your data. Some of
these insights (e.g. automatic clustering) rely on embeddings of the data. Airtrain
will embed your data automatically, but if you wish to provide your own embeddings
you may do so by adding the `embedding_column` parameter when you upload:

```python
url = at.upload_from_dicts(
    [
        {"foo": "some text", "bar": [0.0, 0.707, 0.707, 0.0]},
        {"foo": "even more text", "bar": [0.577, 0.577, 0.0, 0.577]},
    ],
    embedding_column="bar",
).url
```

If you provide this argument, the embeddings must all be lists of floating point
numbers with the same length.
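
For illustration, a sketch of an upload that should be rejected because one row's embedding has a different length. The exact failure mode here is an assumption; the SDK's polars tests expect a `ValueError` in this situation:

```python
# ...

try:
    at.upload_from_dicts(
        [
            {"foo": "a", "bar": [0.0, 1.0, 0.0, 0.0]},
            {"foo": "b", "bar": [0.0, 0.0, 1.0]},  # 3 dimensions instead of 4
        ],
        embedding_column="bar",
    )
except ValueError:
    print("Embedding dimensions must match across all rows.")
```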

### Integrations

Airtrain provides integrations to allow for uploading data from a variety of
sources. In general, most integrations take the form of an `upload_from_x(...)`
function with a signature matching that of `upload_from_dicts` except for
the first parameter specifying the data to be uploaded. Integrations may require
installing the Airtrain SDK [with extras](#installation).

#### Pandas

```python
import pandas as pd

# ...

df = pd.DataFrame(
    {
        "foo": ["some text", "more text", "even more"],
        "bar": [1, 2, 3],
    }
)

url = at.upload_from_pandas(df, name="My Pandas Dataset").url
```

You may also provide an iterable of dataframes instead of a single one.
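
For example, a sketch of uploading two frames as one dataset (`df_1` and `df_2` here are hypothetical frames built like the one above):

```python
# ...

df_1 = pd.DataFrame({"foo": ["some text"], "bar": [1]})
df_2 = pd.DataFrame({"foo": ["more text"], "bar": [2]})

# The frames are ingested together as a single dataset.
url = at.upload_from_pandas([df_1, df_2], name="My Combined Dataset").url
```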

#### Polars

```python
import polars as pl

# ...

df = pl.DataFrame(
    {
        "foo": ["some text", "more text", "even more"],
        "bar": [1, 2, 3],
    }
)

url = at.upload_from_polars(df, name="My Polars Dataset").url
```

You may also provide an iterable of dataframes instead of a single one.
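
As with pandas, a sketch for multiple polars frames (again hypothetical `df_1` and `df_2`; the tuple form matches the SDK's tests):

```python
# ...

df_1 = pl.DataFrame({"foo": ["some text"], "bar": [1]})
df_2 = pl.DataFrame({"foo": ["more text"], "bar": [2]})

url = at.upload_from_polars((df_1, df_2), name="My Combined Dataset").url
```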
10 changes: 9 additions & 1 deletion pyproject.toml
@@ -1,5 +1,5 @@
[project]
name = "airtrain"
name = "airtrain-py"
description = "SDK for interacting with https://airtrain.ai"
version = "0.0.1"
requires-python = ">=3.8"
@@ -37,6 +37,10 @@ classifiers = [
pandas = [
    "pandas>=1.0.0",
]
polars = [
    "polars>=0.19.0",
]
all = ["airtrain-py[pandas,polars]"]

[tool.uv]
dev-dependencies = [
@@ -90,6 +94,10 @@ ignore_missing_imports = true
module = "pandas.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "polars.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "tests.*"
ignore_missing_imports = true
1 change: 1 addition & 0 deletions src/airtrain/__init__.py
@@ -5,3 +5,4 @@
    upload_from_dicts,
)
from airtrain.integrations.pandas import upload_from_pandas # noqa: F401
from airtrain.integrations.polars import upload_from_polars # noqa: F401
2 changes: 1 addition & 1 deletion src/airtrain/integrations/pandas.py
@@ -37,7 +37,7 @@ def upload_from_pandas(
    if not ENABLED:
        raise ImportError(
            "Pandas integration not enabled. Please install Airtrain package as "
-            "`airtrain[pandas]`"
+            "`airtrain-py[pandas]`"
        )
    if isinstance(data, pd.DataFrame):
        data = [data]
46 changes: 46 additions & 0 deletions src/airtrain/integrations/polars.py
@@ -0,0 +1,46 @@
from typing import Any, Iterable, Union


try:
    import polars as pl

    ENABLED = True
except ImportError:
    ENABLED = False

from airtrain.core import CreationArgs, DatasetMetadata, Unpack, upload_from_arrow_tables


# In case polars is not installed
DataFrame = Any


def upload_from_polars(
    data: Union[Iterable[DataFrame], DataFrame],
    **kwargs: Unpack[CreationArgs],
) -> DatasetMetadata:
    """Upload an Airtrain dataset from the provided polars DataFrame(s).

    Parameters
    ----------
    data:
        Either an individual polars DataFrame or an iterable of DataFrames.
        Data will be intermediately represented as pyarrow tables.
    kwargs:
        See `upload_from_arrow_tables` for other arguments.

    Returns
    -------
    A DatasetMetadata object summarizing the created dataset.
    """
    if not ENABLED:
        raise ImportError(
            "Polars integration not enabled. Please install Airtrain package as "
            "`airtrain-py[polars]`"
        )
    if isinstance(data, pl.DataFrame):
        data = [data]

    data = (df.to_arrow() for df in data)  # type: ignore

    return upload_from_arrow_tables(data, **kwargs)
89 changes: 89 additions & 0 deletions src/airtrain/integrations/tests/test_polars.py
@@ -0,0 +1,89 @@
import polars as pl
import pytest

from airtrain.core import DatasetMetadata
from airtrain.integrations.polars import upload_from_polars
from tests.fixtures import MockAirtrainClient, mock_client  # noqa: F401


def test_upload_from_polars(mock_client: MockAirtrainClient):  # noqa: F811
    df = pl.DataFrame(
        [
            {"foo": 42, "bar": "a"},
            {"foo": 43, "bar": "b"},
            {"foo": 44, "bar": "c"},
            {"foo": 45, "bar": "d"},
        ]
    )
    name = "Foo dataset"
    result = upload_from_polars(df, name=name)
    assert isinstance(result, DatasetMetadata)
    assert result.size == df.shape[0]
    assert result.name == name
    fake_dataset = mock_client.get_fake_dataset(result.id)
    assert fake_dataset.name == name
    table = fake_dataset.ingested
    assert table is not None
    assert table.shape[0] == df.shape[0]
    assert table["foo"].to_pylist() == [42, 43, 44, 45]
    assert table["bar"].to_pylist() == ["a", "b", "c", "d"]


def test_upload_from_polars_multiple(mock_client: MockAirtrainClient):  # noqa: F811
    df_1 = pl.DataFrame(
        [
            {"foo": 42, "bar": "a"},
            {"foo": 43, "bar": "b"},
            {"foo": 44, "bar": "c"},
            {"foo": 45, "bar": "d"},
        ]
    )
    df_2 = pl.DataFrame(
        [
            {"foo": 46, "bar": "e"},
            {"foo": 47, "bar": "f"},
            {"foo": 48, "bar": "g"},
            {"foo": 49, "bar": "h"},
        ]
    )
    result = upload_from_polars((df_1, df_2))
    assert isinstance(result, DatasetMetadata)
    assert result.size == df_1.shape[0] + df_2.shape[0]
    fake_dataset = mock_client.get_fake_dataset(result.id)
    table = fake_dataset.ingested
    assert table is not None
    assert table.shape[0] == result.size
    assert table["foo"].to_pylist() == [42, 43, 44, 45, 46, 47, 48, 49]
    assert table["bar"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g", "h"]


def test_upload_from_polars_embeddings(mock_client: MockAirtrainClient):  # noqa: F811
    df = pl.DataFrame(
        [
            {"foo": 42, "bar": [1.0, 0.0, 0.0, 0.0]},
            {"foo": 43, "bar": [0.0, 1.0, 0.0, 0.0]},
            {"foo": 44, "bar": [0.0, 0.0, 1.0, 0.0]},
            {"foo": 45, "bar": [0.0, 0.0, 0.0, 1.0]},
        ]
    )
    result = upload_from_polars(df, embedding_column="bar")
    assert isinstance(result, DatasetMetadata)
    assert result.size == df.shape[0]
    fake_dataset = mock_client.get_fake_dataset(result.id)
    table = fake_dataset.ingested
    assert table is not None
    assert table.shape[0] == df.shape[0]
    assert table["foo"].to_pylist() == [42, 43, 44, 45]
    assert table["bar"].to_pylist()[1] == [0.0, 1.0, 0.0, 0.0]

    df_bad = pl.DataFrame(
        [
            {"foo": 42, "bar": [1.0, 0.0, 0.0, 0.0]},
            {"foo": 43, "bar": [0.0, 1.0, 0.0, 0.0]},
            {"foo": 44, "bar": [0.0, 0.0, 1.0]},
            {"foo": 45, "bar": [0.0, 0.0, 0.0, 1.0]},
        ]
    )
    with pytest.raises(ValueError):
        # one row has a different number of embedding dimensions.
        upload_from_polars(df_bad, embedding_column="bar")
30 changes: 27 additions & 3 deletions uv.lock

Some generated files are not rendered by default.
